diff --git a/2F6NCTHH5/note.json b/2F6NCTHH5/note.json new file mode 100644 index 0000000..863beee --- /dev/null +++ b/2F6NCTHH5/note.json @@ -0,0 +1,293 @@ +{ + "paragraphs": [ + { + "text": "%md\n\n\nSet spark default conf \n\nspark.jars.packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.7,postgresql:postgresql:9.1-901-1.jdbc4,org.apache.hadoop:hadoop-aws:2.7.3,com.amazonaws:aws-java-sdk:1.7.4,org.mongodb.spark:mongo-spark-connector_2.11:2.3.0,org.elasticsearch:elasticsearch-hadoop:6.7.0,joda-time:joda-time:2.9.1,org.apache.spark:spark-streaming-twitter_2.10:1.4.1,org.twitter4j:twitter4j-core:3.0.3,org.twitter4j:twitter4j-media-support:3.0.3,org.twitter4j:twitter4j-async:3.0.3,org.twitter4j:twitter4j-examples:3.0.3,org.twitter4j:twitter4j-stream:3.0.3\n\nOR \n\n%dep\n\n/*\n\nBEFORE START\n\nif you are using Hortonworks Sandbox, make sure you added more cores and more memory to YARN\n\nspark streaming requires at least 3 containers to run\n\n\n*/\n\n/* this step must be executed as first command, if you already executed other commands, please click in \"Interpreter\" and restart spark interpreter */\n\nz.reset()\nz.load(\"org.apache.spark:spark-streaming-twitter_2.10:1.4.1\")\nz.load(\"org.twitter4j:twitter4j-core:3.0.3\")\nz.load(\"org.twitter4j:twitter4j-media-support:3.0.3\")\nz.load(\"org.twitter4j:twitter4j-async:3.0.3\")\nz.load(\"org.twitter4j:twitter4j-examples:3.0.3\")\nz.load(\"org.twitter4j:twitter4j-stream:3.0.3\")", + "user": "anonymous", + "dateUpdated": "2020-04-16 15:42:33.049", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + 
"type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eSet spark default conf \u003c/p\u003e\n\u003cp\u003espark.jars.packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.7,postgresql:postgresql:9.1-901-1.jdbc4,org.apache.hadoop:hadoop-aws:2.7.3,com.amazonaws:aws-java-sdk:1.7.4,org.mongodb.spark:mongo-spark-connector_2.11:2.3.0,org.elasticsearch:elasticsearch-hadoop:6.7.0,joda-time:joda-time:2.9.1,org.apache.spark:spark-streaming-twitter_2.10:1.4.1,org.twitter4j:twitter4j-core:3.0.3,org.twitter4j:twitter4j-media-support:3.0.3,org.twitter4j:twitter4j-async:3.0.3,org.twitter4j:twitter4j-examples:3.0.3,org.twitter4j:twitter4j-stream:3.0.3\u003c/p\u003e\n\u003cp\u003eOR \u003c/p\u003e\n\u003cp\u003e%dep\u003c/p\u003e\n\u003cp\u003e/*\u003c/p\u003e\n\u003cp\u003eBEFORE START\u003c/p\u003e\n\u003cp\u003eif you are using Hortonworks Sandbox, make sure you added more cores and more memory to YARN\u003c/p\u003e\n\u003cp\u003espark streaming requires at least 3 containers to run\u003c/p\u003e\n\u003cp\u003e*/\u003c/p\u003e\n\u003cp\u003e/* this step must be executed as first command, if you already executed other commands, please click in \u0026ldquo;Interpreter\u0026rdquo; and restart spark interpreter */\u003c/p\u003e\n\u003cp\u003ez.reset()\u003cbr/\u003ez.load(\u0026ldquo;org.apache.spark:spark-streaming-twitter_2.10:1.4.1\u0026rdquo;)\u003cbr/\u003ez.load(\u0026ldquo;org.twitter4j:twitter4j-core:3.0.3\u0026rdquo;)\u003cbr/\u003ez.load(\u0026ldquo;org.twitter4j:twitter4j-media-support:3.0.3\u0026rdquo;)\u003cbr/\u003ez.load(\u0026ldquo;org.twitter4j:twitter4j-async:3.0.3\u0026rdquo;)\u003cbr/\u003ez.load(\u0026ldquo;org.twitter4j:twitter4j-examples:3.0.3\u0026rdquo;)\u003cbr/\u003ez.load(\u0026ldquo;org.twitter4j:twitter4j-stream:3.0.3\u0026rdquo;)\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587050985101_-1918385824", + "id": 
"20200416-152945_597074387", + "dateCreated": "2020-04-16 15:29:45.102", + "dateStarted": "2020-04-16 15:42:33.078", + "dateFinished": "2020-04-16 15:42:33.100", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\n/*\nUPDATE YOUR TWITTER CREDENTIALS\n*/\n\n\nimport org.apache.spark.streaming._\nimport org.apache.spark.streaming.twitter._\nimport org.apache.spark.storage.StorageLevel\nimport scala.io.Source\nimport scala.collection.mutable.HashMap\nimport java.io.File\nimport org.apache.log4j.Logger\nimport org.apache.log4j.Level\nimport sys.process.stringSeqToProcess\nimport twitter4j.HashtagEntity\n\n/** Configures the Oauth Credentials for accessing Twitter */\ndef configureTwitterCredentials(apiKey: String, apiSecret: String, accessToken: String, accessTokenSecret: String) {\n val configs \u003d new HashMap[String, String] ++\u003d Seq(\n \"apiKey\" -\u003e apiKey, \"apiSecret\" -\u003e apiSecret, \"accessToken\" -\u003e accessToken, \"accessTokenSecret\" -\u003e accessTokenSecret)\n println(\"Configuring Twitter OAuth\")\n configs.foreach{ case(key, value) \u003d\u003e\n if (value.trim.isEmpty) {\n throw new Exception(\"Error setting authentication - value for \" + key + \" not set\")\n }\n val fullKey \u003d \"twitter4j.oauth.\" + key.replace(\"api\", \"consumer\")\n System.setProperty(fullKey, value.trim)\n println(\"\\tProperty \" + fullKey + \" set as [\" + value.trim + \"]\")\n }\n println()\n}\n\n// Configure Twitter credentials\nval apiKey \u003d \"\"\nval apiSecret \u003d \"\"\nval accessToken \u003d \"\"\nval accessTokenSecret \u003d \"\"\nconfigureTwitterCredentials(apiKey, apiSecret, accessToken, accessTokenSecret)\n\nimport org.apache.spark.streaming.twitter._\nval ssc \u003d new StreamingContext(sc, Seconds(2))\nval tweets \u003d TwitterUtils.createStream(ssc, None, Array(\"#hadoop\", \"#bigdata\", \"#spark\", \"#hortonworks\", 
\"#HDP\"))\n//tweets.saveAsObjectFiles(\"hdfs://sandbox.hortonworks.com:8020/test/twitter-spark/twitter_\", \".txt\")\n\nval twt \u003d tweets.window(Seconds(600))\n\ncase class Tweet(createdAt:Long, text:String, screenName:String)\ntwt.map(status\u003d\u003e\n Tweet(status.getCreatedAt().getTime()/1000, status.getText(), status.getUser().getScreenName())\n).foreachRDD(rdd\u003d\u003e\n rdd.toDF().registerTempTable(\"tweets\")\n)\n\n//twt.print\n\nssc.start()\n\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 15:42:33.174", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "ERROR", + "msg": [ + { + "type": "TEXT", + "data": "\u003cconsole\u003e:19: error: object twitter is not a member of package org.apache.spark.streaming\n import org.apache.spark.streaming.twitter._\n ^\n\u003cconsole\u003e:52: error: object twitter is not a member of package org.apache.spark.streaming\n import org.apache.spark.streaming.twitter._\n ^\n\u003cconsole\u003e:27: error: not found: value twitter4j\n import twitter4j.HashtagEntity\n ^\n\u003cconsole\u003e:54: error: not found: value TwitterUtils\n val tweets \u003d TwitterUtils.createStream(ssc, None, Array(\"#hadoop\", \"#bigdata\", \"#spark\", \"#hortonworks\", \"#HDP\"))\n ^\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587051544840_-284142077", + "id": "20200416-153904_1319192733", + "dateCreated": "2020-04-16 15:39:04.840", + "dateStarted": "2020-04-16 15:42:33.198", + "dateFinished": "2020-04-16 15:42:33.592", + "status": "ERROR", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\n-- checking window contents\n\nselect from_unixtime(createdAt), count(1) from tweets group by createdAt order by createdAt\n", + "user": "anonymous", 
+ "dateUpdated": "2020-04-16 15:40:30.550", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "sql", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/sql" + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "jobName": "paragraph_1587051613440_1804273809", + "id": "20200416-154013_1555972031", + "dateCreated": "2020-04-16 15:40:13.440", + "status": "READY", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\n/* declaring a function in Scala */\n\ndef sentiment(s:String) : String \u003d {\n val positive \u003d Array(\"like\", \"love\", \"good\", \"great\", \"happy\", \"cool\", \"the\", \"one\", \"that\")\n val negative \u003d Array(\"hate\", \"bad\", \"stupid\", \"is\")\n\n var st \u003d 0;\n\n val words \u003d s.split(\" \") \n positive.foreach(p \u003d\u003e\n words.foreach(w \u003d\u003e\n if(p\u003d\u003dw) st \u003d st+1\n )\n )\n\n negative.foreach(p\u003d\u003e\n words.foreach(w\u003d\u003e\n if(p\u003d\u003dw) st \u003d st-1\n )\n )\n if(st\u003e0)\n \"positivie\"\n else if(st\u003c0)\n \"negative\"\n else\n \"neutral\"\n}\n\nsqlc.udf.register(\"sentiment\", sentiment _)", + "user": "anonymous", + "dateUpdated": "2020-04-16 15:41:11.990", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "jobName": "paragraph_1587051655860_1808435087", + "id": "20200416-154055_475437300", + "dateCreated": "2020-04-16 15:40:55.860", + "status": "READY", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n#declaring a function in Python\n\nimport re\n\ndef wordcount(a):\n return len(re.split(\"\\W+\",a))\n 
\nsqlContext.registerFunction(\"wordcount\", wordcount)\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 15:41:25.295", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/python" + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "jobName": "paragraph_1587051679740_-838401990", + "id": "20200416-154119_2099153382", + "dateCreated": "2020-04-16 15:41:19.740", + "status": "READY", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\n\n--using SQL to mix window data, with scala function and python function\n\nselect from_unixtime(createdAt) as created, screenName, sentiment(text) as sentiment, wordcount(text) as wordcount, text from tweets\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 15:41:36.240", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "sql", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/sql" + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "jobName": "paragraph_1587051685360_-1734620202", + "id": "20200416-154125_1588696872", + "dateCreated": "2020-04-16 15:41:25.360", + "status": "READY", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\n\n--plotting sentiment\n\nselect sentiment(text) as sentiment, count(1) from tweets group by sentiment(text)", + "user": "anonymous", + "dateUpdated": "2020-04-16 15:41:52.595", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "jobName": 
"paragraph_1587051703200_-1582474404", + "id": "20200416-154143_91820265", + "dateCreated": "2020-04-16 15:41:43.200", + "status": "READY", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\n\n--show most common words for positive and negative tweets\n\nselect word, sentiment, count(1) as cnt\nfrom \n(\n select from_unixtime(createdAt) as created, screenName, sentiment(text) as sentiment, wordcount(text) as wordcount, text from tweets\n) sub1\nlateral view explode(split(text, \u0027 \u0027)) t as word \nwhere sentiment \u003c\u003e \u0027neutral\u0027\ngroup by word, sentiment\n--having count(1) \u003e 1\norder by cnt desc\n\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 15:42:05.860", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "jobName": "paragraph_1587051721600_-463177037", + "id": "20200416-154201_814758669", + "dateCreated": "2020-04-16 15:42:01.600", + "status": "READY", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\n\nselect minute, sentiment, count(1) as cnt from\n(\n select substr(from_unixtime(createdAt), 0, 16) as minute, screenName, sentiment(text) as sentiment, wordcount(text) as wordcount, text from tweets\n) sub1\ngroup by minute, sentiment\norder by minute\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 15:42:17.730", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "jobName": "paragraph_1587051732930_373519832", + "id": "20200416-154212_1403931045", + 
"dateCreated": "2020-04-16 15:42:12.930", + "status": "READY", + "progressUpdateIntervalMs": 500 + } + ], + "name": "demo-notebooks/Twitter", + "id": "2F6NCTHH5", + "noteParams": {}, + "noteForms": {}, + "angularObjects": { + "md:shared_process": [], + "sh:shared_process": [], + "spark:shared_process": [] + }, + "config": { + "isZeppelinNotebookCronEnable": false + }, + "info": {} +} \ No newline at end of file diff --git a/2F6S8N3CE/note.json b/2F6S8N3CE/note.json new file mode 100644 index 0000000..3c58958 --- /dev/null +++ b/2F6S8N3CE/note.json @@ -0,0 +1,1192 @@ +{ + "paragraphs": [ + { + "text": "%md\nHortonworks Blog - Predicting Airline Delays\nThis notebook is based on Blog posts below, by Ofer Mendelevitch \nhttp://hortonworks.com/blog/data-science-apacheh-hadoop-predicting-airline-delays/ \nhttp://hortonworks.com/blog/data-science-hadoop-spark-scala-part-2/\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:23:56.729", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eHortonworks Blog - Predicting Airline Delays\u003cbr/\u003eThis notebook is based on Blog posts below, by Ofer Mendelevitch\u003cbr/\u003e\u003ca href\u003d\"http://hortonworks.com/blog/data-science-apacheh-hadoop-predicting-airline-delays/\"\u003ehttp://hortonworks.com/blog/data-science-apacheh-hadoop-predicting-airline-delays/\u003c/a\u003e\u003cbr/\u003e\u003ca 
href\u003d\"http://hortonworks.com/blog/data-science-hadoop-spark-scala-part-2/\"\u003ehttp://hortonworks.com/blog/data-science-hadoop-spark-scala-part-2/\u003c/a\u003e\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587039802779_1094579905", + "id": "20200416-122322_245296189", + "dateCreated": "2020-04-16 12:23:22.779", + "dateStarted": "2020-04-16 12:23:56.720", + "dateFinished": "2020-04-16 12:23:56.726", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nDownload data sets", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:24:07.609", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eDownload data sets\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587039836710_400045853", + "id": "20200416-122356_1742803364", + "dateCreated": "2020-04-16 12:23:56.710", + "dateStarted": "2020-04-16 12:24:07.618", + "dateFinished": "2020-04-16 12:24:07.624", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sh\n\nwget http://stat-computing.org/dataexpo/2009/2007.csv.bz2 -O /tmp/flights_2007.csv.bz2\nwget http://stat-computing.org/dataexpo/2009/2008.csv.bz2 -O /tmp/flights_2008.csv.bz2\nwget ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2007.csv.gz -O /tmp/weather_2007.csv.gz\nwget ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2008.csv.gz -O /tmp/weather_2008.csv.gz\necho \"downloaded\"", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:24:18.849", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": 
{}, + "editorSetting": { + "language": "sh", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/sh" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "--2020-04-16 12:24:18-- http://stat-computing.org/dataexpo/2009/2007.csv.bz2\nResolving stat-computing.org (stat-computing.org)... 52.218.225.58\nConnecting to stat-computing.org (stat-computing.org)|52.218.225.58|:80... connected.\nHTTP request sent, awaiting response... 404 Not Found\n2020-04-16 12:24:18 ERROR 404: Not Found.\n\n--2020-04-16 12:24:18-- http://stat-computing.org/dataexpo/2009/2008.csv.bz2\nResolving stat-computing.org (stat-computing.org)... 52.218.225.58\nConnecting to stat-computing.org (stat-computing.org)|52.218.225.58|:80... connected.\nHTTP request sent, awaiting response... 404 Not Found\n2020-04-16 12:24:19 ERROR 404: Not Found.\n\n--2020-04-16 12:24:19-- ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2007.csv.gz\n \u003d\u003e ‘/tmp/weather_2007.csv.gz’\nResolving ftp.ncdc.noaa.gov (ftp.ncdc.noaa.gov)... 205.167.25.101, 2610:20:8040:2::101\nConnecting to ftp.ncdc.noaa.gov (ftp.ncdc.noaa.gov)|205.167.25.101|:21... connected.\nLogging in as anonymous ... Logged in!\n\u003d\u003d\u003e SYST ... done. \u003d\u003d\u003e PWD ... done.\n\u003d\u003d\u003e TYPE I ... done. \u003d\u003d\u003e CWD (1) /pub/data/ghcn/daily/by_year ... done.\n\u003d\u003d\u003e SIZE 2007.csv.gz ... 215136890\n\u003d\u003d\u003e PASV ... done. \u003d\u003d\u003e RETR 2007.csv.gz ... done.\nLength: 215136890 (205M) (unauthoritative)\n\n 0K .......... .......... .......... .......... .......... 0% 242K 14m29s\n 50K .......... .......... .......... .......... .......... 0% 286K 13m22s\n 100K .......... .......... .......... .......... .......... 0% 286K 12m59s\n 150K .......... .......... .......... .......... .......... 0% 570K 11m16s\n 200K .......... .......... 
.......... .......... .......... 0% 175M 9m1s\n 250K .......... .......... .......... .......... .......... 0% 287K 9m33s\n 300K .......... .......... .......... .......... .......... 0% 65.5M 8m11s\n 350K .......... .......... .......... .......... .......... 0% 573K 7m56s\n 400K .......... .......... .......... .......... .......... 0% 208M 7m3s\n 450K .......... .......... .......... .......... .......... 0% 573K 6m57s\n 500K .......... .......... .......... .......... .......... 0% 221M 6m19s\n 550K .......... .......... .......... .......... .......... 0% 572K 6m18s\n 600K .......... .......... .......... .......... .......... 0% 251M 5m49s\n 650K .......... .......... .......... .......... .......... 0% 574K 5m50s\n 700K .......... .......... .......... .......... .......... 0% 101M 5m27s\n 750K .......... .......... .......... .......... .......... 0% 1.25M 5m16s\n 800K .......... .......... .......... .......... .......... 0% 1.02M 5m9s\n 850K .......... .......... .......... .......... .......... 0% 64.7M 4m52s\n 900K .......... .......... .......... .......... .......... 0% 1.26M 4m45s\n 950K .......... .......... .......... .......... .......... 0% 1.02M 4m41s\n 1000K .......... .......... .......... .......... .......... 0% 81.3M 4m28s\n 1050K .......... .......... .......... .......... .......... 0% 133M 4m16s\n 1100K .......... .......... .......... .......... .......... 0% 580K 4m20s\n 1150K .......... .......... .......... .......... .......... 0% 83.8M 4m9s\n 1200K .......... .......... .......... .......... .......... 0% 89.8M 3m59s\n 1250K .......... .......... .......... .......... .......... 0% 1.26M 3m56s\n 1300K .......... .......... .......... .......... .......... 0% 1.02M 3m55s\n 1350K .......... .......... .......... .......... .......... 0% 46.5M 3m47s\n 1400K .......... .......... .......... .......... .......... 0% 267M 3m39s\n 1450K .......... .......... .......... .......... .......... 0% 289M 3m31s\n 1500K .......... .......... 
.......... .......... .......... 0% 585K 3m36s\n 1550K .......... .......... .......... .......... .......... 0% 63.1M 3m29s\n 1600K .......... .......... .......... .......... .......... 0% 99.7M 3m23s\n 1650K .......... .......... .......... .......... .......... 0% 67.7M 3m17s\n 1700K .......... .......... .......... .......... .......... 0% 1.30M 3m16s\n 1750K .......... .......... .......... .......... .......... 0% 1.01M 3m16s\n 1800K .......... .......... .......... .......... .......... 0% 80.3M 3m11s\n 1850K .......... .......... .......... .......... .......... 0% 97.1M 3m6s\n 1900K .......... .......... .......... .......... .......... 0% 84.5M 3m1s\n 1950K .......... .......... .......... .......... .......... 0% 113M 2m56s\n 2000K .......... .......... .......... .......... .......... 0% 586K 3m1s\n 2050K .......... .......... .......... .......... .......... 0% 92.4M 2m56s\n 2100K .......... .......... .......... .......... .......... 1% 124M 2m52s\n 2150K .......... .......... .......... .......... .......... 1% 73.7M 2m48s\n 2200K .......... .......... .......... .......... .......... 1% 84.9M 2m45s\n 2250K .......... .......... .......... .......... .......... 1% 125M 2m41s\n 2300K .......... .......... .......... .......... .......... 1% 590K 2m45s\n 2350K .......... .......... .......... .......... .......... 1% 118M 2m42s\n 2400K .......... .......... .......... .......... .......... 1% 101M 2m38s\n 2450K .......... .......... .......... .......... .......... 1% 110M 2m35s\n 2500K .......... .......... .......... .......... .......... 1% 114M 2m32s\n 2550K .......... .......... .......... .......... .......... 1% 109M 2m29s\n 2600K .......... .......... .......... .......... .......... 1% 84.3M 2m27s\n 2650K .......... .......... .......... .......... .......... 1% 595K 2m30s\n 2700K .......... .......... .......... .......... .......... 1% 45.8M 2m28s\n 2750K .......... .......... .......... .......... .......... 
1% 91.9M 2m25s\n 2800K .......... .......... .......... .......... .......... 1% 90.0M 2m22s\n 2850K .......... .......... .......... .......... .......... 1% 90.3M 2m20s\n 2900K .......... .......... .......... .......... .......... 1% 77.5M 2m18s\n 2950K .......... .......... .......... .......... .......... 1% 91.5M 2m15s\n 3000K .......... .......... .......... .......... .......... 1% 170M 2m13s\n 3050K .......... .......... .......... .......... .......... 1% 1.35M 2m13s\n 3100K .......... .......... .......... .......... .......... 1% 1.02M 2m14s\n 3150K .......... .......... .......... .......... .......... 1% 87.3M 2m12s\n 3200K .......... .......... .......... .......... .......... 1% 104M 2m10s\n 3250K .......... .......... .......... .......... .......... 1% 66.7M 2m8s\n 3300K .......... .......... .......... .......... .......... 1% 26.5M 2m6s\n 3350K .......... .......... .......... .......... .......... 1% 72.4M 2m4s\n 3400K .......... .......... .......... .......... .......... 1% 85.7M 2m3s\n 3450K .......... .......... .......... .......... .......... 1% 139M 2m1s\n 3500K .......... .......... .......... .......... .......... 1% 1.43M 2m1s\n 3550K .......... .......... .......... .......... .......... 1% 1.02M 2m2s\n 3600K .......... .......... .......... .......... .......... 1% 99.9M 2m1s\n 3650K .......... .......... .......... .......... .......... 1% 102M 1m59s\n 3700K .......... .......... .......... .......... .......... 1% 93.1M 1m57s\n 3750K .......... .......... .......... .......... .......... 1% 110M 1m56s\n 3800K .......... .......... .......... .......... .......... 1% 45.2M 1m54s\n 3850K .......... .......... .......... .......... .......... 1% 60.1M 1m53s\n 3900K .......... .......... .......... .......... .......... 1% 77.4M 1m51s\n 3950K .......... .......... .......... .......... .......... 1% 68.3M 1m50s\n 4000K .......... .......... .......... .......... .......... 1% 97.0M 1m49s\n 4050K .......... .......... .......... 
.......... .......... 1% 1.45M 1m49s\n 4100K .......... .......... .......... .......... .......... 1% 1.01M 1m50s\n 4150K .......... .......... .......... .......... .......... 1% 183M 1m49s\n 4200K .......... .......... .......... .......... .......... 2% 110M 1m47s\n 4250K .......... .......... .......... .......... .......... 2% 109M 1m46s\n 4300K .......... .......... .......... .......... .......... 2% 98.7M 1m45s\n 4350K .......... .......... .......... .......... .......... 2% 79.5M 1m44s\n 4400K .......... .......... .......... .......... .......... 2% 107M 1m43s\n 4450K .......... .......... .......... .......... .......... 2% 73.3M 1m42s\n 4500K .......... .......... .......... .......... .......... 2% 131M 1m40s\n 4550K .......... .......... .......... .......... .......... 2% 59.7M 99s\n 4600K .......... .......... .......... .......... .......... 2% 140M 98s\n 4650K .......... .......... .......... .......... .......... 2% 117M 97s\n 4700K .......... .......... .......... .......... .......... 2% 1.47M 98s\n 4750K .......... .......... .......... .......... .......... 2% 72.3M 97s\n 4800K .......... .......... .......... .......... .......... 2% 1.02M 98s\n 4850K .......... .......... .......... .......... .......... 2% 130M 97s\n 4900K .......... .......... .......... .......... .......... 2% 76.8M 96s\n 4950K .......... .......... .......... .......... .......... 2% 143M 95s\n 5000K .......... .......... .......... .......... .......... 2% 137M 94s\n 5050K .......... .......... .......... .......... .......... 2% 93.7M 93s\n 5100K .......... .......... .......... .......... .......... 2% 118M 92s\n 5150K .......... .......... .......... .......... .......... 2% 112M 91s\n 5200K .......... .......... .......... .......... .......... 2% 124M 90s\n 5250K .......... .......... .......... .......... .......... 2% 74.6M 89s\n 5300K .......... .......... .......... .......... .......... 2% 110M 88s\n 5350K .......... .......... .......... .......... 
.......... 2% 82.0M 88s\n 5400K .......... .......... .......... .......... .......... 2% 148M 87s\n 5450K .......... .......... .......... .......... .......... 2% 112M 86s\n 5500K .......... .......... .......... .......... .......... 2% 87.1M 85s\n 5550K .......... .......... .......... .......... .......... 2% 1.48M 86s\n 5600K .......... .......... .......... .......... .......... 2% 1.31M 86s\n 5650K .......... .......... .......... .......... .......... 2% 4.38M 86s\n 5700K .......... .......... .......... .......... .......... 2% 119M 85s\n 5750K .......... .......... .......... .......... .......... 2% 79.7M 84s\n 5800K .......... .......... .......... .......... .......... 2% 103M 84s\n 5850K .......... .......... .......... .......... .......... 2% 53.4M 83s\n 5900K .......... .......... .......... .......... .......... 2% 91.4M 82s\n 5950K .......... .......... .......... .......... .......... 2% 103M 82s\n 6000K .......... .......... .......... .......... .......... 2% 104M 81s\n 6050K .......... .......... .......... .......... .......... 2% 105M 80s\n 6100K .......... .......... .......... .......... .......... 2% 87.4M 80s\n 6150K .......... .......... .......... .......... .......... 2% 108M 79s\n 6200K .......... .......... .......... .......... .......... 2% 111M 78s\n 6250K .......... .......... .......... .......... .......... 2% 66.8M 78s\n 6300K .......... .......... .......... .......... .......... 3% 91.9M 77s\n 6350K .......... .......... .......... .......... .......... 3% 298M 76s\n 6400K .......... .......... .......... .......... .......... 3% 258M 76s\n 6450K .......... .......... .......... .......... .......... 3% 1.58M 76s\n 6500K .......... .......... .......... .......... .......... 3% 77.2M 76s\n 6550K .......... .......... .......... .......... .......... 3% 81.8M 75s\n 6600K .......... .......... .......... .......... .......... 3% 1.03M 76s\n 6650K .......... .......... .......... .......... .......... 
3% 95.8M 75s\n 6700K .......... .......... .......... .......... .......... 3% 92.8M 75s\n 6750K .......... .......... .......... .......... .......... 3% 111M 74s\n 6800K .......... .......... .......... .......... .......... 3% 95.3M 74s\n 6850K .......... .......... .......... .......... .......... 3% 99.4M 73s\n 6900K .......... .......... .......... .......... .......... 3% 99.1M 73s\n 6950K .......... .......... .......... .......... .......... 3% 82.3M 72s\n 7000K .......... .......... .......... .......... .......... 3% 60.9M 72s\n 7050K .......... .......... .......... .......... .......... 3% 111M 71s\n 7100K .......... .......... .......... .......... .......... 3% 114M 71s\n 7150K .......... .......... .......... .......... .......... 3% 114M 70s\n 7200K .......... .......... .......... .......... .......... 3% 107M 70s\n 7250K .......... .......... .......... .......... .......... 3% 101M 69s\n 7300K .......... .......... .......... .......... .......... 3% 95.2M 69s\n 7350K .......... .......... .......... .......... .......... 3% 154M 68s\n 7400K .......... .......... .......... .......... .......... 3% 112M 68s\n 7450K .......... .......... .......... .......... .......... 3% 125M 67s\n 7500K .......... .......... .......... .......... .......... 3% 1.61M 68s\n 7550K .......... .......... .......... .......... .......... 3% 88.8M 67s\n 7600K .......... .......... .......... .......... .......... 3% 99.0M 67s\n 7650K .......... .......... .......... .......... .......... 3% 1.02M 68s\n 7700K .......... .......... .......... .......... .......... 3% 121M 67s\n 7750K .......... .......... .......... .......... .......... 3% 82.5M 67s\n 7800K .......... .......... .......... .......... .......... 3% 184M 66s\n 7850K .......... .......... .......... .......... .......... 3% 101M 66s\n 7900K .......... .......... .......... .......... .......... 3% 110M 65s\n 7950K .......... .......... .......... .......... .......... 3% 99.5M 65s\n 8000K .......... 
.......... .......... .......... .......... 3% 113M 65s\n 8050K .......... .......... .......... .......... .......... 3% 107M 64s\n 8100K .......... .......... .......... .......... .......... 3% 93.3M 64s\n 8150K .......... .......... .......... .......... .......... 3% 94.5M 63s\n 8200K .......... .......... .......... .......... .......... 3% 81.3M 63s\n 8250K .......... .......... .......... .......... .......... 3% 96.7M 63s\n 8300K .......... .......... .......... .......... .......... 3% 85.3M 62s\n 8350K .......... .......... .......... .......... .......... 3% 111M 62s\n 8400K .......... .......... .......... .......... .......... 4% 122M 61s\n 8450K .......... .......... .......... .......... .......... 4% 134M 61s\n 8500K .......... .......... .......... .......... .......... 4% 129M 61s\n 8550K .......... .......... .......... .......... .......... 4% 138M 60s\n 8600K .......... .......... .......... .......... .......... 4% 122M 60s\n 8650K .......... .......... .......... .......... .......... 4% 138M 60s\n 8700K .......... .......... .......... .......... .......... 4% 143M 59s\n 8750K .......... .......... .......... .......... .......... 4% 1.66M 60s\n 8800K .......... .......... .......... .......... .......... 4% 104M 59s\n 8850K .......... .......... .......... .......... .......... 4% 105M 59s\n 8900K .......... .......... .......... .......... .......... 4% 1.03M 60s\n 8950K .......... .......... .......... .......... .......... 4% 129M 59s\n 9000K .......... .......... .......... .......... .......... 4% 102M 59s\n 9050K .......... .......... .......... .......... .......... 4% 102M 59s\n 9100K .......... .......... .......... .......... .......... 4% 105M 58s\n 9150K .......... .......... .......... .......... .......... 4% 117M 58s\n 9200K .......... .......... .......... .......... .......... 4% 110M 58s\n 9250K .......... .......... .......... .......... .......... 4% 93.0M 57s\n 9300K .......... .......... .......... .......... 
.......... 4% 98.2M 57s\n 9350K .......... .......... .......... .......... .......... 4% 146M 57s\n 9400K .......... .......... .......... .......... .......... 4% 116M 56s\n 9450K .......... .......... .......... .......... .......... 4% 103M 56s\n 9500K .......... .......... .......... .......... .......... 4% 110M 56s\n 9550K .......... .......... .......... .......... .......... 4% 131M 56s\n 9600K .......... .......... .......... .......... .......... 4% 115M 55s\n 9650K .......... .......... .......... .......... .......... 4% 108M 55s\n 9700K .......... .......... .......... .......... .......... 4% 116M 55s\n 9750K .......... .......... .......... .......... .......... 4% 110M 54s\n 9800K .......... .......... .......... .......... .......... 4% 102M 54s\n 9850K .......... .......... .......... .......... .......... 4% 86.6M 54s\n 9900K .......... .......... .......... .......... .......... 4% 64.7M 54s\n 9950K .......... .......... .......... .......... .......... 4% 147M 53s\n 10000K .......... .......... .......... .......... .......... 4% 163M 53s\n 10050K .......... .......... .......... .......... .......... 4% 146M 53s\n 10100K .......... .......... .......... .......... .......... 4% 102M 53s\n 10150K .......... .......... .......... .......... .......... 4% 1.73M 53s\n 10200K .......... .......... .......... .......... .......... 4% 93.8M 53s\n 10250K .......... .......... .......... .......... .......... 4% 140M 52s\n 10300K .......... .......... .......... .......... .......... 4% 118M 52s\n 10350K .......... .......... .......... .......... .......... 4% 1.03M 53s\n 10400K .......... .......... .......... .......... .......... 4% 93.0M 52s\n 10450K .......... .......... .......... .......... .......... 4% 120M 52s\n 10500K .......... .......... .......... .......... .......... 5% 88.8M 52s\n 10550K .......... .......... .......... .......... .......... 5% 107M 52s\n 10600K .......... .......... .......... .......... .......... 
5% 137M 51s\n 10650K .......... .......... .......... .......... .......... 5% 76.8M 51s\n 10700K .......... .......... .......... .......... .......... 5% 132M 51s\n 10750K .......... .......... .......... .......... .......... 5% 165M 51s\n 10800K .......... .......... .......... .......... .......... 5% 74.2M 50s\n 10850K .......... .......... .......... .......... .......... 5% 88.1M 50s\n 10900K .......... .......... .......... .......... .......... 5% 94.7M 50s\n 10950K .......... .......... .......... .......... .......... 5% 108M 50s\n 11000K .......... .......... .......... .......... .......... 5% 112M 50s\n 11050K .......... .......... .......... .......... .......... 5% 118M 49s\n 11100K .......... .......... .......... .......... .......... 5% 134M 49s\n 11150K .......... .......... .......... .......... .......... 5% 145M 49s\n 11200K .......... .......... .......... .......... .......... 5% 118M 49s\n 11250K .......... .......... .......... .......... .......... 5% 132M 48s\n 11300K .......... .......... .......... .......... .......... 5% 115M 48s\n 11350K .......... .......... .......... .......... .......... 5% 112M 48s\n 11400K .......... .......... .......... .......... .......... 5% 89.9M 48s\n 11450K .......... .......... .......... .......... .......... 5% 151M 48s\n 11500K .......... .......... .......... .......... .......... 5% 85.9M 47s\n 11550K .......... .......... .......... .......... .......... 5% 110M 47s\n 11600K .......... .......... .......... .......... .......... 5% 101M 47s\n 11650K .......... .......... .......... .......... .......... 5% 102M 47s\n 11700K .......... .......... .......... .......... .......... 5% 1.82M 47s\n 11750K .......... .......... .......... .......... .......... 5% 94.5M 47s\n 11800K .......... .......... .......... .......... .......... 5% 84.3M 47s\n 11850K .......... .......... .......... .......... .......... 5% 84.7M 46s\n 11900K .......... .......... .......... .......... .......... 
5% 1.33M 47s\n 11950K .......... .......... .......... .......... .......... 5% 4.63M 47s\n 12000K .......... .......... .......... .......... .......... 5% 94.9M 47s\n 12050K .......... .......... .......... .......... .......... 5% 72.3M 46s\n 12100K .......... .......... .......... .......... .......... 5% 75.8M 46s\n 12150K .......... .......... .......... .......... .......... 5% 83.3M 46s\n 12200K .......... .......... .......... .......... .......... 5% 89.8M 46s\n 12250K .......... .......... .......... .......... .......... 5% 72.8M 46s\n 12300K .......... .......... .......... .......... .......... 5% 95.7M 45s\n 12350K .......... .......... .......... .......... .......... 5% 90.9M 45s\n 12400K .......... .......... .......... .......... .......... 5% 97.3M 45s\n 12450K .......... .......... .......... .......... .......... 5% 70.3M 45s\n 12500K .......... .......... .......... .......... .......... 5% 86.7M 45s\n 12550K .......... .......... .......... .......... .......... 5% 87.7M 44s\n 12600K .......... .......... .......... .......... .......... 6% 79.6M 44s\n 12650K .......... .......... .......... .......... .......... 6% 77.0M 44s\n 12700K .......... .......... .......... .......... .......... 6% 87.4M 44s\n 12750K .......... .......... .......... .......... .......... 6% 84.4M 44s\n 12800K .......... .......... .......... .......... .......... 6% 90.7M 44s\n 12850K .......... .......... .......... .......... .......... 6% 109M 43s\n 12900K .......... .......... .......... .......... .......... 6% 135M 43s\n 12950K .......... .......... .......... .......... .......... 6% 129M 43s\n 13000K .......... .......... .......... .......... .......... 6% 132M 43s\n 13050K .......... .......... .......... .......... .......... 6% 157M 43s\n 13100K .......... .......... .......... .......... .......... 6% 131M 43s\n 13150K .......... .......... .......... .......... .......... 6% 131M 42s\n 13200K .......... .......... .......... .......... .......... 
6% 217M 42s\n 13250K .......... .......... .......... .......... .......... 6% 236M 42s\n 13300K .......... .......... .......... .......... .......... 6% 235M 42s\n 13350K .......... .......... .......... .......... .......... 6% 255M 42s\n 13400K .......... .......... .......... .......... .......... 6% 1.97M 42s\n 13450K .......... .......... .......... .......... .......... 6% 104M 42s\n 13500K .......... .......... .......... .......... .......... 6% 81.7M 42s\n 13550K .......... .......... .......... .......... .......... 6% 123M 41s\n 13600K .......... .......... .......... .......... .......... 6% 98.1M 41s\n 13650K .......... .......... .......... .......... .......... 6% 104M 41s\n 13700K .......... .......... .......... .......... .......... 6% 1.05M 42s\n 13750K .......... .......... .......... .......... .......... 6% 112M 42s\n 13800K .......... .......... .......... .......... .......... 6% 91.5M 41s\n 13850K .......... .......... .......... .......... .......... 6% 134M 41s\n 13900K .......... .......... .......... .......... .......... 6% 99.7M 41s\n 13950K .......... .......... .......... .......... .......... 6% 130M 41s\n 14000K .......... .......... .......... .......... .......... 6% 112M 41s\n 14050K .......... .......... .......... .......... .......... 6% 124M 41s\n 14100K .......... .......... .......... .......... .......... 6% 99.6M 40s\n 14150K .......... .......... .......... .......... .......... 6% 122M 40s\n 14200K .......... .......... .......... .......... .......... 6% 112M 40s\n 14250K .......... .......... .......... .......... .......... 6% 95.9M 40s\n 14300K .......... .......... .......... .......... .......... 6% 128M 40s\n 14350K .......... .......... .......... .......... .......... 6% 105M 40s\n 14400K .......... .......... .......... .......... .......... 6% 123M 40s\n 14450K .......... .......... .......... .......... .......... 6% 96.9M 39s\n 14500K .......... .......... .......... .......... .......... 
6% 113M 39s\n 14550K .......... .......... .......... .......... .......... 6% 146M 39s\n 14600K .......... .......... .......... .......... .......... 6% 94.3M 39s\n 14650K .......... .......... .......... .......... .......... 6% 86.9M 39s\n 14700K .......... .......... .......... .......... .......... 7% 103M 39s\n 14750K .......... .......... .......... .......... .......... 7% 109M 39s\n 14800K .......... .......... .......... .......... .......... 7% 97.3M 38s\n 14850K .......... .......... .......... .......... .......... 7% 110M 38s\n 14900K .......... .......... .......... .......... .......... 7% 133M 38s\n 14950K .......... .......... .......... .......... .......... 7% 102M 38s\n 15000K .......... .......... .......... .......... .......... 7% 115M 38s\n 15050K .......... .......... .......... .......... .......... 7% 96.9M 38s\n 15100K .......... .......... .......... .......... .......... 7% 124M 38s\n 15150K .......... .......... .......... .......... .......... 7% 98.8M 38s\n 15200K .......... .......... .......... .......... .......... 7% 130M 37s\n 15250K .......... .......... .......... .......... .......... 7% 78.2M 37s\n 15300K .......... .......... .......... .......... .......... 7% 1.95M 38s\n 15350K .......... .......... .......... .......... .......... 7% 57.9M 37s\n 15400K .......... .......... .......... .......... .......... 7% 48.2M 37s\n 15450K .......... .......... .......... .......... .......... 7% 225M 37s\n 15500K .......... .......... .......... .......... .......... 7% 255M 37s\n 15550K .......... .......... .......... .......... .......... 7% 248M 37s\n 15600K .......... .......... .......... .......... .......... 7% 1.38M 37s\n 15650K .......... .......... .......... .......... .......... 7% 4.60M 37s\n 15700K .......... .......... .......... .......... .......... 7% 45.0M 37s\n 15750K .......... .......... .......... .......... .......... 7% 69.8M 37s\n 15800K .......... .......... .......... .......... .......... 
7% 39.5M 37s\n 15850K .......... .......... .......... .......... .......... 7% 66.8M 37s\n 15900K .......... .......... .......... .......... .......... 7% 42.8M 37s\n 15950K .......... .......... .......... .......... .......... 7% 65.5M 37s\n 16000K .......... .......... .......... .......... .......... 7% 82.4M 36s\n 16050K .......... .......... .......... .......... .......... 7% 106M 36s\n 16100K .......... .......... .......... .......... .......... 7% 55.3M 36s\n 16150K .......... .......... .......... .......... .......... 7% 60.0M 36s\n 16200K .......... .......... .......... .......... .......... 7% 67.1M 36s\n 16250K .......... .......... .......... .......... .......... 7% 38.8M 36s\n 16300K .......... .......... .......... .......... .......... 7% 27.0M 36s\n 16350K .......... .......... .......... .......... .......... 7% 208M 36s\n 16400K .......... .......... .......... .......... .......... 7% 43.0M 36s\n 16450K .......... .......... .......... .......... .......... 7% 42.4M 35s\n 16500K .......... .......... .......... .......... .......... 7% 105M 35s\n 16550K .......... .......... .......... .......... .......... 7% 282M 35s\n 16600K .......... .......... .......... .......... .......... 7% 263M 35s\n 16650K .......... .......... .......... .......... .......... 7% 257M 35s\n 16700K .......... .......... .......... .......... .......... 7% 220M 35s\n 16750K .......... .......... .......... .......... .......... 7% 259M 35s\n 16800K .......... .......... .......... .......... .......... 8% 249M 35s\n 16850K .......... .......... .......... .......... .......... 8% 219M 35s\n 16900K .......... .......... .......... .......... .......... 8% 221M 34s\n 16950K .......... .......... .......... .......... .......... 8% 253M 34s\n 17000K .......... .......... .......... .......... .......... 8% 268M 34s\n 17050K .......... .......... .......... .......... .......... 8% 264M 34s\n 17100K .......... .......... .......... .......... .......... 
8% 233M 34s\n 17150K .......... .......... .......... .......... .......... 8% 272M 34s\n 17200K .......... .......... .......... .......... .......... 8% 256M 34s\n 17250K .......... .......... .......... .......... .......... 8% 253M 34s\n 17300K .......... .......... .......... .......... .......... 8% 205M 34s\n 17350K .......... .......... .......... .......... .......... 8% 2.49M 34s\n 17400K .......... .......... .......... .......... .......... 8% 50.8M 34s\n 17450K .......... .......... .......... .......... .......... 8% 40.3M 34s\n 17500K .......... .......... .......... .......... .......... 8% 41.0M 33s\n 17550K .......... .......... .......... .......... .......... 8% 240M 33s\n 17600K .......... .......... .......... .......... .......... 8% 217M 33s\n 17650K .......... .......... .......... .......... .......... 8% 267M 33s\n 17700K .......... .......... .......... .......... .......... 8% 1.40M 33s\n 17750K .......... .......... .......... .......... .......... 8% 4.60M 33s\n 17800K .......... .......... .......... .......... .......... 8% 67.2M 33s\n 17850K .......... .......... .......... .......... .......... 8% 100M 33s\n 17900K .......... .......... .......... .......... .......... 8% 42.5M 33s\n 17950K .......... .......... .......... .......... .......... 8% 44.2M 33s\n 18000K .......... .......... .......... .......... .......... 8% 96.0M 33s\n 18050K .......... .......... .......... .......... .......... 8% 247M 33s\n 18100K .......... .......... .......... .......... .......... 8% 222M 33s\n 18150K .......... .......... .......... .......... .......... 8% 249M 33s\n 18200K .......... .......... .......... .......... .......... 8% 96.7M 33s\n 18250K .......... .......... .......... .......... .......... 8% 78.5M 32s\n 18300K .......... .......... .......... .......... .......... 8% 35.8M 32s\n 18350K .......... .......... .......... .......... .......... 8% 49.8M 32s\n 18400K .......... .......... .......... .......... .......... 
8% 49.6M 32s\n 18450K .......... .......... .......... .......... .......... 8% 43.8M 32s\n 18500K .......... .......... .......... .......... .......... 8% 49.6M 32s\n 18550K .......... .......... .......... .......... .......... 8% 140M 32s\n 18600K .......... .......... .......... .......... .......... 8% 69.4M 32s\n 18650K .......... .......... .......... .......... .......... 8% 74.2M 32s\n 18700K .......... .......... .......... .......... .......... 8% 42.9M 32s\n 18750K .......... .......... .......... .......... .......... 8% 77.6M 32s\n 18800K .......... .......... .......... .......... .......... 8% 44.1M 32s\n 18850K .......... .......... .......... .......... .......... 8% 51.7M 31s\n 18900K .......... .......... .......... .......... .......... 9% 75.0M 31s\n 18950K .......... .......... .......... .......... .......... 9% 264M 31s\n 19000K .......... .......... .......... .......... .......... 9% 254M 31s\n 19050K .......... .......... .......... .......... .......... 9% 242M 31s\n 19100K .......... .......... .......... .......... .......... 9% 225M 31s\n 19150K .......... .......... .......... .......... .......... 9% 247M 31s\n 19200K .......... .......... .......... .......... .......... 9% 260M 31s\n 19250K .......... .......... .......... .......... .......... 9% 202M 31s\n 19300K .......... .......... .......... .......... .......... 9% 167M 31s\n 19350K .......... .......... .......... .......... .......... 9% 261M 31s\n 19400K .......... .......... .......... .......... .......... 9% 296M 31s\n 19450K .......... .......... .......... .......... .......... 9% 255M 30s\n 19500K .......... .......... .......... .......... .......... 9% 2.63M 31s\n 19550K .......... .......... .......... .......... .......... 9% 107M 30s\n 19600K .......... .......... .......... .......... .......... 9% 63.3M 30s\n 19650K .......... .......... .......... .......... .......... 9% 89.6M 30s\n 19700K .......... .......... .......... .......... .......... 
9% 65.7M 30s\n 19750K .......... .......... .......... .......... .......... 9% 69.4M 30s\n 19800K .......... .......... .......... .......... .......... 9% 113M 30s\n 19850K .......... .......... .......... .......... .......... 9% 260M 30s\n 19900K .......... .......... .......... .......... .......... 9% 1.09M 30s\n 19950K .......... .......... .......... .......... .......... 9% 54.8M 30s\n 20000K .......... .......... .......... .......... .......... 9% 98.9M 30s\n 20050K .......... .......... .......... .......... .......... 9% 85.6M 30s\n 20100K .......... .......... .......... .......... .......... 9% 80.6M 30s\n 20150K .......... .......... .......... .......... .......... 9% 84.6M 30s\n 20200K .......... .......... .......... .......... .......... 9% 117M 30s\n 20250K .......... .......... .......... .......... .......... 9% 112M 30s\n 20300K .......... .......... .......... .......... .......... 9% 13.6M 30s\n 20350K .......... .......... .......... .......... .......... 9% 65.0M 30s\n 20400K .......... .......... .......... .......... .......... 9% 66.8M 30s\n 20450K .......... .......... .......... .......... .......... 9% 57.8M 29s\n 20500K .......... .......... .......... .......... .......... 9% 66.9M 29s\n 20550K .......... .......... .......... .......... .......... 9% 65.2M 29s\n 20600K .......... .......... .......... .......... .......... 9% 51.2M 29s\n 20650K .......... .......... .......... .......... .......... 9% 56.7M 29s\n 20700K .......... .......... .......... .......... .......... 9% 62.4M 29s\n 20750K .......... .......... .......... .......... .......... 9% 65.2M 29s\n 20800K .......... .......... .......... .......... .......... 9% 71.9M 29s\n 20850K .......... .......... .......... .......... .......... 9% 54.9M 29s\n 20900K .......... .......... .......... .......... .......... 9% 66.9M 29s\n 20950K .......... .......... .......... .......... .......... 9% 44.2M 29s\n 21000K .......... .......... .......... .......... .......... 
10% 62.2M 29s\n 21050K .......... .......... .......... .......... .......... 10% 65.5M 29s\n 21100K .......... .......... .......... .......... .......... 10% 270M 29s\n 21150K .......... .......... .......... .......... .......... 10% 195M 29s\n 21200K .......... .......... .......... .......... .......... 10% 255M 28s\n 21250K .......... .......... .......... .......... .......... 10% 219M 28s\n 21300K .......... .......... .......... .......... .......... 10% 264M 28s\n 21350K .......... .......... .......... .......... .......... 10% 264M 28s\n 21400K .......... .......... .......... .......... .......... 10% 264M 28s\n 21450K .......... .......... .......... .......... .......... 10% 224M 28s\n 21500K .......... .......... .......... .......... .......... 10% 251M 28s\n 21550K .......... .......... .......... .......... .......... 10% 264M 28s\n 21600K .......... .......... .......... .......... .......... 10% 298M 28s\n 21650K .......... .......... .......... .......... .......... 10% 170M 28s\n 21700K .......... .......... .......... .......... .......... 10% 3.00M 28s\n 21750K .......... .......... .......... .......... .......... 10% 51.3M 28s\n 21800K .......... .......... .......... .......... .......... 10% 101M 28s\n 21850K .......... .......... .......... .......... .......... 10% 65.5M 28s\n 21900K .......... .......... .......... .......... .......... 10% 57.8M 28s\n 21950K .......... .......... .......... .......... .......... 10% 67.6M 28s\n 22000K .......... .......... .......... .......... .......... 10% 115M 27s\n 22050K .......... .......... .......... .......... .......... 10% 198M 27s\n 22100K .......... .......... .......... .......... .......... 10% 136M 27s\n 22150K .......... .......... .......... .......... .......... 10% 1.44M 28s\n 22200K .......... .......... .......... .......... .......... 10% 4.40M 28s\n 22250K .......... .......... .......... .......... .......... 10% 102M 28s\n 22300K .......... .......... .......... .......... 
.......... 10% 143M 27s\n 22350K .......... .......... .......... .......... .......... 10% 81.0M 27s\n 22400K .......... .......... .......... .......... .......... 10% 58.0M 27s\n 22450K .......... .......... .......... .......... .......... 10% 8.10M 27s\n 22500K .......... .......... .......... .......... .......... 10% 60.6M 27s\n 22550K .......... .......... .......... .......... .......... 10% 67.6M 27s\n 22600K .......... .......... .......... .......... .......... 10% 72.1M 27s\n 22650K .......... .......... .......... .......... .......... 10% 61.8M 27s\n 22700K .......... .......... .......... .......... .......... 10% 61.7M 27s\n 22750K .......... .......... .......... .......... .......... 10% 70.4M 27s\n 22800K .......... .......... .......... .......... .......... 10% 72.3M 27s\n 22850K .......... .......... .......... .......... .......... 10% 54.2M 27s\n 22900K .......... .......... .......... .......... .......... 10% 75.2M 27s\n 22950K .......... .......... .......... .......... .......... 10% 59.9M 27s\n 23000K .......... .......... .......... .......... .......... 10% 72.4M 27s\n 23050K .......... .......... .......... .......... .......... 10% 62.2M 27s\n 23100K .......... .......... .......... .......... .......... 11% 74.2M 27s\n 23150K .......... .......... .......... .......... .......... 11% 71.1M 26s\n 23200K .......... .......... .......... .......... .......... 11% 67.3M 26s\n 23250K .......... .......... .......... .......... .......... 11% 55.9M 26s\n 23300K .......... .......... .......... .......... .......... 11% 80.8M 26s\n 23350K .......... .......... .......... .......... .......... 11% 71.4M 26s\n 23400K .......... .......... .......... .......... .......... 11% 139M 26s\n 23450K .......... .......... .......... .......... .......... 11% 223M 26s\n 23500K .......... .......... .......... .......... .......... 11% 254M 26s\n 23550K .......... .......... .......... .......... .......... 11% 265M 26s\n 23600K .......... 
.......... .......... .......... .......... 11% 262M 26s\n 23650K .......... .......... .......... .......... .......... 11% 233M 26s\n 23700K .......... .......... .......... .......... .......... 11% 264M 26s\n 23750K .......... .......... .......... .......... .......... 11% 259M 26s\n 23800K .......... .......... .......... .......... .......... 11% 260M 26s\n 23850K .......... .......... .......... .......... .......... 11% 224M 26s\n 23900K .......... .......... .......... .......... .......... 11% 254M 26s\n 23950K .......... .......... .......... .......... .......... 11% 266M 26s\n 24000K .......... .......... .......... .......... .......... 11% 241M 25s\n 24050K .......... .......... .......... .......... .......... 11% 3.48M 26s\n 24100K .......... .......... .......... .......... .......... 11% 57.0M 25s\n 24150K .......... .......... .......... .......... .......... 11% 54.5M 25s\n 24200K .......... .......... .......... .......... .......... 11% 74.4M 25s\n 24250K .......... .......... .......... .......... .......... 11% 87.8M 25s\n 24300K .......... .......... .......... .......... .......... 11% 75.4M 25s\n 24350K .......... .......... .......... .......... .......... 11% 192M 25s\n 24400K .......... .......... .......... .......... .......... 11% 257M 25s\n 24450K .......... .......... .......... .......... .......... 11% 220M 25s\n 24500K .......... .......... .......... .......... .......... 11% 1.45M 25s\n 24550K .......... .......... .......... .......... .......... 11% 4.42M 25s\n 24600K .......... .......... .......... .......... .......... 11% 4.83M 25s\n 24650K .......... .......... .......... .......... .......... 11% 77.9M 25s\n 24700K .......... .......... .......... .......... .......... 11% 66.2M 25s\n 24750K .......... .......... .......... .......... .......... 11% 62.3M 25s\n 24800K .......... .......... .......... .......... .......... 11% 71.8M 25s\n 24850K .......... .......... .......... .......... .......... 
11% 76.5M 25s\n 24900K .......... .......... .......... .......... .......... 11% 90.3M 25s\n 24950K .......... .......... .......... .......... .......... 11% 90.8M 25s\n 25000K .......... .......... .......... .......... .......... 11% 59.6M 25s\n 25050K .......... .......... .......... .......... .......... 11% 81.3M 25s\n 25100K .......... .......... .......... .......... .......... 11% 61.9M 25s\n 25150K .......... .......... .......... .......... .......... 11% 64.0M 25s\n 25200K .......... .......... .......... .......... .......... 12% 69.7M 25s\n 25250K .......... .......... .......... .......... .......... 12% 59.3M 25s\n 25300K .......... .......... .......... .......... .......... 12% 73.4M 25s\n 25350K .......... .......... .......... .......... .......... 12% 71.1M 25s\n 25400K .......... .......... .......... .......... .......... 12% 71.3M 24s\n 25450K .......... .......... .......... .......... .......... 12% 85.7M 24s\n 25500K .......... .......... .......... .......... .......... 12% 97.7M 24s\n 25550K .......... .......... .......... .......... .......... 12% 145M 24s\n 25600K .......... .......... .......... .......... .......... 12% 69.5M 24s\n 25650K .......... .......... .......... .......... .......... 12% 75.4M 24s\n 25700K .......... .......... .......... .......... .......... 12% 64.6M 24s\n 25750K .......... .......... .......... .......... .......... 12% 49.1M 24s\n 25800K .......... .......... .......... .......... .......... 12% 114M 24s\n 25850K .......... .......... .......... .......... .......... 12% 247M 24s\n 25900K .......... .......... .......... .......... .......... 12% 284M 24s\n 25950K .......... .......... .......... .......... .......... 12% 294M 24s\n 26000K .......... .......... .......... .......... .......... 12% 192M 24s\n 26050K .......... .......... .......... .......... .......... 12% 273M 24s\n 26100K .......... .......... .......... .......... .......... 12% 256M 24s\n 26150K .......... .......... .......... 
.......... .......... 12% 258M 24s\n 26200K .......... .......... .......... .......... .......... 12% 222M 24s\n 26250K .......... .......... .......... .......... .......... 12% 185M 24s\n 26300K .......... .......... .......... .......... .......... 12% 257M 24s\n 26350K .......... .......... .......... .......... .......... 12% 254M 24s\n 26400K .......... .......... .......... .......... .......... 12% 4.93M 24s\n 26450K .......... .......... .......... .......... .......... 12% 70.4M 24s\n 26500K .......... .......... .......... .......... .......... 12% 41.5M 23s\n 26550K .......... .......... .......... .......... .......... 12% 43.1M 23s\n 26600K .......... .......... .......... .......... .......... 12% 49.8M 23s\n 26650K .......... .......... .......... .......... .......... 12% 121M 23s\n 26700K .......... .......... .......... .......... .......... 12% 219M 23s\n 26750K .......... .......... .......... .......... .......... 12% 292M 23s\n 26800K .......... .......... .......... .......... .......... 12% 239M 23s\n 26850K .......... .......... .......... .......... .......... 12% 266M 23s\n 26900K .......... .......... .......... .......... .......... 12% 225M 23s\n 26950K .......... .......... .......... .......... .......... 12% 867K 23s\n 27000K .......... .......... .......... .......... .......... 12% 46.6M 23s\n 27050K .......... .......... .......... .......... .......... 12% 46.2M 23s\n 27100K .......... .......... .......... .......... .......... 12% 44.9M 23s\n 27150K .......... .......... .......... .......... .......... 12% 44.6M 23s\n 27200K .......... .......... .......... .......... .......... 12% 64.4M 23s\n 27250K .......... .......... .......... .......... .......... 12% 72.3M 23s\n 27300K .......... .......... .......... .......... .......... 13% 65.8M 23s\n 27350K .......... .......... .......... .......... .......... 13% 76.8M 23s\n 27400K .......... .......... .......... .......... .......... 13% 71.6M 23s\n 27450K .......... 
.......... .......... .......... .......... 13% 105M 23s\n 27500K .......... .......... .......... .......... .......... 13% 72.2M 23s\n 27550K .......... .......... .......... .......... .......... 13% 64.9M 23s\n 27600K .......... .......... .......... .......... .......... 13% 94.8M 23s\n 27650K .......... .......... .......... .......... .......... 13% 67.5M 23s\n 27700K .......... .......... .......... .......... .......... 13% 63.3M 23s\n 27750K .......... .......... .......... .......... .......... 13% 60.5M 23s\n 27800K .......... .......... .......... .......... .......... 13% 73.3M 23s\n 27850K .......... .......... .......... .......... .......... 13% 76.8M 23s\n 27900K .......... .......... .......... .......... .......... 13% 69.1M 23s\n 27950K .......... .......... .......... .......... .......... 13% 63.2M 23s\n 28000K .......... .......... .......... .......... .......... 13% 61.3M 23s\n 28050K .......... .......... .......... .......... .......... 13% 124M 22s\n 28100K .......... .......... .......... .......... .......... 13% 318M 22s\n 28150K .......... .......... .......... .......... .......... 13% 288M 22s\n 28200K .......... .......... .......... .......... .......... 13% 314M 22s\n 28250K .......... .......... .......... .......... .......... 13% 327M 22s\n 28300K .......... .......... .......... .......... .......... 13% 341M 22s\n 28350K .......... .......... .......... .......... .......... 13% 239M 22s\n 28400K .......... .......... .......... .......... .......... 13% 327M 22s\n 28450K .......... .......... .......... .......... .......... 13% 326M 22s\n 28500K .......... .......... .......... .......... .......... 13% 330M 22s\n 28550K .......... .......... .......... .......... .......... 13% 290M 22s\n 28600K .......... .......... .......... .......... .......... 13% 327M 22s\n 28650K .......... .......... .......... .......... .......... 13% 314M 22s\n 28700K .......... .......... .......... .......... .......... 
13% 242M 22s\n 28750K .......... .......... .......... .......... .......... 13% 248M 22s\n 28800K .......... .......... .......... .......... .......... 13% 10.3M 22s\n 28850K .......... .......... .......... .......... .......... 13% 59.0M 22s\n 28900K .......... .......... .......... .......... .......... 13% 120M 22s\n 28950K .......... .......... .......... .......... .......... 13% 90.9M 22s\n 29000K .......... .......... .......... .......... .......... 13% 62.4M 22s\n 29050K .......... .......... .......... .......... .......... 13% 110M 22s\n 29100K .......... .......... .......... .......... .......... 13% 113M 22s\n 29150K .......... .......... .......... .......... .......... 13% 83.8M 22s\n 29200K .......... .......... .......... .......... .......... 13% 124M 22s\n 29250K .......... .......... .......... .......... .......... 13% 801K 22s\n 29300K .......... .......... .......... .......... .......... 13% 81.3M 22s\n 29350K .......... .......... .......... .......... .......... 13% 125M 22s\n 29400K .......... .......... .......... .......... .......... 14% 91.2M 22s\n 29450K .......... .......... .......... .......... .......... 14% 142M 22s\n 29500K .......... .......... .......... .......... .......... 14% 122M 22s\n 29550K .......... .......... .......... .......... .......... 14% 109M 22s\n 29600K .......... .......... .......... .......... .......... 14% 128M 22s\n 29650K .......... .......... .......... .......... .......... 14% 113M 22s\n 29700K .......... .......... .......... .......... .......... 14% 82.6M 22s\n 29750K .......... .......... .......... .......... .......... 14% 117M 21s\n 29800K .......... .......... .......... .......... .......... 14% 166M 21s\n 29850K .......... .......... .......... .......... .......... 14% 94.7M 21s\n 29900K .......... .......... .......... .......... .......... 14% 108M 21s\n 29950K .......... .......... .......... .......... .......... 14% 118M 21s\n 30000K .......... .......... .......... .......... 
.......... 14% 78.6M 21s\n 30050K .......... .......... .......... .......... .......... 14% 113M 21s\n 30100K .......... .......... .......... .......... .......... 14% 109M 21s\n 30150K .......... .......... .......... .......... .......... 14% 119M 21s\n 30200K .......... .......... .......... .......... .......... 14% 136M 21s\n 30250K .......... .......... .......... .......... .......... 14% 115M 21s\n 30300K .......... .......... .......... .......... .......... 14% 72.8M 21s\n 30350K .......... .......... .......... .......... .......... 14% 124M 21s\n 30400K .......... .......... .......... .......... .......... 14% 94.3M 21s\n 30450K .......... .......... .......... .......... .......... 14% 121M 21s\n 30500K .......... .......... .......... .......... .......... 14% 111M 21s\n 30550K .......... .......... .......... .......... .......... 14% 98.9M 21s\n 30600K .......... .......... .......... .......... .......... 14% 51.3M 21s\n 30650K .......... .......... .......... .......... .......... 14% 131M 21s\n 30700K .......... .......... .......... .......... .......... 14% 137M 21s\n 30750K .......... .......... .......... .......... .......... 14% 143M 21s\n 30800K .......... .......... .......... .......... .......... 14% 118M 21s\n 30850K .......... .......... .......... .......... .......... 14% 152M 21s\n 30900K .......... .......... .......... .......... .......... 14% 143M 21s\n 30950K .......... .......... .......... .......... .......... 14% 111M 21s\n 31000K .......... .......... .......... .......... .......... 14% 114M 21s\n 31050K .......... .......... .......... .......... .......... 14% 99.1M 21s\n 31100K .......... .......... .......... .......... .......... 14% 137M 20s\n 31150K .......... .......... .......... .......... .......... 14% 106M 20s\n 31200K .......... .......... .......... .......... .......... 14% 97.1M 20s\n 31250K .......... .......... .......... .......... .......... 14% 16.6M 20s\n 31300K .......... .......... .......... 
.......... .......... 14% 127M 20s\n 31350K .......... .......... .......... .......... .......... 14% 87.7M 20s\n 31400K .......... .......... .......... .......... .......... 14% 126M 20s\n 31450K .......... .......... .......... .......... .......... 14% 67.6M 20s\n 31500K .......... .......... .......... .......... .......... 15% 66.3M 20s\n 31550K .......... .......... .......... .......... .......... 15% 75.2M 20s\n 31600K .......... .......... .......... .......... .......... 15% 104M 20s\n 31650K .......... .......... .......... .......... .......... 15% 71.2M 20s\n 31700K .......... .......... .......... .......... .......... 15% 247M 20s\n 31750K .......... .......... .......... .......... .......... 15% 729K 20s\n 31800K .......... .......... .......... .......... .......... 15% 129M 20s\n 31850K .......... .......... .......... .......... .......... 15% 178M 20s\n 31900K .......... .......... .......... .......... .......... 15% 133M 20s\n 31950K .......... .......... .......... .......... .......... 15% 170M 20s\n 32000K .......... .......... .......... .......... .......... 15% 183M 20s\n 32050K .......... .......... .......... .......... .......... 15% 144M 20s\n 32100K .......... .......... .......... .......... .......... 15% 213M 20s\n 32150K .......... .......... .......... .......... .......... 15% 125M 20s\n 32200K .......... .......... .......... .......... .......... 15% 130M 20s\n 32250K .......... .......... .......... .......... .......... 15% 130M 20s\n 32300K .......... .......... .......... .......... .......... 15% 149M 20s\n 32350K .......... .......... .......... .......... .......... 15% 221M 20s\n 32400K .......... .......... .......... .......... .......... 15% 111M 20s\n 32450K .......... .......... .......... .......... .......... 15% 137M 20s\n 32500K .......... .......... .......... .......... .......... 15% 124M 20s\n 32550K .......... .......... .......... .......... .......... 15% 79.8M 20s\n 32600K .......... .......... 
.......... .......... .......... 15% 94.3M 20s\n 32650K .......... .......... .......... .......... .......... 15% 88.0M 20s\n 32700K .......... .......... .......... .......... .......... 15% 250M 20s\n 32750K .......... .......... .......... .......... .......... 15% 72.1M 20s\n 32800K .......... .......... .......... .......... .......... 15% 92.8M 20s\n 32850K .......... .......... .......... .......... .......... 15% 97.3M 20s\n 32900K .......... .......... .......... .......... .......... 15% 95.5M 20s\n 32950K .......... .......... .......... .......... .......... 15% 89.5M 20s\n 33000K .......... .......... .......... .......... .......... 15% 115M 20s\n 33050K .......... .......... .......... .......... .......... 15% 127M 20s\n 33100K .......... .......... .......... .......... .......... 15% 193M 19s\n 33150K .......... .......... .......... .......... .......... 15% 99.2M 19s\n 33200K .......... .......... .......... .......... .......... 15% 120M 19s\n 33250K .......... .......... .......... .......... .......... 15% 128M 19s\n 33300K .......... .......... .......... .......... .......... 15% 104M 19s\n 33350K .......... .......... .......... .......... .......... 15% 154M 19s\n 33400K .......... .......... .......... .......... .......... 15% 128M 19s\n 33450K .......... .......... .......... .......... .......... 15% 87.2M 19s\n 33500K .......... .......... .......... .......... .......... 15% 133M 19s\n 33550K .......... .......... .......... .......... .......... 15% 126M 19s\n 33600K .......... .......... .......... .......... .......... 16% 91.8M 19s\n 33650K .......... .......... .......... .......... .......... 16% 105M 19s\n 33700K .......... .......... .......... .......... .......... 16% 136M 19s\n 33750K .......... .......... .......... .......... .......... 16% 124M 19s\n 33800K .......... .......... .......... .......... .......... 16% 116M 19s\n 33850K .......... .......... .......... .......... .......... 
16% 87.5M 19s\n 33900K .......... .......... .......... .......... .......... 16% 112M 19s\n 33950K .......... .......... .......... .......... .......... 16% 52.1M 19s\n 34000K .......... .......... .......... .......... .......... 16% 150M 19s\n 34050K .......... .......... .......... .......... .......... 16% 96.1M 19s\n 34100K .......... .......... .......... .......... .......... 16% 121M 19s\n 34150K .......... .......... .......... .......... .......... 16% 128M 19s\n 34200K .......... .......... .......... .......... .......... 16% 716K 19s\n 34250K .......... .......... .......... .......... .......... 16% 112M 19s\n 34300K .......... .......... .......... .......... .......... 16% 45.7M 19s\n 34350K .......... .......... .......... .......... .......... 16% 71.0M 19s\n 34400K .......... .......... .......... .......... .......... 16% 60.4M 19s\n 34450K .......... .......... .......... .......... .......... 16% 121M 19s\n 34500K .......... .......... .......... .......... .......... 16% 81.9M 19s\n 34550K .......... .......... .......... .......... .......... 16% 42.5M 19s\n 34600K .......... .......... .......... .......... .......... 16% 109M 19s\n 34650K .......... .......... .......... .......... .......... 16% 104M 19s\n 34700K .......... .......... .......... .......... .......... 16% 122M 19s\n 34750K .......... .......... .......... .......... .......... 16% 94.0M 19s\n 34800K .......... .......... .......... .......... .......... 16% 117M 19s\n 34850K .......... .......... .......... .......... .......... 16% 119M 19s\n 34900K .......... .......... .......... .......... .......... 16% 90.8M 19s\n 34950K .......... .......... .......... .......... .......... 16% 130M 19s\n 35000K .......... .......... .......... .......... .......... 16% 104M 19s\n 35050K .......... .......... .......... .......... .......... 16% 115M 19s\n 35100K .......... .......... .......... .......... .......... 16% 91.8M 19s\n 35150K .......... .......... .......... 
.......... .......... 16% 139M 19s\n 35200K .......... .......... .......... .......... .......... 16% 117M 19s\n 35250K .......... .......... .......... .......... .......... 16% 113M 19s\n 35300K .......... .......... .......... .......... .......... 16% 104M 18s\n 35350K .......... .......... .......... .......... .......... 16% 107M 18s\n 35400K .......... .......... .......... .......... .......... 16% 110M 18s\n 35450K .......... .......... .......... .......... .......... 16% 90.2M 18s\n 35500K .......... .......... .......... .......... .......... 16% 142M 18s\n 35550K .......... .......... .......... .......... .......... 16% 52.8M 18s\n 35600K .......... .......... .......... .......... .......... 16% 92.2M 18s\n 35650K .......... .......... .......... .......... .......... 16% 34.9M 18s\n 35700K .......... .......... .......... .......... .......... 17% 217M 18s\n 35750K .......... .......... .......... .......... .......... 17% 278M 18s\n 35800K .......... .......... .......... .......... .......... 17% 149M 18s\n 35850K .......... .......... .......... .......... .......... 17% 104M 18s\n 35900K .......... .......... .......... .......... .......... 17% 89.5M 18s\n 35950K .......... .......... .......... .......... .......... 17% 144M 18s\n 36000K .......... .......... .......... .......... .......... 17% 173M 18s\n 36050K .......... .......... .......... .......... .......... 17% 138M 18s\n 36100K .......... .......... .......... .......... .......... 17% 139M 18s\n 36150K .......... .......... .......... .......... .......... 17% 116M 18s\n 36200K .......... .......... .......... .......... .......... 17% 98.3M 18s\n 36250K .......... .......... .......... .......... .......... 17% 70.6M 18s\n 36300K .......... .......... .......... .......... .......... 17% 104M 18s\n 36350K .......... .......... .......... .......... .......... 17% 132M 18s\n 36400K .......... .......... .......... .......... .......... 17% 83.4M 18s\n 36450K .......... .......... 
.......... .......... .......... 17% 137M 18s\n 36500K .......... .......... .......... .......... .......... 17% 92.3M 18s\n 36550K .......... .......... .......... .......... .......... 17% 132M 18s\n 36600K .......... .......... .......... .......... .......... 17% 103M 18s\n 36650K .......... .......... .......... .......... .......... 17% 127M 18s\n 36700K .......... .......... .......... .......... .......... 17% 90.5M 18s\n 36750K .......... .......... .......... .......... .......... 17% 805K 18s\n 36800K .......... .......... .......... .......... .......... 17% 92.8M 18s\n 36850K .......... .......... .......... .......... .......... 17% 55.3M 18s\n 36900K .......... .......... .......... .......... .......... 17% 110M 18s\n 36950K .......... .......... .......... .......... .......... 17% 137M 18s\n 37000K .......... .......... .......... .......... .......... 17% 51.2M 18s\n 37050K .......... .......... .......... .......... .......... 17% 99.4M 18s\n 37100K .......... .......... .......... .......... .......... 17% 7.14M 18s\n 37150K .......... .......... .......... .......... .......... 17% 103M 18s\n 37200K .......... .......... .......... .......... .......... 17% 121M 18s\n 37250K .......... .......... .......... .......... .......... 17% 100M 18s\n 37300K .......... .......... .......... .......... .......... 17% 113M 18s\n 37350K .......... .......... .......... .......... .......... 17% 98.8M 18s\n 37400K .......... .......... .......... .......... .......... 17% 96.6M 18s\n 37450K .......... .......... .......... .......... .......... 17% 79.2M 18s\n 37500K .......... .......... .......... .......... .......... 17% 111M 18s\n 37550K .......... .......... .......... .......... .......... 17% 104M 18s\n 37600K .......... .......... .......... .......... .......... 17% 85.4M 18s\n 37650K .......... .......... .......... .......... .......... 17% 155M 18s\n 37700K .......... .......... .......... .......... .......... 
17% 92.4M 18s\n 37750K .......... .......... .......... .......... .......... 17% 107M 17s\n 37800K .......... .......... .......... .......... .......... 18% 55.6M 17s\n 37850K .......... .......... .......... .......... .......... 18% 67.4M 17s\n 37900K .......... .......... .......... .......... .......... 18% 87.1M 17s\n 37950K .......... .......... .......... .......... .......... 18% 117M 17s\n 38000K .......... .......... .......... .......... .......... 18% 103M 17s\n 38050K .......... .......... .......... .......... .......... 18% 90.4M 17s\n 38100K .......... .......... .......... .......... .......... 18% 95.1M 17s\n 38150K .......... .......... .......... .......... .......... 18% 83.9M 17s\n 38200K .......... .......... .......... .......... .......... 18% 54.2M 17s\n 38250K .......... .......... .......... .......... .......... 18% 93.1M 17s\n 38300K .......... .......... .......... .......... .......... 18% 86.0M 17s\n 38350K .......... .......... .......... .......... .......... 18% 71.1M 17s\n 38400K .......... .......... .......... .......... .......... 18% 74.0M 17s\n 38450K .......... .......... .......... .......... .......... 18% 75.7M 17s\n 38500K .......... .......... .......... .......... .......... 18% 70.6M 17s\n 38550K .......... .......... .......... .......... .......... 18% 86.2M 17s\n 38600K .......... .......... .......... .......... .......... 18% 63.8M 17s\n 38650K .......... .......... .......... .......... .......... 18% 55.7M 17s\n 38700K .......... .......... .......... .......... .......... 18% 53.5M 17s\n 38750K .......... .......... .......... .......... .......... 18% 68.8M 17s\n 38800K .......... .......... .......... .......... .......... 18% 78.5M 17s\n 38850K .......... .......... .......... .......... .......... 18% 99.1M 17s\n 38900K .......... .......... .......... .......... .......... 18% 225M 17s\n 38950K .......... .......... .......... .......... .......... 18% 244M 17s\n 39000K .......... .......... 
.......... .......... .......... 18% 220M 17s\n 39050K .......... .......... .......... .......... .......... 18% 253M 17s\n 39100K .......... .......... .......... .......... .......... 18% 249M 17s\n 39150K .......... .......... .......... .......... .......... 18% 255M 17s\n 39200K .......... .......... .......... .......... .......... 18% 214M 17s\n 39250K .......... .......... .......... .......... .......... 18% 258M 17s\n 39300K .......... .......... .......... .......... .......... 18% 263M 17s\n 39350K .......... .......... .......... .......... .......... 18% 912K 17s\n 39400K .......... .......... .......... .......... .......... 18% 84.5M 17s\n 39450K .......... .......... .......... .......... .......... 18% 125M 17s\n 39500K .......... .......... .......... .......... .......... 18% 91.7M 17s\n 39550K .......... .......... .......... .......... .......... 18% 73.3M 17s\n 39600K .......... .......... .......... .......... .......... 18% 140M 17s\n 39650K .......... .......... .......... .......... .......... 18% 3.65M 17s\n 39700K .......... .......... .......... .......... .......... 18% 110M 17s\n 39750K .......... .......... .......... .......... .......... 18% 140M 17s\n 39800K .......... .......... .......... .......... .......... 18% 17.6M 17s\n 39850K .......... .......... .......... .......... .......... 18% 135M 17s\n 39900K .......... .......... .......... .......... .......... 19% 92.3M 17s\n 39950K .......... .......... .......... .......... .......... 19% 93.8M 17s\n 40000K .......... .......... .......... .......... .......... 19% 125M 17s\n 40050K .......... .......... .......... .......... .......... 19% 129M 17s\n 40100K .......... .......... .......... .......... .......... 19% 89.3M 17s\n 40150K .......... .......... .......... .......... .......... 19% 108M 17s\n 40200K .......... .......... .......... .......... .......... 19% 124M 17s\n 40250K .......... .......... .......... .......... .......... 19% 110M 17s\n 40300K .......... 
.......... .......... .......... .......... 19% 118M 17s\n 40350K .......... .......... .......... .......... .......... 19% 86.1M 17s\n 40400K .......... .......... .......... .......... .......... 19% 77.4M 16s\n 40450K .......... .......... .......... .......... .......... 19% 69.4M 16s\n 40500K .......... .......... .......... .......... .......... 19% 85.9M 16s\n 40550K .......... .......... .......... .......... .......... 19% 67.5M 16s\n 40600K .......... .......... .......... .......... .......... 19% 83.2M 16s\n 40650K .......... .......... .......... .......... .......... 19% 93.7M 16s\n 40700K .......... .......... .......... .......... .......... 19% 68.3M 16s\n 40750K .......... .......... .......... .......... .......... 19% 80.3M 16s\n 40800K .......... .......... .......... .......... .......... 19% 60.8M 16s\n 40850K .......... .......... .......... .......... .......... 19% 73.7M 16s\n 40900K .......... .......... .......... .......... .......... 19% 92.4M 16s\n 40950K .......... .......... .......... .......... .......... 19% 99.3M 16s\n 41000K .......... .......... .......... .......... .......... 19% 63.3M 16s\n 41050K .......... .......... .......... .......... .......... 19% 89.1M 16s\n 41100K .......... .......... .......... .......... .......... 19% 80.8M 16s\n 41150K .......... .......... .......... .......... .......... 19% 127M 16s\n 41200K .......... .......... .......... .......... .......... 19% 99.6M 16s\n 41250K .......... .......... .......... .......... .......... 19% 83.4M 16s\n 41300K .......... .......... .......... .......... .......... 19% 91.9M 16s\n 41350K .......... .......... .......... .......... .......... 19% 105M 16s\n 41400K .......... .......... .......... .......... .......... 19% 142M 16s\n 41450K .......... .......... .......... .......... .......... 19% 92.4M 16s\n 41500K .......... .......... .......... .......... .......... 19% 104M 16s\n 41550K .......... .......... .......... .......... .......... 
19% 86.8M 16s\n 41600K .......... .......... .......... .......... .......... 19% 75.4M 16s\n 41650K .......... .......... .......... .......... .......... 19% 192M 16s\n 41700K .......... .......... .......... .......... .......... 19% 251M 16s\n 41750K .......... .......... .......... .......... .......... 19% 268M 16s\n 41800K .......... .......... .......... .......... .......... 19% 214M 16s\n 41850K .......... .......... .......... .......... .......... 19% 277M 16s\n 41900K .......... .......... .......... .......... .......... 19% 259M 16s\n 41950K .......... .......... .......... .......... .......... 19% 250M 16s\n 42000K .......... .......... .......... .......... .......... 20% 1.04M 16s\n 42050K .......... .......... .......... .......... .......... 20% 78.8M 16s\n 42100K .......... .......... .......... .......... .......... 20% 68.5M 16s\n 42150K .......... .......... .......... .......... .......... 20% 77.5M 16s\n 42200K .......... .......... .......... .......... .......... 20% 232M 16s\n 42250K .......... .......... .......... .......... .......... 20% 2.22M 16s\n 42300K .......... .......... .......... .......... .......... 20% 72.6M 16s\n 42350K .......... .......... .......... .......... .......... 20% 74.7M 16s\n 42400K .......... .......... .......... .......... .......... 20% 60.2M 16s\n 42450K .......... .......... .......... .......... .......... 20% 74.1M 16s\n 42500K .......... .......... .......... .......... .......... 20% 93.1M 16s\n 42550K .......... .......... .......... .......... .......... 20% 86.1M 16s\n 42600K .......... .......... .......... .......... .......... 20% 73.8M 16s\n 42650K .......... .......... .......... .......... .......... 20% 74.8M 16s\n 42700K .......... .......... .......... .......... .......... 20% 82.0M 16s\n 42750K .......... .......... .......... .......... .......... 20% 99.3M 16s\n 42800K .......... .......... .......... .......... .......... 20% 93.6M 16s\n 42850K .......... .......... .......... 
.......... .......... 20% 83.8M 16s\n 42900K .......... .......... .......... .......... .......... 20% 83.1M 16s\n 42950K .......... .......... .......... .......... .......... 20% 78.8M 16s\n 43000K .......... .......... .......... .......... .......... 20% 79.4M 16s\n 43050K .......... .......... .......... .......... .......... 20% 93.8M 16s\n 43100K .......... .......... .......... .......... .......... 20% 92.4M 16s\n 43150K .......... .......... .......... .......... .......... 20% 85.1M 16s\n 43200K .......... .......... .......... .......... .......... 20% 74.1M 16s\n 43250K .......... .......... .......... .......... .......... 20% 78.1M 16s\n 43300K .......... .......... .......... .......... .......... 20% 81.7M 15s\n 43350K .......... .......... .......... .......... .......... 20% 94.0M 15s\n 43400K .......... .......... .......... .......... .......... 20% 81.2M 15s\n 43450K .......... .......... .......... .......... .......... 20% 128M 15s\n 43500K .......... .......... .......... .......... .......... 20% 123M 15s\n 43550K .......... .......... .......... .......... .......... 20% 158M 15s\n 43600K .......... .......... .......... .......... .......... 20% 98.9M 15s\n 43650K .......... .......... .......... .......... .......... 20% 252M 15s\n 43700K .......... .......... .......... .......... .......... 20% 80.5M 15s\n 43750K .......... .......... .......... .......... .......... 20% 73.2M 15s\n 43800K .......... .......... .......... .......... .......... 20% 59.1M 15s\n 43850K .......... .......... .......... .......... .......... 20% 118M 15s\n 43900K .......... .......... .......... .......... .......... 20% 71.7M 15s\n 43950K .......... .......... .......... .......... .......... 20% 79.7M 15s\n 44000K .......... .......... .......... .......... .......... 20% 213M 15s\n 44050K .......... .......... .......... .......... .......... 20% 146M 15s\n 44100K .......... .......... .......... .......... .......... 21% 161M 15s\n 44150K .......... 
.......... .......... .......... .......... 21% 252M 15s\n 44200K .......... .......... .......... .......... .......... 21% 84.7M 15s\n 44250K .......... .......... .......... .......... .......... 21% 267M 15s\n 44300K .......... .......... .......... .......... .......... 21% 94.2M 15s\n 44350K .......... .......... .......... .......... .......... 21% 83.6M 15s\n 44400K .......... .......... .......... .......... .......... 21% 74.8M 15s\n 44450K .......... .......... .......... .......... .......... 21% 117M 15s\n 44500K .......... .......... .......... .......... .......... 21% 252M 15s\n 44550K .......... .......... .......... .......... .......... 21% 213M 15s\n 44600K .......... .......... .......... .......... .......... 21% 242M 15s\n 44650K .......... .......... .......... .......... .......... 21% 253M 15s\n 44700K .......... .......... .......... .......... .......... 21% 1.24M 15s\n 44750K .......... .......... .......... .......... .......... 21% 63.6M 15s\n 44800K .......... .......... .......... .......... .......... 21% 91.2M 15s\n 44850K .......... .......... .......... .......... .......... 21% 188M 15s\n 44900K .......... .......... .......... .......... .......... 21% 1.73M 15s\n 44950K .......... .......... .......... .......... .......... 21% 57.2M 15s\n 45000K .......... .......... .......... .......... .......... 21% 80.9M 15s\n 45050K .......... .......... .......... .......... .......... 21% 68.2M 15s\n 45100K .......... .......... .......... .......... .......... 21% 113M 15s\n 45150K .......... .......... .......... .......... .......... 21% 61.5M 15s\n 45200K .......... .......... .......... .......... .......... 21% 120M 15s\n 45250K .......... .......... .......... .......... .......... 21% 148M 15s\n 45300K .......... .......... .......... .......... .......... 21% 120M 15s\n 45350K .......... .......... .......... .......... .......... 21% 115M 15s\n 45400K .......... .......... .......... .......... .......... 
21% 74.6M 15s\n 45450K .......... .......... .......... .......... .......... 21% 99.5M 15s\n 45500K .......... .......... .......... .......... .......... 21% 74.5M 15s\n 45550K .......... .......... .......... .......... .......... 21% 85.9M 15s\n 45600K .......... .......... .......... .......... .......... 21% 89.5M 15s\n 45650K .......... .......... .......... .......... .......... 21% 109M 15s\n 45700K .......... .......... .......... .......... .......... 21% 97.0M 15s\n 45750K .......... .......... .......... .......... .......... 21% 83.0M 15s\n 45800K .......... .......... .......... .......... .......... 21% 151M 15s\n 45850K .......... .......... .......... .......... .......... 21% 188M 15s\n 45900K .......... .......... .......... .......... .......... 21% 250M 15s\n 45950K .......... .......... .......... .......... .......... 21% 290M 15s\n 46000K .......... .......... .......... .......... .......... 21% 125M 15s\n 46050K .......... .......... .......... .......... .......... 21% 118M 15s\n 46100K .......... .......... .......... .......... .......... 21% 135M 15s\n 46150K .......... .......... .......... .......... .......... 21% 88.6M 15s\n 46200K .......... .......... .......... .......... .......... 22% 176M 15s\n 46250K .......... .......... .......... .......... .......... 22% 331M 15s\n 46300K .......... .......... .......... .......... .......... 22% 79.5M 15s\n 46350K .......... .......... .......... .......... .......... 22% 133M 15s\n 46400K .......... .......... .......... .......... .......... 22% 123M 15s\n 46450K .......... .......... .......... .......... .......... 22% 89.4M 15s\n 46500K .......... .......... .......... .......... .......... 22% 153M 14s\n 46550K .......... .......... .......... .......... .......... 22% 77.2M 14s\n 46600K .......... .......... .......... .......... .......... 22% 101M 14s\n 46650K .......... .......... .......... .......... .......... 22% 159M 14s\n 46700K .......... .......... .......... 
.......... .......... 22% 114M 14s\n 46750K .......... .......... .......... .......... .......... 22% 96.5M 14s\n 46800K .......... .......... .......... .......... .......... 22% 70.3M 14s\n 46850K .......... .......... .......... .......... .......... 22% 84.4M 14s\n 46900K .......... .......... .......... .......... .......... 22% 101M 14s\n 46950K .......... .......... .......... .......... .......... 22% 99.9M 14s\n 47000K .......... .......... .......... .......... .......... 22% 103M 14s\n 47050K .......... .......... .......... .......... .......... 22% 106M 14s\n 47100K .......... .......... .......... .......... .......... 22% 130M 14s\n 47150K .......... .......... .......... .......... .......... 22% 267M 14s\n 47200K .......... .......... .......... .......... .......... 22% 716K 14s\n 47250K .......... .......... .......... .......... .......... 22% 103M 14s\n 47300K .......... .......... .......... .......... .......... 22% 49.0M 14s\n 47350K .......... .......... .......... .......... .......... 22% 95.5M 14s\n 47400K .......... .......... .......... .......... .......... 22% 132M 14s\n 47450K .......... .......... .......... .......... .......... 22% 157M 14s\n 47500K .......... .......... .......... .......... .......... 22% 130M 14s\n 47550K .......... .......... .......... .......... .......... 22% 225M 14s\n 47600K .......... .......... .......... .......... .......... 22% 85.2M 14s\n 47650K .......... .......... .......... .......... .......... 22% 100M 14s\n 47700K .......... .......... .......... .......... .......... 22% 107M 14s\n 47750K .......... .......... .......... .......... .......... 22% 78.5M 14s\n 47800K .......... .......... .......... .......... .......... 22% 115M 14s\n 47850K .......... .......... .......... .......... .......... 22% 171M 14s\n 47900K .......... .......... .......... .......... .......... 22% 68.6M 14s\n 47950K .......... .......... .......... .......... .......... 22% 63.7M 14s\n 48000K .......... 
.......... .......... .......... .......... 22% 112M 14s\n 48050K .......... .......... .......... .......... .......... 22% 127M 14s\n 48100K .......... .......... .......... .......... .......... 22% 79.7M 14s\n 48150K .......... .......... .......... .......... .......... 22% 89.8M 14s\n 48200K .......... .......... .......... .......... .......... 22% 64.8M 14s\n 48250K .......... .......... .......... .......... .......... 22% 74.0M 14s\n 48300K .......... .......... .......... .......... .......... 23% 106M 14s\n 48350K .......... .......... .......... .......... .......... 23% 91.0M 14s\n 48400K .......... .......... .......... .......... .......... 23% 89.5M 14s\n 48450K .......... .......... .......... .......... .......... 23% 75.0M 14s\n 48500K .......... .......... .......... .......... .......... 23% 83.3M 14s\n 48550K .......... .......... .......... .......... .......... 23% 99.7M 14s\n 48600K .......... .......... .......... .......... .......... 23% 130M 14s\n 48650K .......... .......... .......... .......... .......... 23% 77.0M 14s\n 48700K .......... .......... .......... .......... .......... 23% 166M 14s\n 48750K .......... .......... .......... .......... .......... 23% 129M 14s\n 48800K .......... .......... .......... .......... .......... 23% 71.0M 14s\n 48850K .......... .......... .......... .......... .......... 23% 85.6M 14s\n 48900K .......... .......... .......... .......... .......... 23% 102M 14s\n 48950K .......... .......... .......... .......... .......... 23% 115M 14s\n 49000K .......... .......... .......... .......... .......... 23% 76.4M 14s\n 49050K .......... .......... .......... .......... .......... 23% 96.0M 14s\n 49100K .......... .......... .......... .......... .......... 23% 95.7M 14s\n 49150K .......... .......... .......... .......... .......... 23% 101M 14s\n 49200K .......... .......... .......... .......... .......... 23% 174M 14s\n 49250K .......... .......... .......... .......... .......... 
23% 258M 14s\n 49300K .......... .......... .......... .......... .......... 23% 255M 14s\n 49350K .......... .......... .......... .......... .......... 23% 201M 14s\n 49400K .......... .......... .......... .......... .......... 23% 245M 14s\n 49450K .......... .......... .......... .......... .......... 23% 723K 14s\n 49500K .......... .......... .......... .......... .......... 23% 83.3M 14s\n 49550K .......... .......... .......... .......... .......... 23% 82.8M 14s\n 49600K .......... .......... .......... .......... .......... 23% 70.1M 14s\n 49650K .......... .......... .......... .......... .......... 23% 96.0M 14s\n 49700K .......... .......... .......... .......... .......... 23% 74.5M 14s\n 49750K .......... .......... .......... .......... .......... 23% 104M 14s\n 49800K .......... .......... .......... .......... .......... 23% 151M 14s\n 49850K .......... .......... .......... .......... .......... 23% 106M 14s\n 49900K .......... .......... .......... .......... .......... 23% 118M 14s\n 49950K .......... .......... .......... .......... .......... 23% 72.0M 14s\n 50000K .......... .......... .......... .......... .......... 23% 109M 14s\n 50050K .......... .......... .......... .......... .......... 23% 86.5M 14s\n 50100K .......... .......... .......... .......... .......... 23% 115M 14s\n 50150K .......... .......... .......... .......... .......... 23% 76.7M 14s\n 50200K .......... .......... .......... .......... .......... 23% 91.2M 14s\n 50250K .......... .......... .......... .......... .......... 23% 92.0M 14s\n 50300K .......... .......... .......... .......... .......... 23% 95.3M 14s\n 50350K .......... .......... .......... .......... .......... 23% 102M 14s\n 50400K .......... .......... .......... .......... .......... 24% 86.6M 14s\n 50450K .......... .......... .......... .......... .......... 24% 94.8M 14s\n 50500K .......... .......... .......... .......... .......... 24% 124M 14s\n 50550K .......... .......... .......... 
.......... .......... 24% 108M 14s\n 50600K .......... .......... .......... .......... .......... 24% 109M 14s\n 50650K .......... .......... .......... .......... .......... 24% 109M 14s\n 50700K .......... .......... .......... .......... .......... 24% 91.2M 14s\n 50750K .......... .......... .......... .......... .......... 24% 100M 13s\n 50800K .......... .......... .......... .......... .......... 24% 99.6M 13s\n 50850K .......... .......... .......... .......... .......... 24% 93.1M 13s\n 50900K .......... .......... .......... .......... .......... 24% 125M 13s\n 50950K .......... .......... .......... .......... .......... 24% 71.1M 13s\n 51000K .......... .......... .......... .......... .......... 24% 99.4M 13s\n 51050K .......... .......... .......... .......... .......... 24% 111M 13s\n 51100K .......... .......... .......... .......... .......... 24% 66.9M 13s\n 51150K .......... .......... .......... .......... .......... 24% 77.5M 13s\n 51200K .......... .......... .......... .......... .......... 24% 106M 13s\n 51250K .......... .......... .......... .......... .......... 24% 98.5M 13s\n 51300K .......... .......... .......... .......... .......... 24% 172M 13s\n 51350K .......... .......... .......... .......... .......... 24% 89.9M 13s\n 51400K .......... .......... .......... .......... .......... 24% 108M 13s\n 51450K .......... .......... .......... .......... .......... 24% 48.7M 13s\n 51500K .......... .......... .......... .......... .......... 24% 48.4M 13s\n 51550K .......... .......... .......... .......... .......... 24% 73.5M 13s\n 51600K .......... .......... .......... .......... .......... 24% 119M 13s\n 51650K .......... .......... .......... .......... .......... 24% 269M 13s\n 51700K .......... .......... .......... .......... .......... 24% 100M 13s\n 51750K .......... .......... .......... .......... .......... 24% 79.1M 13s\n 51800K .......... .......... .......... .......... .......... 24% 76.2M 13s\n 51850K .......... 
.......... .......... .......... .......... 24% 102M 13s\n 51900K .......... .......... .......... .......... .......... 24% 107M 13s\n 51950K .......... .......... .......... .......... .......... 24% 120M 13s\n 52000K .......... .......... .......... .......... .......... 24% 224M 13s\n 52050K .......... .......... .......... .......... .......... 24% 255M 13s\n 52100K .......... .......... .......... .......... .......... 24% 250M 13s\n 52150K .......... .......... .......... .......... .......... 24% 233M 13s\n 52200K .......... .......... .......... .......... .......... 24% 727K 13s\n 52250K .......... .......... .......... .......... .......... 24% 116M 13s\n 52300K .......... .......... .......... .......... .......... 24% 118M 13s\n 52350K .......... .......... .......... .......... .......... 24% 108M 13s\n 52400K .......... .......... .......... .......... .......... 24% 103M 13s\n 52450K .......... .......... .......... .......... .......... 24% 99.0M 13s\n 52500K .......... .......... .......... .......... .......... 25% 114M 13s\n 52550K .......... .......... .......... .......... .......... 25% 116M 13s\n 52600K .......... .......... .......... .......... .......... 25% 110M 13s\n 52650K .......... .......... .......... .......... .......... 25% 112M 13s\n 52700K .......... .......... .......... .......... .......... 25% 105M 13s\n 52750K .......... .......... .......... .......... .......... 25% 101M 13s\n 52800K .......... .......... .......... .......... .......... 25% 148M 13s\n 52850K .......... .......... .......... .......... .......... 25% 105M 13s\n 52900K .......... .......... .......... .......... .......... 25% 65.6M 13s\n 52950K .......... .......... .......... .......... .......... 25% 82.8M 13s\n 53000K .......... .......... .......... .......... .......... 25% 108M 13s\n 53050K .......... .......... .......... .......... .......... 25% 99.0M 13s\n 53100K .......... .......... .......... .......... .......... 
25% 66.9M 13s\n 53150K .......... .......... .......... .......... .......... 25% 98.3M 13s\n 53200K .......... .......... .......... .......... .......... 25% 79.8M 13s\n 53250K .......... .......... .......... .......... .......... 25% 93.0M 13s\n 53300K .......... .......... .......... .......... .......... 25% 95.9M 13s\n 53350K .......... .......... .......... .......... .......... 25% 104M 13s\n 53400K .......... .......... .......... .......... .......... 25% 80.1M 13s\n 53450K .......... .......... .......... .......... .......... 25% 89.9M 13s\n 53500K .......... .......... .......... .......... .......... 25% 110M 13s\n 53550K .......... .......... .......... .......... .......... 25% 100M 13s\n 53600K .......... .......... .......... .......... .......... 25% 124M 13s\n 53650K .......... .......... .......... .......... .......... 25% 106M 13s\n 53700K .......... .......... .......... .......... .......... 25% 96.1M 13s\n 53750K .......... .......... .......... .......... .......... 25% 94.6M 13s\n 53800K .......... .......... .......... .......... .......... 25% 94.1M 13s\n 53850K .......... .......... .......... .......... .......... 25% 94.2M 13s\n 53900K .......... .......... .......... .......... .......... 25% 95.0M 13s\n 53950K .......... .......... .......... .......... .......... 25% 122M 13s\n 54000K .......... .......... .......... .......... .......... 25% 97.2M 13s\n 54050K .......... .......... .......... .......... .......... 25% 136M 13s\n 54100K .......... .......... .......... .......... .......... 25% 144M 13s\n 54150K .......... .......... .......... .......... .......... 25% 118M 13s\n 54200K .......... .......... .......... .......... .......... 25% 149M 13s\n 54250K .......... .......... .......... .......... .......... 25% 156M 13s\n 54300K .......... .......... .......... .......... .......... 25% 96.3M 13s\n 54350K .......... .......... .......... .......... .......... 25% 97.5M 13s\n 54400K .......... .......... .......... 
.......... .......... 25% 100M 13s\n 54450K .......... .......... .......... .......... .......... 25% 102M 13s\n 54500K .......... .......... .......... .......... .......... 25% 83.0M 13s\n 54550K .......... .......... .......... .......... .......... 25% 102M 13s\n 54600K .......... .......... .......... .......... .......... 26% 96.2M 13s\n 54650K .......... .......... .......... .......... .......... 26% 103M 13s\n 54700K .......... .......... .......... .......... .......... 26% 145M 13s\n 54750K .......... .......... .......... .......... .......... 26% 725K 13s\n 54800K .......... .......... .......... .......... .......... 26% 93.3M 13s\n 54850K .......... .......... .......... .......... .......... 26% 119M 13s\n 54900K .......... .......... .......... .......... .......... 26% 110M 13s\n 54950K .......... .......... .......... .......... .......... 26% 124M 13s\n 55000K .......... .......... .......... .......... .......... 26% 87.4M 13s\n 55050K .......... .......... .......... .......... .......... 26% 129M 13s\n 55100K .......... .......... .......... .......... .......... 26% 105M 13s\n 55150K .......... .......... .......... .......... .......... 26% 132M 13s\n 55200K .......... .......... .......... .......... .......... 26% 104M 13s\n 55250K .......... .......... .......... .......... .......... 26% 84.2M 13s\n 55300K .......... .......... .......... .......... .......... 26% 132M 13s\n 55350K .......... .......... .......... .......... .......... 26% 99.2M 13s\n 55400K .......... .......... .......... .......... .......... 26% 97.8M 13s\n 55450K .......... .......... .......... .......... .......... 26% 135M 12s\n 55500K .......... .......... .......... .......... .......... 26% 142M 12s\n 55550K .......... .......... .......... .......... .......... 26% 88.1M 12s\n 55600K .......... .......... .......... .......... .......... 26% 117M 12s\n 55650K .......... .......... .......... .......... .......... 26% 114M 12s\n 55700K .......... .......... 
.......... .......... .......... 26% 144M 12s\n 55750K .......... .......... .......... .......... .......... 26% 103M 12s\n 55800K .......... .......... .......... .......... .......... 26% 107M 12s\n 55850K .......... .......... .......... .......... .......... 26% 101M 12s\n 55900K .......... .......... .......... .......... .......... 26% 66.2M 12s\n 55950K .......... .......... .......... .......... .......... 26% 74.3M 12s\n 56000K .......... .......... .......... .......... .......... 26% 65.0M 12s\n 56050K .......... .......... .......... .......... .......... 26% 63.0M 12s\n 56100K .......... .......... .......... .......... .......... 26% 81.5M 12s\n 56150K .......... .......... .......... .......... .......... 26% 79.7M 12s\n 56200K .......... .......... .......... .......... .......... 26% 112M 12s\n 56250K .......... .......... .......... .......... .......... 26% 87.9M 12s\n 56300K .......... .......... .......... .......... .......... 26% 80.7M 12s\n 56350K .......... .......... .......... .......... .......... 26% 74.2M 12s\n 56400K .......... .......... .......... .......... .......... 26% 86.2M 12s\n 56450K .......... .......... .......... .......... .......... 26% 107M 12s\n 56500K .......... .......... .......... .......... .......... 26% 77.1M 12s\n 56550K .......... .......... .......... .......... .......... 26% 85.1M 12s\n 56600K .......... .......... .......... .......... .......... 26% 64.6M 12s\n 56650K .......... .......... .......... .......... .......... 26% 78.8M 12s\n 56700K .......... .......... .......... .......... .......... 27% 99.4M 12s\n 56750K .......... .......... .......... .......... .......... 27% 87.8M 12s\n 56800K .......... .......... .......... .......... .......... 27% 157M 12s\n 56850K .......... .......... .......... .......... .......... 27% 255M 12s\n 56900K .......... .......... .......... .......... .......... 27% 269M 12s\n 56950K .......... .......... .......... .......... .......... 
27% 204M 12s\n 57000K .......... .......... .......... .......... .......... 27% 251M 12s\n 57050K .......... .......... .......... .......... .......... 27% 257M 12s\n 57100K .......... .......... .......... .......... .......... 27% 725K 12s\n 57150K .......... .......... .......... .......... .......... 27% 68.4M 12s\n 57200K .......... .......... .......... .......... .......... 27% 81.1M 12s\n 57250K .......... .......... .......... .......... .......... 27% 81.6M 12s\n 57300K .......... .......... .......... .......... .......... 27% 82.9M 12s\n 57350K .......... .......... .......... .......... .......... 27% 85.7M 12s\n 57400K .......... .......... .......... .......... .......... 27% 85.9M 12s\n 57450K .......... .......... .......... .......... .......... 27% 103M 12s\n 57500K .......... .......... .......... .......... .......... 27% 87.8M 12s\n 57550K .......... .......... .......... .......... .......... 27% 76.0M 12s\n 57600K .......... .......... .......... .......... .......... 27% 88.5M 12s\n 57650K .......... .......... .......... .......... .......... 27% 112M 12s\n 57700K .......... .......... .......... .......... .......... 27% 46.2M 12s\n 57750K .......... .......... .......... .......... .......... 27% 49.7M 12s\n 57800K .......... .......... .......... .......... .......... 27% 48.7M 12s\n 57850K .......... .......... .......... .......... .......... 27% 53.6M 12s\n 57900K .......... .......... .......... .......... .......... 27% 63.3M 12s\n 57950K .......... .......... .......... .......... .......... 27% 53.5M 12s\n 58000K .......... .......... .......... .......... .......... 27% 51.0M 12s\n 58050K .......... .......... .......... .......... .......... 27% 63.0M 12s\n 58100K .......... .......... .......... .......... .......... 27% 61.9M 12s\n 58150K .......... .......... .......... .......... .......... 27% 87.4M 12s\n 58200K .......... .......... .......... .......... .......... 27% 58.7M 12s\n 58250K .......... .......... .......... 
.......... .......... 27% 69.8M 12s\n 58300K .......... .......... .......... .......... .......... 27% 128M 12s\n 58350K .......... .......... .......... .......... .......... 27% 93.8M 12s\n 58400K .......... .......... .......... .......... .......... 27% 81.4M 12s\n 58450K .......... .......... .......... .......... .......... 27% 146M 12s\n 58500K .......... .......... .......... .......... .......... 27% 122M 12s\n 58550K .......... .......... .......... .......... .......... 27% 100M 12s\n 58600K .......... .......... .......... .......... .......... 27% 85.9M 12s\n 58650K .......... .......... .......... .......... .......... 27% 79.1M 12s\n 58700K .......... .......... .......... .......... .......... 27% 106M 12s\n 58750K .......... .......... .......... .......... .......... 27% 217M 12s\n 58800K .......... .......... .......... .......... .......... 28% 276M 12s\n 58850K .......... .......... .......... .......... .......... 28% 259M 12s\n 58900K .......... .......... .......... .......... .......... 28% 174M 12s\n 58950K .......... .......... .......... .......... .......... 28% 220M 12s\n 59000K .......... .......... .......... .......... .......... 28% 239M 12s\n 59050K .......... .......... .......... .......... .......... 28% 264M 12s\n 59100K .......... .......... .......... .......... .......... 28% 258M 12s\n 59150K .......... .......... .......... .......... .......... 28% 222M 12s\n 59200K .......... .......... .......... .......... .......... 28% 278M 12s\n 59250K .......... .......... .......... .......... .......... 28% 251M 12s\n 59300K .......... .......... .......... .......... .......... 28% 261M 12s\n 59350K .......... .......... .......... .......... .......... 28% 749K 12s\n 59400K .......... .......... .......... .......... .......... 28% 77.0M 12s\n 59450K .......... .......... .......... .......... .......... 28% 54.2M 12s\n 59500K .......... .......... .......... .......... .......... 28% 65.6M 12s\n 59550K .......... .......... 
.......... .......... .......... 28% 82.6M 12s\n 59600K .......... .......... .......... .......... .......... 28% 99.4M 12s\n 59650K .......... .......... .......... .......... .......... 28% 79.4M 12s\n 59700K .......... .......... .......... .......... .......... 28% 84.0M 12s\n 59750K .......... .......... .......... .......... .......... 28% 95.8M 12s\n 59800K .......... .......... .......... .......... .......... 28% 88.7M 12s\n 59850K .......... .......... .......... .......... .......... 28% 71.6M 12s\n 59900K .......... .......... .......... .......... .......... 28% 68.7M 12s\n 59950K .......... .......... .......... .......... .......... 28% 77.2M 12s\n 60000K .......... .......... .......... .......... .......... 28% 91.1M 12s\n 60050K .......... .......... .......... .......... .......... 28% 72.2M 12s\n 60100K .......... .......... .......... .......... .......... 28% 70.8M 12s\n 60150K .......... .......... .......... .......... .......... 28% 111M 12s\n 60200K .......... .......... .......... .......... .......... 28% 105M 12s\n 60250K .......... .......... .......... .......... .......... 28% 162M 12s\n 60300K .......... .......... .......... .......... .......... 28% 108M 12s\n 60350K .......... .......... .......... .......... .......... 28% 95.6M 12s\n 60400K .......... .......... .......... .......... .......... 28% 102M 12s\n 60450K .......... .......... .......... .......... .......... 28% 86.7M 12s\n 60500K .......... .......... .......... .......... .......... 28% 155M 12s\n 60550K .......... .......... .......... .......... .......... 28% 101M 12s\n 60600K .......... .......... .......... .......... .......... 28% 194M 12s\n 60650K .......... .......... .......... .......... .......... 28% 157M 12s\n 60700K .......... .......... .......... .......... .......... 28% 68.7M 11s\n 60750K .......... .......... .......... .......... .......... 28% 88.7M 11s\n 60800K .......... .......... .......... .......... .......... 
28% 81.1M 11s\n 60850K .......... .......... .......... .......... .......... 28% 82.6M 11s\n 60900K .......... .......... .......... .......... .......... 29% 81.6M 11s\n 60950K .......... .......... .......... .......... .......... 29% 72.9M 11s\n 61000K .......... .......... .......... .......... .......... 29% 88.2M 11s\n 61050K .......... .......... .......... .......... .......... 29% 68.2M 11s\n 61100K .......... .......... .......... .......... .......... 29% 74.0M 11s\n 61150K .......... .......... .......... .......... .......... 29% 84.5M 11s\n 61200K .......... .......... .......... .......... .......... 29% 95.2M 11s\n 61250K .......... .......... .......... .......... .......... 29% 69.3M 11s\n 61300K .......... .......... .......... .......... .......... 29% 108M 11s\n 61350K .......... .......... .......... .......... .......... 29% 86.4M 11s\n 61400K .......... .......... .......... .......... .......... 29% 78.9M 11s\n 61450K .......... .......... .......... .......... .......... 29% 91.5M 11s\n 61500K .......... .......... .......... .......... .......... 29% 144M 11s\n 61550K .......... .......... .......... .......... .......... 29% 167M 11s\n 61600K .......... .......... .......... .......... .......... 29% 250M 11s\n 61650K .......... .......... .......... .......... .......... 29% 131M 11s\n 61700K .......... .......... .......... .......... .......... 29% 258M 11s\n 61750K .......... .......... .......... .......... .......... 29% 757K 11s\n 61800K .......... .......... .......... .......... .......... 29% 58.2M 11s\n 61850K .......... .......... .......... .......... .......... 29% 126M 11s\n 61900K .......... .......... .......... .......... .......... 29% 78.6M 11s\n 61950K .......... .......... .......... .......... .......... 29% 51.8M 11s\n 62000K .......... .......... .......... .......... .......... 29% 51.5M 11s\n 62050K .......... .......... .......... .......... .......... 29% 55.8M 11s\n 62100K .......... .......... .......... 
.......... .......... 29% 58.7M 11s\n 62150K .......... .......... .......... .......... .......... 29% 107M 11s\n 62200K .......... .......... .......... .......... .......... 29% 116M 11s\n 62250K .......... .......... .......... .......... .......... 29% 67.9M 11s\n 62300K .......... .......... .......... .......... .......... 29% 80.7M 11s\n 62350K .......... .......... .......... .......... .......... 29% 98.6M 11s\n 62400K .......... .......... .......... .......... .......... 29% 81.3M 11s\n 62450K .......... .......... .......... .......... .......... 29% 76.2M 11s\n 62500K .......... .......... .......... .......... .......... 29% 86.0M 11s\n 62550K .......... .......... .......... .......... .......... 29% 83.8M 11s\n 62600K .......... .......... .......... .......... .......... 29% 93.7M 11s\n 62650K .......... .......... .......... .......... .......... 29% 82.4M 11s\n 62700K .......... .......... .......... .......... .......... 29% 66.7M 11s\n 62750K .......... .......... .......... .......... .......... 29% 69.8M 11s\n 62800K .......... .......... .......... .......... .......... 29% 91.1M 11s\n 62850K .......... .......... .......... .......... .......... 29% 89.1M 11s\n 62900K .......... .......... .......... .......... .......... 29% 103M 11s\n 62950K .......... .......... .......... .......... .......... 29% 154M 11s\n 63000K .......... .......... .......... .......... .......... 30% 117M 11s\n 63050K .......... .......... .......... .......... .......... 30% 154M 11s\n 63100K .......... .......... .......... .......... .......... 30% 174M 11s\n 63150K .......... .......... .......... .......... .......... 30% 128M 11s\n 63200K .......... .......... .......... .......... .......... 30% 135M 11s\n 63250K .......... .......... .......... .......... .......... 30% 113M 11s\n 63300K .......... .......... .......... .......... .......... 30% 83.4M 11s\n 63350K .......... .......... .......... .......... .......... 30% 64.8M 11s\n 63400K .......... 
.......... .......... .......... .......... 30% 106M 11s\n 63450K .......... .......... .......... .......... .......... 30% 96.7M 11s\n 63500K .......... .......... .......... .......... .......... 30% 78.9M 11s\n 63550K .......... .......... .......... .......... .......... 30% 77.6M 11s\n 63600K .......... .......... .......... .......... .......... 30% 81.0M 11s\n 63650K .......... .......... .......... .......... .......... 30% 87.1M 11s\n 63700K .......... .......... .......... .......... .......... 30% 96.6M 11s\n 63750K .......... .......... .......... .......... .......... 30% 317M 11s\n 63800K .......... .......... .......... .......... .......... 30% 331M 11s\n 63850K .......... .......... .......... .......... .......... 30% 747K 11s\n 63900K .......... .......... .......... .......... .......... 30% 74.6M 11s\n 63950K .......... .......... .......... .......... .......... 30% 52.7M 11s\n 64000K .......... .......... .......... .......... .......... 30% 88.0M 11s\n 64050K .......... .......... .......... .......... .......... 30% 118M 11s\n 64100K .......... .......... .......... .......... .......... 30% 98.1M 11s\n 64150K .......... .......... .......... .......... .......... 30% 190M 11s\n 64200K .......... .......... .......... .......... .......... 30% 223M 11s\n 64250K .......... .......... .......... .......... .......... 30% 181M 11s\n 64300K .......... .......... .......... .......... .......... 30% 179M 11s\n 64350K .......... .......... .......... .......... .......... 30% 187M 11s\n 64400K .......... .......... .......... .......... .......... 30% 88.4M 11s\n 64450K .......... .......... .......... .......... .......... 30% 50.9M 11s\n 64500K .......... .......... .......... .......... .......... 30% 90.4M 11s\n 64550K .......... .......... .......... .......... .......... 30% 79.4M 11s\n 64600K .......... .......... .......... .......... .......... 30% 93.7M 11s\n 64650K .......... .......... .......... .......... .......... 
30% 119M 11s\n 64700K .......... .......... .......... .......... .......... 30% 131M 11s\n 64750K .......... .......... .......... .......... .......... 30% 106M 11s\n 64800K .......... .......... .......... .......... .......... 30% 94.7M 11s\n 64850K .......... .......... .......... .......... .......... 30% 95.1M 11s\n 64900K .......... .......... .......... .......... .......... 30% 121M 11s\n 64950K .......... .......... .......... .......... .......... 30% 131M 11s\n 65000K .......... .......... .......... .......... .......... 30% 122M 11s\n 65050K .......... .......... .......... .......... .......... 30% 122M 11s\n 65100K .......... .......... .......... .......... .......... 31% 168M 11s\n 65150K .......... .......... .......... .......... .......... 31% 115M 11s\n 65200K .......... .......... .......... .......... .......... 31% 165M 11s\n 65250K .......... .......... .......... .......... .......... 31% 116M 11s\n 65300K .......... .......... .......... .......... .......... 31% 127M 11s\n 65350K .......... .......... .......... .......... .......... 31% 85.7M 11s\n 65400K .......... .......... .......... .......... .......... 31% 152M 11s\n 65450K .......... .......... .......... .......... .......... 31% 211M 11s\n 65500K .......... .......... .......... .......... .......... 31% 13.8M 11s\n 65550K .......... .......... .......... .......... .......... 31% 81.8M 11s" + }, + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"result-alert alert-warning\" role\u003d\"alert\"\u003e\u003cbutton type\u003d\"button\" class\u003d\"close\" data-dismiss\u003d\"alert\" aria-label\u003d\"Close\"\u003e\u003cspan aria-hidden\u003d\"true\"\u003e\u0026times;\u003c/span\u003e\u003c/button\u003e\u003cstrong\u003eOutput is truncated\u003c/strong\u003e to 102400 bytes. 
Learn more about \u003cstrong\u003eZEPPELIN_INTERPRETER_OUTPUT_LIMIT\u003c/strong\u003e\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587039847609_-1594440154", + "id": "20200416-122407_1820825261", + "dateCreated": "2020-04-16 12:24:07.609", + "dateStarted": "2020-04-16 12:24:18.863", + "dateFinished": "2020-04-16 12:24:41.675", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nload data to dcefs\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:25:05.608", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eload data to dcefs\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587039875645_416561962", + "id": "20200416-122435_372176047", + "dateCreated": "2020-04-16 12:24:35.645", + "dateStarted": "2020-04-16 12:25:05.608", + "dateFinished": "2020-04-16 12:25:05.615", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sh\n\n#remove existing copies of dataset from HDFS\nhadoop fs -rm -r -f /tmp/airflightsdelays\nhadoop fs -mkdir /tmp/airflightsdelays\n\n#put data into HDFS\nhadoop fs -put /tmp/flights_200*.bz2 /tmp/airflightsdelays/\nhadoop fs -put /tmp/weather_200*.gz /tmp/airflightsdelays/\nhadoop fs -ls -h /tmp/airflightsdelays/\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:25:16.469", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + 
"completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "jobName": "paragraph_1587039858850_149173217", + "id": "20200416-122418_1975970370", + "dateCreated": "2020-04-16 12:24:18.850", + "status": "READY", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nDeclare dependencies/libraries\n\nin spark conf (spark interpreter settings) specify maven dependency for joda-time:joda-time:2.9.1\nspark.jars.packages\tjoda-time:joda-time:2.9.1\n\nFor zeppelin 0.9\n%dep\n\nz.reset()\nz.load(\"joda-time:joda-time:2.9.1\")\n\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 13:19:09.465", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eDeclare dependencies/libraries\u003c/p\u003e\n\u003cp\u003ein spark conf (spark interpreter settings) specify maven dependency for joda-time:joda-time:2.9.1\u003cbr/\u003espark.jars.packages joda-time:joda-time:2.9.1\u003c/p\u003e\n\u003cp\u003eFor zeppelin 0.9\u003cbr/\u003e%dep\u003c/p\u003e\n\u003cp\u003ez.reset()\u003cbr/\u003ez.load(\u0026ldquo;joda-time:joda-time:2.9.1\u0026rdquo;)\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587039916469_344537130", + "id": "20200416-122516_419460393", + "dateCreated": "2020-04-16 12:25:16.469", + "dateStarted": "2020-04-16 13:19:09.464", + "dateFinished": "2020-04-16 13:19:12.368", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nData Science with Hadoop - Predicting airline delays - Spark and 
ML-Lib\nIntroduction\nIn this demo, we demonstrate how to build a predictive model with Hadoop, this time we\u0027ll use Apache Spark and ML-Lib.\n\nWe will show how to use Apache Spark via its Scala API to generate our feature matrix and also use ML-Lib (Spark\u0027s machine learning library) to build and evaluate our classification models.\n\nRecall from part 1 that we are constructing a predictive model for flight delays. Our source dataset resides here, and includes details about flights in the US from the years 1987-2008. We have also enriched the data with weather information, where we find daily temperatures (min/max), wind speed, snow conditions and precipitation.\n\nWe will build a supervised learning model to predict flight delays for flights leaving O\u0027Hare International airport (ORD). We will use the year 2007 data to build the model, and test its validity using data from 2008.\n\nPre-processing with Hadoop and Spark\nApache Spark\u0027s basic data abstraction is that of an RDD (resilient distributed dataset), which is a fault-tolerant collection of elements that can be operated on in parallel across your Hadoop cluster.\n\nSpark\u0027s API (available in Scala, Python or Java) supports a variety of transformations such as map() and flatMap(), filter(), join(), and others to create and manipulate RDDs. For a full description of the API please check the Spark API programming guide.\n\nSimilar to the Scikit-learn demo, in our first iteration we generate the following features for each flight:\n\nmonth: winter months should have more delays than summer months\nday of month: this is likely not a very predictive variable, but let\u0027s keep it in anyway\nday of week: weekend vs. 
weekday\nhour of the day: later hours tend to have more delays\nDistance: interesting to see if this variable is a good predictor of delay\nDays from nearest holiday: number of days from the nearest US holiday\nWe will use Spark RDDs to perform the same pre-processing, transforming the raw flight delay dataset into the two feature matrices: data_2007 (our training set) and data_2008 (our test set).\n\nThe case class DelayRec that encapsulates a flight delay record represents the feature vector, and its methods do most of the heavy lifting:\n\nto_date() is a helper method to convert year/month/day to a string\ngen_features(row) takes a row of inputs and generates a key/value tuple where the key is the date string (output of to_date) and the value is the feature value. We don\u0027t use the key in this iteration, but we will use it in the second iteration to join with the weather data.\nthe get_hour() method extracts the 2-digit hour portion of the departure time\nThe days_from_nearest_holiday() method computes the minimum distance (in days) of the provided year/month/date from any holiday in the list holidays.\nWith DelayRec in place, our processing takes on the following steps (in the function prepFlightDelays):\n\nWe read the raw input file with Spark\u0027s SparkContext.textFile method, resulting in an RDD\nEach row is parsed with CSVReader into fields, and populated into a DelayRec object\nWe then perform a sequence of RDD transformations on the input RDD to make sure we only have rows that correspond to flights that did not get cancelled and originated from ORD.\nFinally, we use the gen_features method to generate the final feature vector per row, as a set of doubles.\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:52:45.770", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": 
"HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eData Science with Hadoop - Predicting airline delays - Spark and ML-Lib\u003cbr/\u003eIntroduction\u003cbr/\u003eIn this demo, we demonstrate how to build a predictive model with Hadoop, this time we\u0026rsquo;ll use Apache Spark and ML-Lib.\u003c/p\u003e\n\u003cp\u003eWe will show how to use Apache Spark via its Scala API to generate our feature matrix and also use ML-Lib (Spark\u0026rsquo;s machine learning library) to build and evaluate our classification models.\u003c/p\u003e\n\u003cp\u003eRecall from part 1 that we are constructing a predictive model for flight delays. Our source dataset resides here, and includes details about flights in the US from the years 1987-2008. We have also enriched the data with weather information, where we find daily temperatures (min/max), wind speed, snow conditions and precipitation.\u003c/p\u003e\n\u003cp\u003eWe will build a supervised learning model to predict flight delays for flights leaving O\u0026rsquo;Hare International airport (ORD). We will use the year 2007 data to build the model, and test its validity using data from 2008.\u003c/p\u003e\n\u003cp\u003ePre-processing with Hadoop and Spark\u003cbr/\u003eApache Spark\u0026rsquo;s basic data abstraction is that of an RDD (resilient distributed dataset), which is a fault-tolerant collection of elements that can be operated on in parallel across your Hadoop cluster.\u003c/p\u003e\n\u003cp\u003eSpark\u0026rsquo;s API (available in Scala, Python or Java) supports a variety of transformations such as map() and flatMap(), filter(), join(), and others to create and manipulate RDDs. 
For a full description of the API please check the Spark API programming guide.\u003c/p\u003e\n\u003cp\u003eSimilar to the Scikit-learn demo, in our first iteration we generate the following features for each flight:\u003c/p\u003e\n\u003cp\u003emonth: winter months should have more delays than summer months\u003cbr/\u003eday of month: this is likely not a very predictive variable, but let\u0026rsquo;s keep it in anyway\u003cbr/\u003eday of week: weekend vs. weekday\u003cbr/\u003ehour of the day: later hours tend to have more delays\u003cbr/\u003eDistance: interesting to see if this variable is a good predictor of delay\u003cbr/\u003eDays from nearest holiday: number of days from the nearest US holiday\u003cbr/\u003eWe will use Spark RDDs to perform the same pre-processing, transforming the raw flight delay dataset into the two feature matrices: data_2007 (our training set) and data_2008 (our test set).\u003c/p\u003e\n\u003cp\u003eThe case class DelayRec that encapsulates a flight delay record represents the feature vector, and its methods do most of the heavy lifting:\u003c/p\u003e\n\u003cp\u003eto_date() is a helper method to convert year/month/day to a string\u003cbr/\u003egen_features(row) takes a row of inputs and generates a key/value tuple where the key is the date string (output of to_date) and the value is the feature value. 
We don\u0026rsquo;t use the key in this iteration, but we will use it in the second iteration to join with the weather data.\u003cbr/\u003ethe get_hour() method extracts the 2-digit hour portion of the departure time\u003cbr/\u003eThe days_from_nearest_holiday() method computes the minimum distance (in days) of the provided year/month/date from any holiday in the list holidays.\u003cbr/\u003eWith DelayRec in place, our processing takes on the following steps (in the function prepFlightDelays):\u003c/p\u003e\n\u003cp\u003eWe read the raw input file with Spark\u0026rsquo;s SparkContext.textFile method, resulting in an RDD\u003cbr/\u003eEach row is parsed with CSVReader into fields, and populated into a DelayRec object\u003cbr/\u003eWe then perform a sequence of RDD transformations on the input RDD to make sure we only have rows that correspond to flights that did not get cancelled and originated from ORD.\u003cbr/\u003eFinally, we use the gen_features method to generate the final feature vector per row, as a set of doubles.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587040867791_930357852", + "id": "20200416-124107_381789886", + "dateCreated": "2020-04-16 12:41:07.791", + "dateStarted": "2020-04-16 12:52:45.821", + "dateFinished": "2020-04-16 12:52:45.878", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\n\nimport org.apache.spark.rdd._\nimport scala.collection.JavaConverters._\nimport au.com.bytecode.opencsv.CSVReader\n\nimport java.io._\nimport org.joda.time._\nimport org.joda.time.format._\nimport org.joda.time.format.DateTimeFormat\nimport org.joda.time.DateTime\nimport org.joda.time.Days\n\n\ncase class DelayRec(year: String,\n month: String,\n dayOfMonth: String,\n dayOfWeek: String,\n crsDepTime: String,\n depDelay: String,\n origin: String,\n distance: String,\n cancelled: String) {\n\n val holidays \u003d List(\"01/01/2007\", \"01/15/2007\", \"02/19/2007\", \"05/28/2007\", 
\"06/07/2007\", \"07/04/2007\",\n \"09/03/2007\", \"10/08/2007\" ,\"11/11/2007\", \"11/22/2007\", \"12/25/2007\",\n \"01/01/2008\", \"01/21/2008\", \"02/18/2008\", \"05/22/2008\", \"05/26/2008\", \"07/04/2008\",\n \"09/01/2008\", \"10/13/2008\" ,\"11/11/2008\", \"11/27/2008\", \"12/25/2008\")\n\n def gen_features: (String, Array[Double]) \u003d {\n val values \u003d Array(\n depDelay.toDouble,\n month.toDouble,\n dayOfMonth.toDouble,\n dayOfWeek.toDouble,\n get_hour(crsDepTime).toDouble,\n distance.toDouble,\n days_from_nearest_holiday(year.toInt, month.toInt, dayOfMonth.toInt)\n )\n new Tuple2(to_date(year.toInt, month.toInt, dayOfMonth.toInt), values)\n }\n\n def get_hour(depTime: String) : String \u003d \"%04d\".format(depTime.toInt).take(2)\n def to_date(year: Int, month: Int, day: Int) \u003d \"%04d%02d%02d\".format(year, month, day)\n\n def days_from_nearest_holiday(year:Int, month:Int, day:Int): Int \u003d {\n val sampleDate \u003d new DateTime(year, month, day, 0, 0)\n\n holidays.foldLeft(3000) { (r, c) \u003d\u003e\n val holiday \u003d DateTimeFormat.forPattern(\"MM/dd/yyyy\").parseDateTime(c)\n val distance \u003d Math.abs(Days.daysBetween(holiday, sampleDate).getDays)\n math.min(r, distance)\n }\n }\n }\n\n// function to do a preprocessing step for a given file\ndef prepFlightDelays(infile: String): RDD[DelayRec] \u003d {\n val data \u003d sc.textFile(infile)\n\n data.map { line \u003d\u003e\n val reader \u003d new CSVReader(new StringReader(line))\n reader.readAll().asScala.toList.map(rec \u003d\u003e DelayRec(rec(0),rec(1),rec(2),rec(3),rec(5),rec(15),rec(16),rec(18),rec(21)))\n }.map(list \u003d\u003e list(0))\n .filter(rec \u003d\u003e rec.year !\u003d \"Year\")\n .filter(rec \u003d\u003e rec.cancelled \u003d\u003d \"0\")\n .filter(rec \u003d\u003e rec.origin \u003d\u003d \"ORD\")\n}\n\nval data_2007tmp \u003d prepFlightDelays(\"/tmp/airflightsdelays/flights_2007.csv.bz2\")\nval data_2007 \u003d data_2007tmp.map(rec \u003d\u003e 
rec.gen_features._2)\nval data_2008 \u003d prepFlightDelays(\"/tmp/airflightsdelays/flights_2008.csv.bz2\").map(rec \u003d\u003e rec.gen_features._2)\n\ndata_2007tmp.toDF().createOrReplaceTempView(\"data_2007tmp\")\n\ndata_2007.take(5).map(x \u003d\u003e x mkString \",\").foreach(println)\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:05:55.678", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala", + "editorHide": false, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "import sqlContext.implicits._\nimport org.apache.spark.rdd._\nimport scala.collection.JavaConverters._\nimport au.com.bytecode.opencsv.CSVReader\nimport java.io._\nimport org.joda.time._\nimport org.joda.time.format._\nimport org.joda.time.format.DateTimeFormat\nimport org.joda.time.DateTime\nimport org.joda.time.Days\ndefined class DelayRec\nprepFlightDelays: (infile: String)org.apache.spark.rdd.RDD[DelayRec]\ndata_2007tmp: org.apache.spark.rdd.RDD[DelayRec] \u003d MapPartitionsRDD[22] at filter at \u003cconsole\u003e:99\ndata_2007: org.apache.spark.rdd.RDD[Array[Double]] \u003d MapPartitionsRDD[23] at map at \u003cconsole\u003e:103\ndata_2008: org.apache.spark.rdd.RDD[Array[Double]] \u003d MapPartitionsRDD[31] at map at \u003cconsole\u003e:104\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587041565770_-416193052", + "id": "20200416-125245_1958851948", + "dateCreated": "2020-04-16 12:52:45.771", + "dateStarted": "2020-04-16 14:05:55.846", + "dateFinished": "2020-04-16 14:06:13.215", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nLet\u0027s explore data using SQL and visualizations\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 
14:07:06.912", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eLet\u0026rsquo;s explore data using SQL and visualizations\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587041598151_-1930519993", + "id": "20200416-125318_1153961617", + "dateCreated": "2020-04-16 12:53:18.151", + "dateStarted": "2020-04-16 14:07:06.912", + "dateFinished": "2020-04-16 14:07:09.865", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\n\nselect dayofWeek, case when depDelay \u003e 15 then \u0027delayed\u0027 else \u0027ok\u0027 end , count(1)\nfrom data_2007tmp \ngroup by dayofweek , case when depDelay \u003e 15 then \u0027delayed\u0027 else \u0027ok\u0027 end ", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:07:44.590", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": { + "0": { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "dayofWeek": "string", + "CASE WHEN (CAST(depDelay AS INT) \u003e 15) THEN delayed ELSE ok END": "string", + "count(1)": "string" + }, + "updated": false + }, + "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better 
navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + } + }, + "commonSetting": {} + } + } + }, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TABLE", + "data": "dayofWeek\tCASE WHEN (CAST(depDelay AS INT) \u003e 15) THEN delayed ELSE ok END\tcount(1)\n" + }, + { + "type": "TEXT", + "data": "" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046026790_1840074011", + "id": "20200416-140706_1299203004", + "dateCreated": "2020-04-16 14:07:06.790", + "dateStarted": "2020-04-16 14:07:25.083", + "dateFinished": "2020-04-16 14:07:43.294", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\n\nselect cast( cast(crsDepTime as int) / 100 as int) as hour, case when depDelay \u003e 15 then \u0027delayed\u0027 else \u0027ok\u0027 end as delay, count(1) as count\nfrom data_2007tmp \ngroup by cast( cast(crsDepTime as int) / 100 as int), case when depDelay \u003e 15 then \u0027delayed\u0027 else \u0027ok\u0027 end", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:08:06.180", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": { + "0": { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "hour": "string", + "delay": "string", + "count": "string" + }, + "updated": false + }, + "tableOptionSpecHash": 
"[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + } + }, + "commonSetting": {} + } + } + }, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TABLE", + "data": "hour\tdelay\tcount\n" + }, + { + "type": "TEXT", + "data": "" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046044982_-136781518", + "id": "20200416-140724_65169917", + "dateCreated": "2020-04-16 14:07:24.982", + "dateStarted": "2020-04-16 14:07:48.268", + "dateFinished": "2020-04-16 14:08:04.987", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nModeling with Spark and ML-Lib\nWith the data_2007 dataset (which we\u0027ll use for training) and the data_2008 dataset (which we\u0027ll use for validation) as RDDs, we now build a predictive model using Spark\u0027s ML-Lib machine learning library.\n\nML-Lib is Spark’s scalable machine learning library, which includes various learning algorithms and utilities, including classification, regression, clustering, collaborative filtering, dimensionality reduction, and others.\n\nIf you compare ML-Lib to Scikit-learn, at the moment ML-Lib lacks a few important algorithms like Random Forest or Gradient Boosted 
Trees. Having said that, we see a strong pace of innovation from the ML-Lib community and expect more algorithms and other features to be added soon (for example, Random Forest is being actively worked on, and will likely be available in the next release).\n\nTo use ML-Lib\u0027s machine learning algorithms, first we parse our feature matrices into RDDs of LabeledPoint objects (for both the training and test datasets). LabeledPoint is ML-Lib\u0027s abstraction for a feature vector accompanied by a label. We consider flight delays of 15 minutes or more as “delays” and mark it with a label of 1.0, and under 15 minutes as “non-delay” and mark it with a label of 0.0.\n\nWe also use ML-Lib\u0027s StandardScaler class to normalize our feature values for both training and validation sets. This is important because of ML-Lib\u0027s use of Stochastic Gradient Descent, which is known to perform best if feature vectors are normalized.\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:08:13.208", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eModeling with Spark and ML-Lib\u003cbr/\u003eWith the data_2007 dataset (which we\u0026rsquo;ll use for training) and the data_2008 dataset (which we\u0026rsquo;ll use for validation) as RDDs, we now build a predictive model using Spark\u0026rsquo;s ML-Lib machine learning library.\u003c/p\u003e\n\u003cp\u003eML-Lib is Spark’s scalable machine learning library, which includes various learning algorithms and utilities, including classification, regression, clustering, collaborative 
filtering, dimensionality reduction, and others.\u003c/p\u003e\n\u003cp\u003eIf you compare ML-Lib to Scikit-learn, at the moment ML-Lib lacks a few important algorithms like Random Forest or Gradient Boosted Trees. Having said that, we see a strong pace of innovation from the ML-Lib community and expect more algorithms and other features to be added soon (for example, Random Forest is being actively worked on, and will likely be available in the next release).\u003c/p\u003e\n\u003cp\u003eTo use ML-Lib\u0026rsquo;s machine learning algorithms, first we parse our feature matrices into RDDs of LabeledPoint objects (for both the training and test datasets). LabeledPoint is ML-Lib\u0026rsquo;s abstraction for a feature vector accompanied by a label. We consider flight delays of 15 minutes or more as “delays” and mark it with a label of 1.0, and under 15 minutes as “non-delay” and mark it with a label of 0.0.\u003c/p\u003e\n\u003cp\u003eWe also use ML-Lib\u0026rsquo;s StandardScaler class to normalize our feature values for both training and validation sets. 
This is important because of ML-Lib\u0026rsquo;s use of Stochastic Gradient Descent, which is known to perform best if feature vectors are normalized.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046068206_675998847", + "id": "20200416-140748_1813541856", + "dateCreated": "2020-04-16 14:07:48.206", + "dateStarted": "2020-04-16 14:08:13.208", + "dateFinished": "2020-04-16 14:08:13.292", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\n\nimport org.apache.spark.mllib.regression.LabeledPoint\nimport org.apache.spark.mllib.linalg.Vectors\nimport org.apache.spark.mllib.feature.StandardScaler\n\ndef parseData(vals: Array[Double]): LabeledPoint \u003d {\n LabeledPoint(if (vals(0)\u003e\u003d15) 1.0 else 0.0, Vectors.dense(vals.drop(1)))\n}\n\n// Prepare training set\nval parsedTrainData \u003d data_2007.map(parseData)\nparsedTrainData.cache\nval scaler \u003d new StandardScaler(withMean \u003d true, withStd \u003d true).fit(parsedTrainData.map(x \u003d\u003e x.features))\nval scaledTrainData \u003d parsedTrainData.map(x \u003d\u003e LabeledPoint(x.label, scaler.transform(Vectors.dense(x.features.toArray))))\nscaledTrainData.cache\n\n// Prepare test/validation set\nval parsedTestData \u003d data_2008.map(parseData)\nparsedTestData.cache\nval scaledTestData \u003d parsedTestData.map(x \u003d\u003e LabeledPoint(x.label, scaler.transform(Vectors.dense(x.features.toArray))))\nscaledTestData.cache\n\nscaledTrainData.take(3).map(x \u003d\u003e (x.label, x.features)).foreach(println)", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:10:29.501", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "ERROR", + "msg": [ + { 
+ "type": "TEXT", + "data": "java.lang.IllegalArgumentException: requirement failed: Nothing has been added to this summarizer.\n at scala.Predef$.require(Predef.scala:224)\n at org.apache.spark.mllib.stat.MultivariateOnlineSummarizer.variance(MultivariateOnlineSummarizer.scala:204)\n at org.apache.spark.mllib.feature.StandardScaler.fit(StandardScaler.scala:61)\n ... 73 elided\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046093150_-546872974", + "id": "20200416-140813_976930785", + "dateCreated": "2020-04-16 14:08:13.150", + "dateStarted": "2020-04-16 14:10:29.562", + "dateFinished": "2020-04-16 14:10:31.052", + "status": "ERROR", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nNote that we use the RDD cache method to ensure that these computed RDDs (parsedTrainData, scaledTrainData, parsedTestData and scaledTestData) are cached in memory by Spark and not re-computed with each iteration of stochastic gradient descent.\n\nWe also define the Metrics class for evaluation of classification metrics: precision, recall, accuracy and the F1-measure\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:08:52.533", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eNote that we use the RDD cache method to ensure that these computed RDDs (parsedTrainData, scaledTrainData, parsedTestData and scaledTestData) are cached in memory by Spark and not re-computed with each iteration of stochastic gradient descent.\u003c/p\u003e\n\u003cp\u003eWe also define the Metrics class for evaluation of classification metrics: precision, 
recall, accuracy and the F1-measure\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046113819_-620422113", + "id": "20200416-140833_1393341660", + "dateCreated": "2020-04-16 14:08:33.820", + "dateStarted": "2020-04-16 14:08:52.533", + "dateFinished": "2020-04-16 14:08:52.546", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\n\n// Function to compute evaluation metrics\ndef eval_metrics(labelsAndPreds: RDD[(Double, Double)]) : Tuple2[Array[Double], Array[Double]] \u003d {\n val tp \u003d labelsAndPreds.filter(r \u003d\u003e r._1\u003d\u003d1 \u0026\u0026 r._2\u003d\u003d1).count.toDouble\n val tn \u003d labelsAndPreds.filter(r \u003d\u003e r._1\u003d\u003d0 \u0026\u0026 r._2\u003d\u003d0).count.toDouble\n val fp \u003d labelsAndPreds.filter(r \u003d\u003e r._1\u003d\u003d1 \u0026\u0026 r._2\u003d\u003d0).count.toDouble\n val fn \u003d labelsAndPreds.filter(r \u003d\u003e r._1\u003d\u003d0 \u0026\u0026 r._2\u003d\u003d1).count.toDouble\n\n val precision \u003d tp / (tp+fp)\n val recall \u003d tp / (tp+fn)\n val F_measure \u003d 2*precision*recall / (precision+recall)\n val accuracy \u003d (tp+tn) / (tp+tn+fp+fn)\n new Tuple2(Array(tp, tn, fp, fn), Array(precision, recall, F_measure, accuracy))\n}\n\nimport org.apache.spark.rdd._\nimport org.apache.spark.rdd.RDD\n\nclass Metrics(labelsAndPreds: RDD[(Double, Double)]) extends java.io.Serializable {\n\n private def filterCount(lftBnd:Int,rtBnd:Int):Double \u003d labelsAndPreds\n .map(x \u003d\u003e (x._1.toInt, x._2.toInt))\n .filter(_ \u003d\u003d (lftBnd,rtBnd)).count()\n\n lazy val tp \u003d filterCount(1,1) // true positives\n lazy val tn \u003d filterCount(0,0) // true negatives\n lazy val fp \u003d filterCount(0,1) // false positives\n lazy val fn \u003d filterCount(1,0) // false negatives\n\n lazy val precision \u003d tp / (tp+fp)\n lazy val recall \u003d tp / (tp+fn)\n lazy val F1 \u003d 2*precision*recall / (precision+recall)\n lazy 
val accuracy \u003d (tp+tn) / (tp+tn+fp+fn)\n}", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:09:08.359", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "eval_metrics: (labelsAndPreds: org.apache.spark.rdd.RDD[(Double, Double)])(Array[Double], Array[Double])\nimport org.apache.spark.rdd._\nimport org.apache.spark.rdd.RDD\ndefined class Metrics\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046132460_-346818772", + "id": "20200416-140852_1310407650", + "dateCreated": "2020-04-16 14:08:52.460", + "dateStarted": "2020-04-16 14:09:08.443", + "dateFinished": "2020-04-16 14:09:08.798", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nML-Lib supports a few algorithms for supervised learning, among those are Linear Regression and Logistic Regression, Naive Bayes, Decision Tree, SVM, Random Forest and Gradient Boosted Trees. 
We will demonstrate the use of Logistic Regression, Decision Tree and Random Forest.\n\nLet\u0027s see how to build these models with ML-Lib:", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:09:25.202", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eML-Lib supports a few algorithms for supervised learning, among those are Linear Regression and Logistic Regression, Naive Bayes, Decision Tree, SVM, Random Forest and Gradient Boosted Trees. We will demonstrate the use of Logistic Regression, Decision Tree and Random Forest.\u003c/p\u003e\n\u003cp\u003eLet\u0026rsquo;s see how to build these models with ML-Lib:\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046148360_588622403", + "id": "20200416-140908_1230264462", + "dateCreated": "2020-04-16 14:09:08.360", + "dateStarted": "2020-04-16 14:09:25.204", + "dateFinished": "2020-04-16 14:09:25.218", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark \n\nimport org.apache.spark.mllib.classification.LogisticRegressionWithSGD\n\n// Build the Logistic Regression model\nval model_lr \u003d LogisticRegressionWithSGD.train(scaledTrainData, numIterations\u003d100)\n\n// Predict\nval labelsAndPreds_lr \u003d scaledTestData.map { point \u003d\u003e\n val pred \u003d model_lr.predict(point.features)\n (pred, point.label)\n}\nval m_lr \u003d eval_metrics(labelsAndPreds_lr)._2\nprintln(\"precision \u003d %.2f, recall \u003d %.2f, F1 \u003d %.2f, accuracy \u003d %.2f\".format(m_lr(0), m_lr(1), m_lr(2), 
m_lr(3)))\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:09:45.759", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "ERROR", + "msg": [ + { + "type": "TEXT", + "data": "\u003cconsole\u003e:78: error: not found: value scaledTrainData\n val model_lr \u003d LogisticRegressionWithSGD.train(scaledTrainData, numIterations\u003d100)\n ^\n\u003cconsole\u003e:81: error: not found: value scaledTestData\n val labelsAndPreds_lr \u003d scaledTestData.map { point \u003d\u003e\n ^\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046165139_169947894", + "id": "20200416-140925_1282143860", + "dateCreated": "2020-04-16 14:09:25.139", + "dateStarted": "2020-04-16 14:09:45.840", + "dateFinished": "2020-04-16 14:09:45.950", + "status": "ERROR", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nLet\u0027s inspect the feature weights from this model:\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:10:57.715", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eLet\u0026rsquo;s inspect the feature weights from this model:\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046185760_-1155606313", + "id": "20200416-140945_1552302837", + "dateCreated": "2020-04-16 14:09:45.760", + "dateStarted": 
"2020-04-16 14:10:57.715", + "dateFinished": "2020-04-16 14:10:57.724", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "println(model_lr.weights)", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:11:16.950", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "ERROR", + "msg": [ + { + "type": "TEXT", + "data": "\u003cconsole\u003e:74: error: not found: value model_lr\n println(model_lr.weights)\n ^\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046257629_288699412", + "id": "20200416-141057_211109536", + "dateCreated": "2020-04-16 14:10:57.629", + "dateStarted": "2020-04-16 14:11:17.019", + "dateFinished": "2020-04-16 14:11:17.090", + "status": "ERROR", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nLet\u0027s inspect the feature weights from this model:", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:11:37.730", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eLet\u0026rsquo;s inspect the feature weights from this model:\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046276950_-1401366004", + "id": "20200416-141116_1832536247", + "dateCreated": "2020-04-16 14:11:16.950", + "dateStarted": "2020-04-16 14:11:37.730", + "dateFinished": 
"2020-04-16 14:11:37.742", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nWe have built a model using Logistic Regression with SGD using 100 iterations, and then used it to predict flight delays over the validation set to measure performance: precision, recall, F1 and accuracy.\n\nNext, let\u0027s try the Support Vector Machine:", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:11:50.540", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eWe have built a model using Logistic Regression with SGD using 100 iterations, and then used it to predict flight delays over the validation set to measure performance: precision, recall, F1 and accuracy.\u003c/p\u003e\n\u003cp\u003eNext, let\u0026rsquo;s try the Support Vector Machine:\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046297650_1894766026", + "id": "20200416-141137_782875973", + "dateCreated": "2020-04-16 14:11:37.650", + "dateStarted": "2020-04-16 14:11:50.587", + "dateFinished": "2020-04-16 14:11:50.598", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\n\nimport org.apache.spark.mllib.classification.SVMWithSGD\n\n// Build the SVM model\nval svmAlg \u003d new SVMWithSGD()\nsvmAlg.optimizer.setNumIterations(100)\n .setRegParam(1.0)\n .setStepSize(1.0)\nval model_svm \u003d svmAlg.run(scaledTrainData)\n\n// Predict\nval labelsAndPreds_svm \u003d scaledTestData.map { point \u003d\u003e\n val pred \u003d model_svm.predict(point.features)\n (pred, point.label)\n}\nval m_svm \u003d 
eval_metrics(labelsAndPreds_svm)._2\nprintln(\"precision \u003d %.2f, recall \u003d %.2f, F1 \u003d %.2f, accuracy \u003d %.2f\".format(m_svm(0), m_svm(1), m_svm(2), m_svm(3)))", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:12:06.899", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "ERROR", + "msg": [ + { + "type": "TEXT", + "data": "\u003cconsole\u003e:82: error: not found: value scaledTrainData\n val model_svm \u003d svmAlg.run(scaledTrainData)\n ^\n\u003cconsole\u003e:85: error: not found: value scaledTestData\n val labelsAndPreds_svm \u003d scaledTestData.map { point \u003d\u003e\n ^\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046310540_-15898031", + "id": "20200416-141150_1819255675", + "dateCreated": "2020-04-16 14:11:50.540", + "dateStarted": "2020-04-16 14:12:06.976", + "dateFinished": "2020-04-16 14:12:07.068", + "status": "ERROR", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nNext, let\u0027s try a Decision Tree model:", + "user": "anonymous", + "dateUpdated": "2020-04-16 14:12:21.562", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eNext, let\u0026rsquo;s try a Decision Tree model:\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046326900_-4974505", + "id": 
"20200416-141206_1341860643", + "dateCreated": "2020-04-16 14:12:06.900", + "dateStarted": "2020-04-16 14:12:21.564", + "dateFinished": "2020-04-16 14:12:21.572", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\n\nimport org.apache.spark.mllib.tree.DecisionTree\n\n// Build the Decision Tree model\nval numClasses \u003d 2\nval categoricalFeaturesInfo \u003d Map[Int, Int]()\nval impurity \u003d \"gini\"\nval maxDepth \u003d 10\nval maxBins \u003d 100\nval model_dt \u003d DecisionTree.trainClassifier(parsedTrainData, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins)\n\n// Predict\nval labelsAndPreds_dt \u003d parsedTestData.map { point \u003d\u003e\n val pred \u003d model_dt.predict(point.features)\n (point.label, pred)\n}\nval m_dt \u003d new Metrics(labelsAndPreds_dt)\nprintln(\"precision \u003d %.2f, recall \u003d %.2f, F1 \u003d %.2f, accuracy \u003d %.2f\"\n .format(m_dt.precision, m_dt.recall, m_dt.F1, m_dt.accuracy))\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 15:27:36.526", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "ERROR", + "msg": [ + { + "type": "TEXT", + "data": "\u003cconsole\u003e:31: error: not found: value parsedTrainData\n val model_dt \u003d DecisionTree.trainClassifier(parsedTrainData, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins)\n ^\n\u003cconsole\u003e:34: error: not found: value parsedTestData\n val labelsAndPreds_dt \u003d parsedTestData.map { point \u003d\u003e\n ^\n\u003cconsole\u003e:38: error: not found: type Metrics\n val m_dt \u003d new Metrics(labelsAndPreds_dt)\n ^\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587046341499_129242998", + "id": 
"20200416-141221_1158080916", + "dateCreated": "2020-04-16 14:12:21.500", + "dateStarted": "2020-04-16 15:27:36.736", + "dateFinished": "2020-04-16 15:29:26.046", + "status": "ABORT", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nAnd finally, let\u0027s try the Random Forest model:", + "user": "anonymous", + "dateUpdated": "2020-04-16 15:32:02.627", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eAnd finally, let\u0026rsquo;s try the Random Forest model:\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587050856530_-1441545987", + "id": "20200416-152736_722925990", + "dateCreated": "2020-04-16 15:27:36.530", + "dateStarted": "2020-04-16 15:32:02.630", + "dateFinished": "2020-04-16 15:32:05.551", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\n\nimport org.apache.spark.mllib.tree.RandomForest\nimport org.apache.spark.mllib.tree.configuration.Strategy\n\nval treeStrategy \u003d Strategy.defaultStrategy(\"Classification\")\nval model_rf \u003d RandomForest.trainClassifier(parsedTrainData, treeStrategy, \n numTrees \u003d 100, featureSubsetStrategy \u003d \"auto\", seed \u003d 125)\n\n// Predict\nval labelsAndPreds_rf \u003d parsedTestData.map { point \u003d\u003e\n val pred \u003d model_rf.predict(point.features)\n (point.label, pred)\n}\nval m_rf \u003d new Metrics(labelsAndPreds_rf)\nprintln(\"precision \u003d %.2f, recall \u003d %.2f, F1 \u003d %.2f, accuracy \u003d %.2f\"\n .format(m_rf.precision, m_rf.recall, m_rf.F1, m_rf.accuracy))", + 
"user": "anonymous", + "dateUpdated": "2020-04-16 15:32:05.001", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "ERROR", + "msg": [ + { + "type": "TEXT", + "data": "\u003cconsole\u003e:27: error: not found: value parsedTrainData\n val model_rf \u003d RandomForest.trainClassifier(parsedTrainData, treeStrategy,\n ^\n\u003cconsole\u003e:31: error: not found: value parsedTestData\n val labelsAndPreds_rf \u003d parsedTestData.map { point \u003d\u003e\n ^\n\u003cconsole\u003e:35: error: not found: type Metrics\n val m_rf \u003d new Metrics(labelsAndPreds_rf)\n ^\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587050965852_917165299", + "id": "20200416-152925_1537660841", + "dateCreated": "2020-04-16 15:29:25.852", + "dateStarted": "2020-04-16 15:32:05.088", + "dateFinished": "2020-04-16 15:32:05.187", + "status": "ERROR", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nAs expected, the improved feature set increased the accuracy of our model for both SVM and Decision Tree models.", + "user": "anonymous", + "dateUpdated": "2020-04-16 15:32:07.941", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eAs expected, the improved feature set increased the accuracy of our model for both SVM and Decision Tree 
models.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587050955200_2010188697", + "id": "20200416-152915_2130162375", + "dateCreated": "2020-04-16 15:29:15.200", + "dateStarted": "2020-04-16 15:32:07.942", + "dateFinished": "2020-04-16 15:32:07.968", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nSummary\nIn this Zeppelin notebook we have demonstrated how to build a predictive model in Scala with Apache Hadoop, Apache Spark and its machine learning library: ML-Lib.\n\nWe have used Apache Spark on our HDP cluster to perform various types of data pre-processing and feature engineering tasks. We then applied a few ML-Lib machine learning algorithms such as support vector machines and decision trees to the resulting datasets and showed how through iterations we continuously add new and improved features resulting in better model performance.", + "user": "anonymous", + "dateUpdated": "2020-04-16 15:32:09.918", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eSummary\u003cbr/\u003eIn this Zeppelin notebook we have demonstrated how to build a predictive model in Scala with Apache Hadoop, Apache Spark and its machine learning library: ML-Lib.\u003c/p\u003e\n\u003cp\u003eWe have used Apache Spark on our HDP cluster to perform various types of data pre-processing and feature engineering tasks. 
We then applied a few ML-Lib machine learning algorithms such as support vector machines and decision trees to the resulting datasets and showed how through iterations we continuously add new and improved features resulting in better model performance.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587050955005_202030383", + "id": "20200416-152915_1181988752", + "dateCreated": "2020-04-16 15:29:15.006", + "dateStarted": "2020-04-16 15:32:09.919", + "dateFinished": "2020-04-16 15:32:09.950", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + } + ], + "name": "demo-notebooks/Predicting Airline Delays", + "id": "2F6S8N3CE", + "noteParams": {}, + "noteForms": {}, + "angularObjects": { + "md:shared_process": [], + "sh:shared_process": [], + "spark:shared_process": [] + }, + "config": { + "isZeppelinNotebookCronEnable": false + }, + "info": {} +} \ No newline at end of file diff --git a/2F7WQEZD7/note.json b/2F7WQEZD7/note.json new file mode 100644 index 0000000..c33165d --- /dev/null +++ b/2F7WQEZD7/note.json @@ -0,0 +1,2065 @@ +{ + "paragraphs": [ + { + "text": "%md\n# Advertising Analytics Click Prediction\n\nThis notebook has been referenced from: https://databricks.com/blog/2018/07/19/simplify-advertising-analytics-click-prediction-with-databricks-unified-analytics-platform.html\n\nAdvertising teams want to analyze their immense stores and varieties of data requiring a scalable, extensible, and elastic platform. Advanced analytics, including but not limited to classification, clustering, recognition, prediction, and recommendations allow these organizations to gain deeper insights from their data and drive business outcomes. 
As data of various types grow in volume, Apache Spark provides an API and distributed compute engine to process data easily and in parallel, thereby decreasing time to value.\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.541", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch1\u003eAdvertising Analytics Click Prediction\u003c/h1\u003e\n\u003cp\u003eThis notebook has been referenced from: \u003ca href\u003d\"https://databricks.com/blog/2018/07/19/simplify-advertising-analytics-click-prediction-with-databricks-unified-analytics-platform.html\"\u003ehttps://databricks.com/blog/2018/07/19/simplify-advertising-analytics-click-prediction-with-databricks-unified-analytics-platform.html\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003eAdvertising teams want to analyze their immense stores and varieties of data requiring a scalable, extensible, and elastic platform. Advanced analytics, including but not limited to classification, clustering, recognition, prediction, and recommendations allow these organizations to gain deeper insights from their data and drive business outcomes. 
As data of various types grow in volume, Apache Spark provides an API and distributed compute engine to process data easily and in parallel, thereby decreasing time to value.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549540_-42455149", + "id": "20190726-062658_506161667", + "dateCreated": "2020-04-16 12:02:29.540", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nLet’s look at a concrete example with the [Click-Through Rate Prediction dataset](https://www.kaggle.com/c/avazu-ctr-prediction/data) of ad impressions and clicks from the data science website Kaggle. The goal of this workflow is to create a machine learning model that, given a new ad impression, predicts whether or not there will be a click.\n\n**To build our advanced analytics workflow, let’s focus on the three main steps:**\n\n1. ETL\n2. Data Exploration\n3. Advanced Analytics / Machine Learning\n\n### ETL and data exploration can be done using [FLUIRENGINE](http://fluirserver:8100/src/#/app/dashboard-v1) and then the data can be directly fed to the Machine Learning steps", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:13:08.109", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eLet’s look at a concrete example with the \u003ca href\u003d\"https://www.kaggle.com/c/avazu-ctr-prediction/data\"\u003eClick-Through Rate Prediction dataset\u003c/a\u003e of ad impressions and clicks from the data science website Kaggle. 
The goal of this workflow is to create a machine learning model that, given a new ad impression, predicts whether or not there will be a click.\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003eTo build our advanced analytics workflow, let’s focus on the three main steps:\u003c/strong\u003e\u003c/p\u003e\n\u003col\u003e\n \u003cli\u003eETL\u003c/li\u003e\n \u003cli\u003eData Exploration\u003c/li\u003e\n \u003cli\u003eAdvanced Analytics / Machine Learning\u003c/li\u003e\n\u003c/ol\u003e\n\u003ch3\u003eETL and data exploration can be done using \u003ca href\u003d\"http://fluirserver:8100/src/#/app/dashboard-v1\"\u003eFLUIRENGINE\u003c/a\u003e and then the data can be directly fed to the Machine Learning steps\u003c/h3\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549542_-934872459", + "id": "20190726-091241_1343319182", + "dateCreated": "2020-04-16 12:02:29.542", + "dateStarted": "2020-04-16 12:13:08.109", + "dateFinished": "2020-04-16 12:13:08.122", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n### 1. ETL: Building the ETL process for the advertising logs\nFirst, we load the data from minio/s3. Optionally we can download and copy the dataset to dcefs storage and read it into Spark.\n\n\n\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.543", + "config": { + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e1. ETL: Building the ETL process for the advertising logs\u003c/h3\u003e\n\u003cp\u003eFirst, we load the data from minio/s3. 
Optionally we can download and copy the dataset to dcefs storage and read it into Spark.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549543_1057888400", + "id": "20190726-091606_1760021641", + "dateCreated": "2020-04-16 12:02:29.543", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\ns3a configuration", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.544", + "config": { + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003es3a configuration\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549543_1589182312", + "id": "20200416-115550_762518523", + "dateCreated": "2020-04-16 12:02:29.543", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\n// configure aws credential\nsc.hadoopConfiguration.set(\"fs.s3a.access.key\", \"minio\")\nsc.hadoopConfiguration.set(\"fs.s3a.secret.key\", \"minio123\")\nsc.hadoopConfiguration.set(\"fs.s3a.endpoint\", \"http://172.26.202.30:9000\")", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.546", + "config": { + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "jobName": "paragraph_1587038549546_1407729276", + "id": "20200416-115538_1545933775", + "dateCreated": "2020-04-16 12:02:29.546", + 
"status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nOptionally you can load dataset from dcefs using the below script", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.547", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eOptionally you can load dataset from dcefs using the below script\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549547_-189181838", + "id": "20200416-115819_1321410336", + "dateCreated": "2020-04-16 12:02:29.547", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sh\n\n#copy data from kaggle to some local dir /fluir_data/notebook-data/, uncomment below commands and run\n#hadoop fs -rm -r -f /avazu\n#hadoop fs -mkdir -p /avazu\n#hadoop fs -copyFromLocal /fluir_data/notebook-data/train.gz /avazu/\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.548", + "config": { + "editorSetting": { + "language": "sh", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/sh", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038549547_-2019216164", + "id": "20200416-115845_408581632", + "dateCreated": "2020-04-16 12:02:29.547", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": 
"%md\n*A. Read train.gz file using spark code*", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.548", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e\u003cem\u003eA. Read train.gz file using spark code\u003c/em\u003e\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549548_1819187465", + "id": "20190726-070514_2130357322", + "dateCreated": "2020-04-16 12:02:29.548", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\n//load data from minio/s3\nval df \u003d spark.read.format(\"csv\").option(\"header\", \"true\").load(\"s3a://avazuctrprediction/train.gz\") //from s3\n\n//optionally if dataset is loaded to dcefs storage\n//val df \u003d spark.read.format(\"csv\").option(\"header\", \"true\").load(\"/avazu/train.gz\") //from dcefs", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.548", + "config": { + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "df: org.apache.spark.sql.DataFrame \u003d [id: string, click: string ... 
22 more fields]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549548_-1422071932", + "id": "20190726-063752_977085327", + "dateCreated": "2020-04-16 12:02:29.548", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n*B. Convert to parquet format and store it in \u0027train\u0027 dir*\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.549", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e\u003cem\u003eB. Convert to parquet format and store it in \u0026lsquo;train\u0026rsquo; dir\u003c/em\u003e\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549549_924205176", + "id": "20190726-070501_843225411", + "dateCreated": "2020-04-16 12:02:29.549", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\ndf.coalesce(4).write.mode(\"overwrite\").parquet(\"/avazu/train\")", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.549", + "config": { + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038549549_587920671", + "id": "20190726-070257_1082277310", + "dateCreated": "2020-04-16 12:02:29.549", + "status": "READY", + 
"errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n*C. Read the parquet file and create a view with name \u0027impression\u0027. With this your data is loaded*\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.550", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e\u003cem\u003eC. Read the parquet file and create a view with name \u0026lsquo;impression\u0026rsquo;. With this your data is loaded\u003c/em\u003e\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549550_96203332", + "id": "20190726-070753_1546203188", + "dateCreated": "2020-04-16 12:02:29.550", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\nimpression \u003d spark.read.parquet(\"/avazu/train/\")\nimpression.createOrReplaceTempView(\"impression\")\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.550", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038549550_-892669715", + "id": "20190726-070847_268493318", + "dateCreated": "2020-04-16 12:02:29.550", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n### 2. 
Data Exploration: Explore Advertising Logs with Spark SQL\n\n*A. Display the schema of impressions*\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.551", + "config": { + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e2. Data Exploration: Explore Advertising Logs with Spark SQL\u003c/h3\u003e\n\u003cp\u003e\u003cem\u003eA. Display the schema of impressions\u003c/em\u003e\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549551_-1263659241", + "id": "20190726-092128_1673125143", + "dateCreated": "2020-04-16 12:02:29.551", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\nimpression.show()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.552", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+--------------------+-----+--------+----+----------+--------+-----------+-------------+--------+----------+------------+---------+---------+------------+-----------+----------------+-----+---+---+----+---+---+------+---+\n| id|click| hour| C1|banner_pos| site_id|site_domain|site_category| app_id|app_domain|app_category|device_id|device_ip|device_model|device_type|device_conn_type| C14|C15|C16| C17|C18|C19| 
C20|C21|\n+--------------------+-----+--------+----+----------+--------+-----------+-------------+--------+----------+------------+---------+---------+------------+-----------+----------------+-----+---+---+----+---+---+------+---+\n| 1000009418151094273| 0|14102100|1005| 0|1fbe01fe| f3845767| 28905ebd|ecad2386| 7801e8d9| 07d7df22| a99f214a| ddd2926e| 44956a24| 1| 2|15706|320| 50|1722| 0| 35| -1| 79|\n|10000169349117863715| 0|14102100|1005| 0|1fbe01fe| f3845767| 28905ebd|ecad2386| 7801e8d9| 07d7df22| a99f214a| 96809ac8| 711ee120| 1| 0|15704|320| 50|1722| 0| 35|100084| 79|\n|10000371904215119486| 0|14102100|1005| 0|1fbe01fe| f3845767| 28905ebd|ecad2386| 7801e8d9| 07d7df22| a99f214a| b3cf8def| 8a4875bd| 1| 0|15704|320| 50|1722| 0| 35|100084| 79|\n|10000640724480838376| 0|14102100|1005| 0|1fbe01fe| f3845767| 28905ebd|ecad2386| 7801e8d9| 07d7df22| a99f214a| e8275b8f| 6332421a| 1| 0|15706|320| 50|1722| 0| 35|100084| 79|\n|10000679056417042096| 0|14102100|1005| 1|fe8cc448| 9166c161| 0569f928|ecad2386| 7801e8d9| 07d7df22| a99f214a| 9644d0bf| 779d90c2| 1| 0|18993|320| 50|2161| 0| 35| -1|157|\n|10000720757801103869| 0|14102100|1005| 0|d6137915| bb1ef334| f028772b|ecad2386| 7801e8d9| 07d7df22| a99f214a| 05241af0| 8a4875bd| 1| 0|16920|320| 50|1899| 0|431|100077|117|\n|10000724729988544911| 0|14102100|1005| 0|8fda644b| 25d4cfcd| f028772b|ecad2386| 7801e8d9| 07d7df22| a99f214a| b264c159| be6db1d7| 1| 0|20362|320| 50|2333| 0| 39| -1|157|\n|10000918755742328737| 0|14102100|1005| 1|e151e245| 7e091613| f028772b|ecad2386| 7801e8d9| 07d7df22| a99f214a| e6f67278| be74e6fe| 1| 0|20632|320| 50|2374| 3| 39| -1| 23|\n|10000949271186029916| 1|14102100|1005| 0|1fbe01fe| f3845767| 28905ebd|ecad2386| 7801e8d9| 07d7df22| a99f214a| 37e8da74| 5db079b5| 1| 2|15707|320| 50|1722| 0| 35| -1| 79|\n|10001264480619467364| 0|14102100|1002| 0|84c7ba46| c4e18dd6| 50e219e0|ecad2386| 7801e8d9| 07d7df22| c357dbff| f1ac7184| 373ecbe6| 0| 0|21689|320| 50|2496| 3|167|100191| 23|\n|10001868339616595934| 
0|14102100|1005| 1|e151e245| 7e091613| f028772b|ecad2386| 7801e8d9| 07d7df22| a99f214a| 5d877109| 8f5c9827| 1| 0|17747|320| 50|1974| 2| 39|100019| 33|\n|10001966791793526909| 0|14102100|1005| 0|1fbe01fe| f3845767| 28905ebd|ecad2386| 7801e8d9| 07d7df22| a99f214a| 6f407810| 1f0bc64f| 1| 0|15701|320| 50|1722| 0| 35| -1| 79|\n|10002028568167339219| 0|14102100|1005| 0|9e8cf15d| 0d3cb7be| f028772b|ecad2386| 7801e8d9| 07d7df22| a99f214a| 58811cdf| 8326c04b| 1| 2|20596|320| 50|2161| 0| 35|100148|157|\n|10002044883120869786| 0|14102100|1005| 0|d6137915| bb1ef334| f028772b|ecad2386| 7801e8d9| 07d7df22| a99f214a| 72aab6df| 04258293| 1| 0|19771|320| 50|2227| 0|687|100077| 48|\n|10002518649031436658| 0|14102100|1005| 0|85f751fd| c4e18dd6| 50e219e0|98fed791| d9b5648e| 0f2161f8| a99f214a| 6dec2796| aad45b01| 1| 0|20984|320| 50|2371| 0|551| -1| 46|\n|10003539039235338011| 0|14102100|1005| 0|1fbe01fe| f3845767| 28905ebd|ecad2386| 7801e8d9| 07d7df22| a99f214a| a4f47b2e| 8a4875bd| 1| 0|15699|320| 50|1722| 0| 35|100084| 79|\n|10003585669470236873| 0|14102100|1005| 0|d9750ee7| 98572c79| f028772b|ecad2386| 7801e8d9| 07d7df22| a99f214a| 9b1fe278| 128f4ba1| 1| 0|17914|320| 50|2043| 2| 39| -1| 32|\n|10004105575081229495| 0|14102100|1005| 0|1fbe01fe| f3845767| 28905ebd|ecad2386| 7801e8d9| 07d7df22| a99f214a| c26c53cf| be87996b| 1| 2|15708|320| 50|1722| 0| 35|100084| 79|\n|10004181428767727519| 0|14102100|1005| 1|0c2fe9d6| 27e3c518| 28905ebd|ecad2386| 7801e8d9| 07d7df22| a99f214a| b7a69808| 158e4944| 1| 0| 6558|320| 50| 571| 2| 39| -1| 32|\n|10004482643316086592| 0|14102100|1005| 0|85f751fd| c4e18dd6| 50e219e0|66a5f0f3| d9b5648e| cef3e649| a99f214a| fa60af6b| b4b19c97| 1| 0|21234|320| 50|2434| 3|163|100088| 61|\n+--------------------+-----+--------+----+----------+--------+-----------+-------------+--------+----------+------------+---------+---------+------------+-----------+----------------+-----+---+---+----+---+---+------+---+\nonly showing top 20 rows\n\n" + } + ] + }, + "apps": [], + 
"jobName": "paragraph_1587038549551_1543948441", + "id": "20190726-064158_493185535", + "dateCreated": "2020-04-16 12:02:29.551", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n*B. Visualize the data*\n\nWe can now explore our data with the familiar and ubiquitous SQL language. Databricks and Spark support Scala, Python, R, and SQL. The following code snippets calculates the click through rate (CTR) by banner position and hour of day.\n\nCalculate CTR by Banner Position:", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.552", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e\u003cem\u003eB. Visualize the data\u003c/em\u003e\u003c/p\u003e\n\u003cp\u003eWe can now explore our data with the familiar and ubiquitous SQL language. Databricks and Spark support Scala, Python, R, and SQL. 
The following code snippets calculates the click through rate (CTR) by banner position and hour of day.\u003c/p\u003e\n\u003cp\u003eCalculate CTR by Banner Position:\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549552_581966044", + "id": "20190726-063331_980710091", + "dateCreated": "2020-04-16 12:02:29.552", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\nselect banner_pos,\nsum(case when click \u003d 1 then 1 else 0 end) / (count(1) * 1.0) as CTR\nfrom impression \ngroup by 1 \norder by 1", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.553", + "config": { + "editorSetting": { + "language": "sql", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/sql", + "fontSize": 9.0, + "results": { + "0": { + "graph": { + "mode": "multiBarChart", + "height": 300.0, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": { + "columns": [ + { + "name": "banner_pos", + "visible": true, + "width": "*", + "sort": {}, + "filters": [ + {} + ], + "pinned": "" + }, + { + "name": "CTR", + "visible": true, + "width": "*", + "sort": {}, + "filters": [ + {} + ], + "pinned": "" + } + ], + "scrollFocus": {}, + "selection": [], + "grouping": { + "grouping": [], + "aggregations": [], + "rowExpandedStates": {} + }, + "treeView": {}, + "pagination": { + "paginationCurrentPage": 1.0, + "paginationPageSize": 250.0 + } + }, + "tableColumnTypeState": { + "updated": false, + "names": { + "banner_pos": "string", + "CTR": "string" + } + }, + "updated": false, + "initialized": false, + "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for 
better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + } + }, + "stackedAreaChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + }, + "multiBarChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default", + "stacked": false + }, + "lineChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + } + }, + "commonSetting": {}, + "keys": [ + { + "name": "banner_pos", + "index": 0.0, + "aggr": "sum" + } + ], + "groups": [], + "values": [ + { + "name": "CTR", + "index": 1.0, + "aggr": "sum" + } + ] + }, + "helium": {} + } + }, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TABLE", + "data": "banner_pos\tCTR\n0\t0.15703298819099778\n1\t0.19508511565607623\n2\t0.11542857142857143\n3\t0E-17\n4\t0.16793893129770992\n5\t0.08945935433683392\n7\t0.07326007326007326\n" + }, + { + "type": "TEXT", + "data": "" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549552_-1128505266", + "id": "20190725-065233_431493583", + "dateCreated": "2020-04-16 12:02:29.552", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nCalculate CTR by Hour of the day:\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.553", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", 
+ "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eCalculate CTR by Hour of the day:\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549553_589659427", + "id": "20190726-063457_928980557", + "dateCreated": "2020-04-16 12:02:29.553", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\nselect substr(hour, 7) as hour,\nsum(case when click \u003d 1 then 1 else 0 end) / (count(1) * 1.0) as CTR\nfrom impression \ngroup by 1 \norder by 1\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.554", + "config": { + "editorSetting": { + "language": "sql", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/sql", + "fontSize": 9.0, + "results": { + "0": { + "graph": { + "mode": "lineChart", + "height": 300.0, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "hour": "string", + "CTR": "string" + }, + "updated": false + }, + "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + }, + "lineChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + }, + "stackedAreaChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + }, + "multiBarChart": { + "rotate": { + "degree": 
"-45" + }, + "xLabelStatus": "default" + } + }, + "commonSetting": {}, + "keys": [ + { + "name": "hour", + "index": 0.0, + "aggr": "sum" + } + ], + "groups": [], + "values": [ + { + "name": "CTR", + "index": 1.0, + "aggr": "sum" + } + ] + }, + "helium": {} + } + }, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TABLE", + "data": "hour\tCTR\n00\t0.17471387997243836\n01\t0.17369508592715473\n02\t0.15069575988933393\n03\t0.16979131649039332\n04\t0.15120641000940648\n05\t0.14884460694698355\n06\t0.16125896879692975\n07\t0.16975218693714138\n08\t0.16994460635772326\n09\t0.16370384164006981\n10\t0.17844501769752235\n11\t0.20195143055571368\n12\t0.19962525223407322\n" + }, + { + "type": "TEXT", + "data": "" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549553_-659902637", + "id": "20190725-070516_1747352060", + "dateCreated": "2020-04-16 12:02:29.553", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\nselect * from impression limit 10", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.554", + "config": { + "editorSetting": { + "language": "sql", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/sql", + "fontSize": 9.0, + "results": { + "0": { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "id": "string", + "click": "string", + "hour": "string", + "C1": "string", + "banner_pos": "string", + "site_id": "string", + "site_domain": "string", + "site_category": "string", + "app_id": "string", + "app_domain": "string", + "app_category": "string", + "device_id": "string", + "device_ip": "string", + "device_model": "string", + "device_type": "string", + "device_conn_type": "string", + "C14": "string", + "C15": 
"string", + "C16": "string", + "C17": "string", + "C18": "string", + "C19": "string", + "C20": "string", + "C21": "string" + }, + "updated": false + }, + "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + }, + "stackedAreaChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + }, + "lineChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + }, + "multiBarChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + } + }, + "commonSetting": {}, + "keys": [ + { + "name": "id", + "index": 0.0, + "aggr": "sum" + } + ], + "groups": [], + "values": [ + { + "name": "click", + "index": 1.0, + "aggr": "sum" + } + ] + }, + "helium": {} + } + }, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TABLE", + "data": 
"id\tclick\thour\tC1\tbanner_pos\tsite_id\tsite_domain\tsite_category\tapp_id\tapp_domain\tapp_category\tdevice_id\tdevice_ip\tdevice_model\tdevice_type\tdevice_conn_type\tC14\tC15\tC16\tC17\tC18\tC19\tC20\tC21\n1000009418151094273\t0\t14102100\t1005\t0\t1fbe01fe\tf3845767\t28905ebd\tecad2386\t7801e8d9\t07d7df22\ta99f214a\tddd2926e\t44956a24\t1\t2\t15706\t320\t50\t1722\t0\t35\t-1\t79\n10000169349117863715\t0\t14102100\t1005\t0\t1fbe01fe\tf3845767\t28905ebd\tecad2386\t7801e8d9\t07d7df22\ta99f214a\t96809ac8\t711ee120\t1\t0\t15704\t320\t50\t1722\t0\t35\t100084\t79\n10000371904215119486\t0\t14102100\t1005\t0\t1fbe01fe\tf3845767\t28905ebd\tecad2386\t7801e8d9\t07d7df22\ta99f214a\tb3cf8def\t8a4875bd\t1\t0\t15704\t320\t50\t1722\t0\t35\t100084\t79\n10000640724480838376\t0\t14102100\t1005\t0\t1fbe01fe\tf3845767\t28905ebd\tecad2386\t7801e8d9\t07d7df22\ta99f214a\te8275b8f\t6332421a\t1\t0\t15706\t320\t50\t1722\t0\t35\t100084\t79\n10000679056417042096\t0\t14102100\t1005\t1\tfe8cc448\t9166c161\t0569f928\tecad2386\t7801e8d9\t07d7df22\ta99f214a\t9644d0bf\t779d90c2\t1\t0\t18993\t320\t50\t2161\t0\t35\t-1\t157\n10000720757801103869\t0\t14102100\t1005\t0\td6137915\tbb1ef334\tf028772b\tecad2386\t7801e8d9\t07d7df22\ta99f214a\t05241af0\t8a4875bd\t1\t0\t16920\t320\t50\t1899\t0\t431\t100077\t117\n10000724729988544911\t0\t14102100\t1005\t0\t8fda644b\t25d4cfcd\tf028772b\tecad2386\t7801e8d9\t07d7df22\ta99f214a\tb264c159\tbe6db1d7\t1\t0\t20362\t320\t50\t2333\t0\t39\t-1\t157\n10000918755742328737\t0\t14102100\t1005\t1\te151e245\t7e091613\tf028772b\tecad2386\t7801e8d9\t07d7df22\ta99f214a\te6f67278\tbe74e6fe\t1\t0\t20632\t320\t50\t2374\t3\t39\t-1\t23\n10000949271186029916\t1\t14102100\t1005\t0\t1fbe01fe\tf3845767\t28905ebd\tecad2386\t7801e8d9\t07d7df22\ta99f214a\t37e8da74\t5db079b5\t1\t2\t15707\t320\t50\t1722\t0\t35\t-1\t79\n10001264480619467364\t0\t14102100\t1002\t0\t84c7ba46\tc4e18dd6\t50e219e0\tecad2386\t7801e8d9\t07d7df22\tc357dbff\tf1ac7184\t373ecbe6\t0\t0\t21689\t320\t50\t2496\t3\t167\t100191
\t23\n" + }, + { + "type": "TEXT", + "data": "" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549554_-752598850", + "id": "20190725-074349_977717830", + "dateCreated": "2020-04-16 12:02:29.554", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\ndescribe impression", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.555", + "config": { + "editorSetting": { + "language": "sql", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/sql", + "fontSize": 9.0, + "results": { + "0": { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "col_name": "string", + "data_type": "string", + "comment": "string" + }, + "updated": false + }, + "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + }, + "multiBarChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + } + }, + "commonSetting": {}, + "keys": [ + { + "name": "col_name", + "index": 0.0, + "aggr": "sum" + } + ], + "groups": [], + "values": [ + { + "name": "data_type", + "index": 1.0, + "aggr": "sum" + } + ] + }, + "helium": {} + } + }, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": 
"SUCCESS", + "msg": [ + { + "type": "TABLE", + "data": "col_name\tdata_type\tcomment\nid\tstring\tnull\nclick\tstring\tnull\nhour\tstring\tnull\nC1\tstring\tnull\nbanner_pos\tstring\tnull\nsite_id\tstring\tnull\nsite_domain\tstring\tnull\nsite_category\tstring\tnull\napp_id\tstring\tnull\napp_domain\tstring\tnull\napp_category\tstring\tnull\ndevice_id\tstring\tnull\ndevice_ip\tstring\tnull\ndevice_model\tstring\tnull\ndevice_type\tstring\tnull\ndevice_conn_type\tstring\tnull\nC14\tstring\tnull\nC15\tstring\tnull\nC16\tstring\tnull\nC17\tstring\tnull\nC18\tstring\tnull\nC19\tstring\tnull\nC20\tstring\tnull\nC21\tstring\tnull\n" + }, + { + "type": "TEXT", + "data": "" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549555_-1813233548", + "id": "20190725-112914_1439701002", + "dateCreated": "2020-04-16 12:02:29.555", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n### 3. Advance Analytics/Machine Learning\n\nOnce we have familiarized ourselves with our data, we can proceed to the machine learning phase, where we convert our data into features for input to a machine learning algorithm and produce a trained model with which we can predict. 
Because Spark MLlib algorithms take a column of feature vectors of doubles as input, a typical feature engineering workflow includes:\n\n - Identifying numeric and categorical features\n - String indexing\n - Assembling them all into a sparse vector \n \nThe following code snippet is an example of a feature engineering workflow.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.556", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e3. Advance Analytics/Machine Learning\u003c/h3\u003e\n\u003cp\u003eOnce we have familiarized ourselves with our data, we can proceed to the machine learning phase, where we convert our data into features for input to a machine learning algorithm and produce a trained model with which we can predict. 
Because Spark MLlib algorithms take a column of feature vectors of doubles as input, a typical feature engineering workflow includes:\u003c/p\u003e\n\u003cul\u003e\n \u003cli\u003eIdentifying numeric and categorical features\u003c/li\u003e\n \u003cli\u003eString indexing\u003c/li\u003e\n \u003cli\u003eAssembling them all into a sparse vector\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eThe following code snippet is an example of a feature engineering workflow.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549555_1759990371", + "id": "20190726-093032_2055747866", + "dateCreated": "2020-04-16 12:02:29.555", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark.pyspark\nfrom pyspark.sql.functions import *\n# impression \u003d spark.sql(\"select * from fluir.impression\")\nstrCols \u003d map(lambda t: t[0], filter(lambda t: t[1] \u003d\u003d \u0027string\u0027, impression.dtypes))\nintCols \u003d map(lambda t: t[0], filter(lambda t: t[1] \u003d\u003d \u0027int\u0027, impression.dtypes))\n# [row_idx][json_idx]\\n\nstrColsCount \u003d sorted(map(lambda c: (c, impression.select(countDistinct(c)).collect()[0][0]), strCols), key\u003dlambda x: x[1], reverse\u003dTrue)\nintColsCount \u003d sorted(map(lambda c: (c, impression.select(countDistinct(c)).collect()[0][0]), intCols), key\u003dlambda x: x[1], reverse\u003dTrue)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.557", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038549556_851812731", + "id": "20190725-130245_1975873297", + "dateCreated": "2020-04-16 
12:02:29.556", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nfrom pyspark.ml.feature import StringIndexer, VectorAssembler\n\n\n# All of the columns (string or integer) are categorical columns\nmaxBins \u003d 70\nprint(strColsCount)\ncategorical \u003d map(lambda c: c[0], filter(lambda c: c[1] \u003c\u003d maxBins, strColsCount))\ncategorical +\u003d map(lambda c: c[0], filter(lambda c: c[1] \u003c\u003d maxBins, intColsCount))\n\nprint(categorical)\n#remove \u0027click\u0027 which we are trying to predict\ncategorical.remove(\u0027click\u0027)\n\n# Apply string indexer to all of the categorical columns\n# And add _idx to the column name to indicate the index of the\n# categorical value\nstringIndexers \u003d map(lambda c: StringIndexer(inputCol \u003d c, outputCol \u003d c + \"_idx\"), categorical)\n\n# Assemble the put as the input to the VectorAssembler \n# with the output being our features\nassemblerInputs \u003d map(lambda c: c + \"_idx\", categorical)\nvectorAssembler \u003d VectorAssembler(inputCols \u003d assemblerInputs, outputCol \u003d \"features\")\n\n# The [click] column is our label \nlabelStringIndexer \u003d StringIndexer(inputCol \u003d \"click\", outputCol \u003d \"label\")\n\n# The stages of our ML pipeline \nstages \u003d stringIndexers + [vectorAssembler, labelStringIndexer]", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.557", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "[(\u0027id\u0027, 2499999), (\u0027device_ip\u0027, 682727), (\u0027device_id\u0027, 202503), (\u0027device_model\u0027, 5546), (\u0027app_id\u0027, 3384), 
(\u0027site_domain\u0027, 2890), (\u0027site_id\u0027, 2604), (\u0027C14\u0027, 719), (\u0027app_domain\u0027, 229), (\u0027C17\u0027, 183), (\u0027C20\u0027, 166), (\u0027C19\u0027, 44), (\u0027C21\u0027, 38), (\u0027app_category\u0027, 28), (\u0027site_category\u0027, 22), (\u0027hour\u0027, 13), (\u0027C16\u0027, 9), (\u0027C15\u0027, 8), (\u0027C1\u0027, 7), (\u0027banner_pos\u0027, 7), (\u0027device_type\u0027, 4), (\u0027device_conn_type\u0027, 4), (\u0027C18\u0027, 4), (\u0027click\u0027, 2)]\n[\u0027C19\u0027, \u0027C21\u0027, \u0027app_category\u0027, \u0027site_category\u0027, \u0027hour\u0027, \u0027C16\u0027, \u0027C15\u0027, \u0027C1\u0027, \u0027banner_pos\u0027, \u0027device_type\u0027, \u0027device_conn_type\u0027, \u0027C18\u0027, \u0027click\u0027]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549557_1185151469", + "id": "20190725-074448_1657879985", + "dateCreated": "2020-04-16 12:02:29.557", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nIn our use of GBTClassifier, you may have noticed that while we use StringIndexer, we are not applying One Hot Encoder (OHE). When using StringIndexer, categorical features are kept as k-ary categorical features. A tree node will test if feature X has a value in {subset of categories}. With both StringIndexer + OHE: Your categorical features are turned into a bunch of binary features. A tree node will test if feature X \u003d category a vs. all the other categories (one vs. rest test).\n\nWhen using only StringIndexer, the benefits include:\n\n - There are fewer features to choose from\n - Each node’s test is more expressive than with binary 1-vs-rest features\n\nTherefore, for tree-based methods, it is preferable to not use OHE as it is a less expressive test and it takes up more space. 
But for non-tree-based algorithms such as linear regression, you must use OHE or else the model will impose a false and misleading ordering on categories.\n\nWith our workflow created, we can create our ML pipeline.\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.558", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eIn our use of GBTClassifier, you may have noticed that while we use StringIndexer, we are not applying One Hot Encoder (OHE). When using StringIndexer, categorical features are kept as k-ary categorical features. A tree node will test if feature X has a value in {subset of categories}. With both StringIndexer + OHE: Your categorical features are turned into a bunch of binary features. A tree node will test if feature X \u003d category a vs. all the other categories (one vs. rest test).\u003c/p\u003e\n\u003cp\u003eWhen using only StringIndexer, the benefits include:\u003c/p\u003e\n\u003cul\u003e\n \u003cli\u003eThere are fewer features to choose from\u003c/li\u003e\n \u003cli\u003eEach node’s test is more expressive than with binary 1-vs-rest features\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eTherefore, for tree-based methods, it is preferable to not use OHE as it is a less expressive test and it takes up more space. 
But for non-tree-based algorithms such as linear regression, you must use OHE or else the model will impose a false and misleading ordering on categories.\u003c/p\u003e\n\u003cp\u003eWith our workflow created, we can create our ML pipeline.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549558_-1753534312", + "id": "20190726-100031_1413395870", + "dateCreated": "2020-04-16 12:02:29.558", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\nfrom pyspark.ml import Pipeline\n# Create our pipeline\npipeline \u003d Pipeline(stages \u003d stages)\n\n# create transformer to add features\nfeaturizer \u003d pipeline.fit(impression)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.559", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038549559_1484234233", + "id": "20190726-060055_1599209480", + "dateCreated": "2020-04-16 12:02:29.559", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n#dataframe with feature and intermediate transformation columns appended\nfeaturizedImpressions \u003d featurizer.transform(impression)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.560", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": 
[], + "jobName": "paragraph_1587038549560_1516562422", + "id": "20190726-060411_413282270", + "dateCreated": "2020-04-16 12:02:29.560", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\nfeaturizedImpressions.schema", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.561", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "StructType(List(StructField(id,StringType,true),StructField(click,StringType,true),StructField(hour,StringType,true),StructField(C1,StringType,true),StructField(banner_pos,StringType,true),StructField(site_id,StringType,true),StructField(site_domain,StringType,true),StructField(site_category,StringType,true),StructField(app_id,StringType,true),StructField(app_domain,StringType,true),StructField(app_category,StringType,true),StructField(device_id,StringType,true),StructField(device_ip,StringType,true),StructField(device_model,StringType,true),StructField(device_type,StringType,true),StructField(device_conn_type,StringType,true),StructField(C14,StringType,true),StructField(C15,StringType,true),StructField(C16,StringType,true),StructField(C17,StringType,true),StructField(C18,StringType,true),StructField(C19,StringType,true),StructField(C20,StringType,true),StructField(C21,StringType,true),StructField(C19_idx,DoubleType,false),StructField(C21_idx,DoubleType,false),StructField(app_category_idx,DoubleType,false),StructField(site_category_idx,DoubleType,false),StructField(hour_idx,DoubleType,false),StructField(C16_idx,DoubleType,false),StructField(C15_idx,DoubleType,false),StructField(C1_idx,DoubleType,false),StructField(banner_pos_idx,DoubleType,false),St
ructField(device_type_idx,DoubleType,false),StructField(device_conn_type_idx,DoubleType,false),StructField(C18_idx,DoubleType,false),StructField(features,VectorUDT,true),StructField(label,DoubleType,false)))\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549560_-1970988006", + "id": "20190726-061549_1182099030", + "dateCreated": "2020-04-16 12:02:29.561", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\nfeaturizedImpressions.select(\u0027features\u0027, \u0027label\u0027).show()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.562", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+--------------------+-----+\n| features|label|\n+--------------------+-----+\n|(12,[3,4,10,11],[...| 0.0|\n|(12,[3,4,11],[2.0...| 0.0|\n|(12,[3,4,11],[2.0...| 0.0|\n|(12,[3,4,11],[2.0...| 0.0|\n|(12,[1,3,4,8,11],...| 0.0|\n|(12,[0,1,3,4,11],...| 0.0|\n|(12,[0,1,3,4,11],...| 0.0|\n|(12,[0,1,3,4,8],[...| 0.0|\n|(12,[3,4,10,11],[...| 1.0|\n|(12,[0,1,4,7,9],[...| 0.0|\n|(12,[0,1,3,4,8,11...| 0.0|\n|(12,[3,4,11],[2.0...| 0.0|\n|(12,[1,3,4,10,11]...| 0.0|\n|(12,[0,1,3,4,11],...| 0.0|\n|(12,[0,1,2,4,11],...| 0.0|\n|(12,[3,4,11],[2.0...| 0.0|\n|(12,[0,1,3,4,11],...| 0.0|\n|(12,[3,4,10,11],[...| 0.0|\n|(12,[0,1,3,4,8,11...| 0.0|\n|(12,[0,1,2,4],[10...| 0.0|\n+--------------------+-----+\nonly showing top 20 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549561_-1307673739", + "id": "20190726-060549_250048612", + "dateCreated": "2020-04-16 12:02:29.561", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": 
"%md\nNext, we will split our featurized dataset into training and test datasets via .randomSplit()\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.563", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eNext, we will split our featurized dataset into training and test datasets via .randomSplit()\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549562_-216719074", + "id": "20190726-101103_614284789", + "dateCreated": "2020-04-16 12:02:29.562", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\ntrain, test \u003d featurizedImpressions.select([\"label\", \"features\", \"hour\"]).randomSplit([0.7, 0.3], 42)\ntrain.cache()\ntest.cache()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.563", + "config": { + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "DataFrame[label: double, features: vector, hour: string]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549563_78625503", + "id": "20190726-060626_273745094", + "dateCreated": "2020-04-16 12:02:29.563", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nNext, we will train, 
predict, and evaluate our model using the GBTClassifier. As a side note, a good primer on solving binary classification problems with Spark MLlib is Susan Li’s [Machine Learning with PySpark and MLlib — Solving a Binary Classification Problem](https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa).\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.564", + "config": { + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eNext, we will train, predict, and evaluate our model using the GBTClassifier. As a side note, a good primer on solving binary classification problems with Spark MLlib is Susan Li’s \u003ca href\u003d\"https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa\"\u003eMachine Learning with PySpark and MLlib — Solving a Binary Classification Problem\u003c/a\u003e.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549564_-1996698271", + "id": "20190726-101131_2082278727", + "dateCreated": "2020-04-16 12:02:29.564", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\nfrom pyspark.ml.classification import GBTClassifier\n# Train our GBTClassifier model\nclassifier \u003d GBTClassifier(labelCol\u003d\"label\", featuresCol\u003d\"features\", maxBins\u003dmaxBins, maxDepth\u003d10, maxIter\u003d10)\nmodel \u003d classifier.fit(train)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.565", + "config": { + "editorSetting": { + 
"language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038549565_-324167830", + "id": "20190726-061017_27628802", + "dateCreated": "2020-04-16 12:02:29.565", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n#Execute our predictions\npredictions \u003d model.transform(test)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.566", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038549565_526017206", + "id": "20190726-061115_1571337758", + "dateCreated": "2020-04-16 12:02:29.565", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\nfrom pyspark.ml.evaluation import BinaryClassificationEvaluator\n\n# Evaluate our GBTClassifier model using BinaryClassificationEvaluator()\nev \u003d BinaryClassificationEvaluator(rawPredictionCol\u003d\"rawPrediction\", metricName\u003d\"areaUnderROC\")\nprint ev.evaluate(predictions)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.567", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + 
"results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "0.72617660768\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549566_-397167294", + "id": "20190726-061730_1699548970", + "dateCreated": "2020-04-16 12:02:29.566", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nWith our predictions, we can evaluate the model according to some evaluation metric, for example, area under the ROC curve, and view features by importance. We can also see the AUC value, which in this case is 0.72617660768.\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.567", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eWith our predictions, we can evaluate the model according to some evaluation metric, for example, area under the ROC curve, and view features by importance. 
We can also see the AUC value, which in this case is 0.72617660768.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549567_1492966675", + "id": "20190726-101227_1036166619", + "dateCreated": "2020-04-16 12:02:29.567", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\nimport json\nfeatures \u003d map(lambda c: str(json.loads(json.dumps(c))[\u0027name\u0027]), predictions.schema[\u0027features\u0027].metadata.get(\u0027ml_attr\u0027).get(\u0027attrs\u0027).values()[0])\n\n#convert numpy.float64 to str for spark.createDataFrame()\nweights\u003dmap(lambda w: \u0027%.10f\u0027 % w, model.featureImportances)\nweightedFeatures \u003d sorted(zip(weights, features), key\u003dlambda x: x[1], reverse\u003dTrue)\nspark.createDataFrame(weightedFeatures).toDF(\"weight\", \"feature\").createOrReplaceTempView(\u0027wf\u0027)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.568", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038549568_-474345802", + "id": "20190726-061809_1052967550", + "dateCreated": "2020-04-16 12:02:29.568", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\nselect feature, weight from wf order by weight desc", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.569", + "config": { + "editorSetting": { + "language": "sql", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/sql", + "fontSize": 9.0, + "results": { + "0": { + "graph": { + "mode": 
"stackedAreaChart", + "height": 300.0, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "feature": "string", + "weight": "string" + }, + "updated": false + }, + "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + }, + "multiBarChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + }, + "stackedAreaChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + }, + "lineChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + } + }, + "commonSetting": {}, + "keys": [ + { + "name": "feature", + "index": 0.0, + "aggr": "sum" + } + ], + "groups": [], + "values": [ + { + "name": "weight", + "index": 1.0, + "aggr": "sum" + } + ] + }, + "helium": {} + } + }, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TABLE", + "data": "feature\tweight\nC21_idx\t0.4177487329\nC19_idx\t0.1568063990\nsite_category_idx\t0.1128820585\nhour_idx\t0.0946347352\napp_category_idx\t0.0741270054\nbanner_pos_idx\t0.0387547628\ndevice_conn_type_idx\t0.0357779433\nC16_idx\t0.0242824873\nC15_idx\t0.0132284327\nC1_idx\t0.0120744964\nC18_idx\t0.0106200059\ndevice_type_idx\t0.0090629406\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549569_1660580900", + "id": 
"20190726-062007_1776818693", + "dateCreated": "2020-04-16 12:02:29.569", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\npredictions.createOrReplaceTempView(\"predictions\")", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.570", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038549569_-1119904000", + "id": "20190726-062042_73438190", + "dateCreated": "2020-04-16 12:02:29.569", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\nselect (count(1) * 1.0) from predictions\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.571", + "config": { + "editorSetting": { + "language": "sql", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/sql", + "fontSize": 9.0, + "results": { + "0": { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "(CAST(CAST(count(1) AS DECIMAL(20,0)) AS DECIMAL(21,1)) * CAST(1.0 AS DECIMAL(21,1)))": "string" + }, + "updated": false + }, + "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better 
navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + } + }, + "commonSetting": {} + }, + "helium": {} + } + }, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TABLE", + "data": "(CAST(CAST(count(1) AS DECIMAL(20,0)) AS DECIMAL(21,1)) * CAST(1.0 AS DECIMAL(21,1)))\n749323.0\n" + }, + { + "type": "TEXT", + "data": "" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549570_-1039854408", + "id": "20190726-062116_1473756896", + "dateCreated": "2020-04-16 12:02:29.570", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nCalculate the accuracy of the model by checking the cases where our model has actually predicted the correct output\nformula:\n\n**accuracy \u003d Number of correct predictions / total number of predictions**\n\n[Classification: Accuracy reference](https://developers.google.com/machine-learning/crash-course/classification/accuracy)\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.571", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eCalculate the accuracy of the model by checking the cases where our model has actually predicted the correct 
output\u003cbr/\u003eformula:\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003eaccuracy \u003d Number of correct predictions / total number of predictions\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e\u003ca href\u003d\"https://developers.google.com/machine-learning/crash-course/classification/accuracy\"\u003eClassification: Accuracy reference\u003c/a\u003e\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549571_-111616943", + "id": "20190726-101913_657557378", + "dateCreated": "2020-04-16 12:02:29.571", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\nselect sum(case when prediction \u003d label then 1 else 0 end) / count(*) as accuracy from predictions", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.572", + "config": { + "editorSetting": { + "language": "sql", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/sql", + "fontSize": 9.0, + "results": { + "0": { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "accuracy": "string" + }, + "updated": false + }, + "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + } + }, + "commonSetting": {} + }, + "helium": 
{} + } + }, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TABLE", + "data": "accuracy\n0.8364523710068955\n" + }, + { + "type": "TEXT", + "data": "" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549572_993036920", + "id": "20190726-062137_2123639695", + "dateCreated": "2020-04-16 12:02:29.572", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\nAccuracy value shows how accurately our model has predicted the click. In this case the accuracy is 0.8364523710068955 .\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:02:29.573", + "config": { + "tableHide": false, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true, + "completionKey": "TAB", + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "fontSize": 9.0, + "editorHide": true, + "results": {}, + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eAccuracy value shows how accurately our model has predicted the click. 
In this case the accuracy is 0.8364523710068955 .\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038549573_-1817162627", + "id": "20190726-102322_107280852", + "dateCreated": "2020-04-16 12:02:29.573", + "status": "READY", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + } + ], + "name": "demo-notebooks/ Advertising Analytics Ad Click Prediction using Spark", + "id": "2F7WQEZD7", + "noteParams": {}, + "noteForms": {}, + "angularObjects": { + "md:shared_process": [], + "sh:shared_process": [], + "spark:shared_process": [] + }, + "config": { + "isZeppelinNotebookCronEnable": false + }, + "info": {} +} \ No newline at end of file diff --git a/2F8AZJBZY/note.json b/2F8AZJBZY/note.json new file mode 100644 index 0000000..3c41a6d --- /dev/null +++ b/2F8AZJBZY/note.json @@ -0,0 +1,5492 @@ +{ + "paragraphs": [ + { + "text": "%md\n\n# Web Server Log Analysis with Apache Spark\n\n#### Please Note: This Zeppelin notebook was ported from Jupyter notebook that was part of an EDX online course created by UC Berkeley and sponsored by Databricks:\n\n##### Link to EDX Course: Introduction to Apache Spark\n\u003chttps://www.edx.org/course/introduction-apache-spark-uc-berkeleyx-cs105x\u003e\n\n### Summary\n\nThis lab will demonstrate how easy it is to perform web server log analysis with Apache Spark.\n\nServer log analysis is an ideal use case for Spark. It\u0027s a very large, common data source and contains a rich set of information. Spark allows you to store your logs in files on disk cheaply, while still providing a quick and simple way to perform data analysis on them. This homework will show you how to use Apache Spark on real-world text-based production logs and fully harness the power of that data. 
Log data comes from many sources, such as web, file, and compute servers, application logs, user-generated content, and can be used for monitoring servers, improving business and customer intelligence, building recommendation systems, fraud detection, and much more.\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:03.507", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch1\u003eWeb Server Log Analysis with Apache Spark\u003c/h1\u003e\n\u003ch4\u003ePlease Note: This Zeppelin notebook was ported from Jupyter notebook that was part of an EDX online course created by UC Berkeley and sponsored by Databricks:\u003c/h4\u003e\n\u003ch5\u003eLink to EDX Course: Introduction to Apache Spark\u003c/h5\u003e\n\u003cp\u003e\u003ca href\u003d\"https://www.edx.org/course/introduction-apache-spark-uc-berkeleyx-cs105x\"\u003ehttps://www.edx.org/course/introduction-apache-spark-uc-berkeleyx-cs105x\u003c/a\u003e\u003c/p\u003e\n\u003ch3\u003eSummary\u003c/h3\u003e\n\u003cp\u003eThis lab will demonstrate how easy it is to perform web server log analysis with Apache Spark.\u003c/p\u003e\n\u003cp\u003eServer log analysis is an ideal use case for Spark. It\u0026rsquo;s a very large, common data source and contains a rich set of information. Spark allows you to store your logs in files on disk cheaply, while still providing a quick and simple way to perform data analysis on them. This homework will show you how to use Apache Spark on real-world text-based production logs and fully harness the power of that data. 
Log data comes from many sources, such as web, file, and compute servers, application logs, user-generated content, and can be used for monitoring servers, improving business and customer intelligence, building recommendation systems, fraud detection, and much more.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857798_1404973334", + "id": "20160721-205608_398995935", + "dateCreated": "2020-04-16 12:07:37.798", + "dateStarted": "2020-04-16 12:08:03.683", + "dateFinished": "2020-04-16 12:08:03.704", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n## How to complete this lab\n\nThis lab is broken up into sections with bite-sized examples for demonstrating Spark functionality for log processing.\n\nIt consists of 5 parts:\n\n* Part 1: Introduction and Imports\n* Part 2: Exploratory Data Analysis\n* Part 3: Analysis Walk-Through on the Web Server Log File\n* Part 4: Analyzing Web Server Log File\n* Part 5: Exploring 404 Response Codes", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:03.781", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch2\u003eHow to complete this lab\u003c/h2\u003e\n\u003cp\u003eThis lab is broken up into sections with bite-sized examples for demonstrating Spark functionality for log processing.\u003c/p\u003e\n\u003cp\u003eIt consists of 5 parts:\u003c/p\u003e\n\u003cul\u003e\n \u003cli\u003ePart 1: Introduction and Imports\u003c/li\u003e\n \u003cli\u003ePart 2: Exploratory Data Analysis\u003c/li\u003e\n 
\u003cli\u003ePart 3: Analysis Walk-Through on the Web Server Log File\u003c/li\u003e\n \u003cli\u003ePart 4: Analyzing Web Server Log File\u003c/li\u003e\n \u003cli\u003ePart 5: Exploring 404 Response Codes\u003c/li\u003e\n\u003c/ul\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857809_-1925937657", + "id": "20160721-205649_1921894344", + "dateCreated": "2020-04-16 12:07:37.809", + "dateStarted": "2020-04-16 12:08:04.088", + "dateFinished": "2020-04-16 12:08:04.103", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n## Part 1: Library Imports\n\nWe can import standard Python libraries (modules) the usual way. An import statement will import the specified module. In this lab, we will provide any imports that are necessary.\nLet\u0027s import some of the libraries we\u0027ll need:\n\n* datetime: Date and time functions\n* re: The regular expression library", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:04.188", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch2\u003ePart 1: Library Imports\u003c/h2\u003e\n\u003cp\u003eWe can import standard Python libraries (modules) the usual way. An import statement will import the specified module. 
In this lab, we will provide any imports that are necessary.\u003cbr/\u003eLet\u0026rsquo;s import some of the libraries we\u0026rsquo;ll need:\u003c/p\u003e\n\u003cul\u003e\n \u003cli\u003edatetime: Date and time functions\u003c/li\u003e\n \u003cli\u003ere: The regular expression library\u003c/li\u003e\n\u003c/ul\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857810_1549329470", + "id": "20160721-210321_1736487949", + "dateCreated": "2020-04-16 12:07:37.810", + "dateStarted": "2020-04-16 12:08:04.490", + "dateFinished": "2020-04-16 12:08:04.501", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sh\n\nwget -O apache.access.log.PROJECT.gz https://www.dropbox.com/s/n13homy2ssyd1n2/apache.access.log.PROJECT.gz?dl\u003d0\n\nrm -f apache.access.log.PROJECT\ngunzip apache.access.log.PROJECT.gz", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:04.589", + "config": { + "editorSetting": { + "language": "sh", + "editOnDblClick": false, + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/sh", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "--2020-04-16 12:08:04-- https://www.dropbox.com/s/n13homy2ssyd1n2/apache.access.log.PROJECT.gz?dl\u003d0\nResolving www.dropbox.com (www.dropbox.com)... 162.125.7.1, 2620:100:601a:1::a27d:701\nConnecting to www.dropbox.com (www.dropbox.com)|162.125.7.1|:443... connected.\nHTTP request sent, awaiting response... 301 Moved Permanently\nLocation: /s/raw/n13homy2ssyd1n2/apache.access.log.PROJECT.gz [following]\n--2020-04-16 12:08:04-- https://www.dropbox.com/s/raw/n13homy2ssyd1n2/apache.access.log.PROJECT.gz\nReusing existing connection to www.dropbox.com:443.\nHTTP request sent, awaiting response... 
302 Found\nLocation: https://uc8187801adfea2438e89d37d691.dl.dropboxusercontent.com/cd/0/inline/A18DD_SrCaVkSS96OIKZPcOp6h4Kg99ZWLAPAd5sW6aXogLTs8AE32mqcrc0_gq54Sr4t3f3Zz0DOWzE3t_S9ZPsxXWc_gzp04qUVjJQzCtg7w/file# [following]\n--2020-04-16 12:08:05-- https://uc8187801adfea2438e89d37d691.dl.dropboxusercontent.com/cd/0/inline/A18DD_SrCaVkSS96OIKZPcOp6h4Kg99ZWLAPAd5sW6aXogLTs8AE32mqcrc0_gq54Sr4t3f3Zz0DOWzE3t_S9ZPsxXWc_gzp04qUVjJQzCtg7w/file\nResolving uc8187801adfea2438e89d37d691.dl.dropboxusercontent.com (uc8187801adfea2438e89d37d691.dl.dropboxusercontent.com)... 162.125.7.6, 2620:100:601a:6::a27d:706\nConnecting to uc8187801adfea2438e89d37d691.dl.dropboxusercontent.com (uc8187801adfea2438e89d37d691.dl.dropboxusercontent.com)|162.125.7.6|:443... connected.\nHTTP request sent, awaiting response... 302 FOUND\nLocation: /cd/0/inline2/A19hjBhe4UrBJuxZElG9ELohjmyvL6-fmT_q_FdojNqIUJTu7mT3fpu7qohfdZMOojIx0HaBeowwi5_ZE8bHPppdlm7ivPaHdK85ariPHq5_coYfl_BfQ20lUA30HiqT7cSAwqnSqZzatTsAMaIM8aNqsE5mMBOD4khavld9wD8Gvxy0iezfALJiZUS9WkQ13bn14wG6ykcIHkYM54rYKy2nzEwBmvoHHLmPIogKv5zRYiPzgx3e1xx-iWOuQgq7wdRCO8TerxMDiPOQ4u0g_h3BFkto6tKvYxv4KeHsgBxjxBkSVQS32TV0MMVgRUxYa3eU0cm9X4Khf5i0LG40q7qw/file [following]\n--2020-04-16 12:08:05-- https://uc8187801adfea2438e89d37d691.dl.dropboxusercontent.com/cd/0/inline2/A19hjBhe4UrBJuxZElG9ELohjmyvL6-fmT_q_FdojNqIUJTu7mT3fpu7qohfdZMOojIx0HaBeowwi5_ZE8bHPppdlm7ivPaHdK85ariPHq5_coYfl_BfQ20lUA30HiqT7cSAwqnSqZzatTsAMaIM8aNqsE5mMBOD4khavld9wD8Gvxy0iezfALJiZUS9WkQ13bn14wG6ykcIHkYM54rYKy2nzEwBmvoHHLmPIogKv5zRYiPzgx3e1xx-iWOuQgq7wdRCO8TerxMDiPOQ4u0g_h3BFkto6tKvYxv4KeHsgBxjxBkSVQS32TV0MMVgRUxYa3eU0cm9X4Khf5i0LG40q7qw/file\nReusing existing connection to uc8187801adfea2438e89d37d691.dl.dropboxusercontent.com:443.\nHTTP request sent, awaiting response... 200 OK\nLength: 11124290 (11M) [application/octet-stream]\nSaving to: ‘apache.access.log.PROJECT.gz’\n\n 0K .......... .......... .......... .......... .......... 0% 7.55M 1s\n 50K .......... .......... .......... 
.......... .......... 0% 24.4M 1s\n 100K .......... .......... .......... .......... .......... 1% 9.37M 1s\n 150K .......... .......... .......... .......... .......... 1% 39.8M 1s\n 200K .......... .......... .......... .......... .......... 2% 215M 1s\n 250K .......... .......... .......... .......... .......... 2% 80.0M 1s\n 300K .......... .......... .......... .......... .......... 3% 22.3M 1s\n 350K .......... .......... .......... .......... .......... 3% 75.4M 0s\n 400K .......... .......... .......... .......... .......... 4% 143M 0s\n 450K .......... .......... .......... .......... .......... 4% 71.2M 0s\n 500K .......... .......... .......... .......... .......... 5% 113M 0s\n 550K .......... .......... .......... .......... .......... 5% 95.1M 0s\n 600K .......... .......... .......... .......... .......... 5% 65.8M 0s\n 650K .......... .......... .......... .......... .......... 6% 133M 0s\n 700K .......... .......... .......... .......... .......... 6% 173M 0s\n 750K .......... .......... .......... .......... .......... 7% 74.9M 0s\n 800K .......... .......... .......... .......... .......... 7% 191M 0s\n 850K .......... .......... .......... .......... .......... 8% 210M 0s\n 900K .......... .......... .......... .......... .......... 8% 224M 0s\n 950K .......... .......... .......... .......... .......... 9% 191M 0s\n 1000K .......... .......... .......... .......... .......... 9% 120M 0s\n 1050K .......... .......... .......... .......... .......... 10% 167M 0s\n 1100K .......... .......... .......... .......... .......... 10% 156M 0s\n 1150K .......... .......... .......... .......... .......... 11% 129M 0s\n 1200K .......... .......... .......... .......... .......... 11% 135M 0s\n 1250K .......... .......... .......... .......... .......... 11% 125M 0s\n 1300K .......... .......... .......... .......... .......... 12% 128M 0s\n 1350K .......... .......... .......... .......... .......... 12% 69.0M 0s\n 1400K .......... .......... .......... 
.......... .......... 13% 90.8M 0s\n 1450K .......... .......... .......... .......... .......... 13% 93.5M 0s\n 1500K .......... .......... .......... .......... .......... 14% 75.9M 0s\n 1550K .......... .......... .......... .......... .......... 14% 99.5M 0s\n 1600K .......... .......... .......... .......... .......... 15% 137M 0s\n 1650K .......... .......... .......... .......... .......... 15% 108M 0s\n 1700K .......... .......... .......... .......... .......... 16% 87.7M 0s\n 1750K .......... .......... .......... .......... .......... 16% 69.1M 0s\n 1800K .......... .......... .......... .......... .......... 17% 88.2M 0s\n 1850K .......... .......... .......... .......... .......... 17% 127M 0s\n 1900K .......... .......... .......... .......... .......... 17% 142M 0s\n 1950K .......... .......... .......... .......... .......... 18% 97.1M 0s\n 2000K .......... .......... .......... .......... .......... 18% 103M 0s\n 2050K .......... .......... .......... .......... .......... 19% 94.2M 0s\n 2100K .......... .......... .......... .......... .......... 19% 104M 0s\n 2150K .......... .......... .......... .......... .......... 20% 76.6M 0s\n 2200K .......... .......... .......... .......... .......... 20% 102M 0s\n 2250K .......... .......... .......... .......... .......... 21% 104M 0s\n 2300K .......... .......... .......... .......... .......... 21% 111M 0s\n 2350K .......... .......... .......... .......... .......... 22% 93.7M 0s\n 2400K .......... .......... .......... .......... .......... 22% 127M 0s\n 2450K .......... .......... .......... .......... .......... 23% 84.9M 0s\n 2500K .......... .......... .......... .......... .......... 23% 123M 0s\n 2550K .......... .......... .......... .......... .......... 23% 166M 0s\n 2600K .......... .......... .......... .......... .......... 24% 172M 0s\n 2650K .......... .......... .......... .......... .......... 24% 198M 0s\n 2700K .......... .......... .......... .......... .......... 
25% 133M 0s\n 2750K .......... .......... .......... .......... .......... 25% 160M 0s\n 2800K .......... .......... .......... .......... .......... 26% 188M 0s\n 2850K .......... .......... .......... .......... .......... 26% 188M 0s\n 2900K .......... .......... .......... .......... .......... 27% 195M 0s\n 2950K .......... .......... .......... .......... .......... 27% 157M 0s\n 3000K .......... .......... .......... .......... .......... 28% 96.8M 0s\n 3050K .......... .......... .......... .......... .......... 28% 98.1M 0s\n 3100K .......... .......... .......... .......... .......... 28% 114M 0s\n 3150K .......... .......... .......... .......... .......... 29% 67.0M 0s\n 3200K .......... .......... .......... .......... .......... 29% 80.9M 0s\n 3250K .......... .......... .......... .......... .......... 30% 118M 0s\n 3300K .......... .......... .......... .......... .......... 30% 110M 0s\n 3350K .......... .......... .......... .......... .......... 31% 129M 0s\n 3400K .......... .......... .......... .......... .......... 31% 121M 0s\n 3450K .......... .......... .......... .......... .......... 32% 92.1M 0s\n 3500K .......... .......... .......... .......... .......... 32% 220M 0s\n 3550K .......... .......... .......... .......... .......... 33% 94.6M 0s\n 3600K .......... .......... .......... .......... .......... 33% 124M 0s\n 3650K .......... .......... .......... .......... .......... 34% 128M 0s\n 3700K .......... .......... .......... .......... .......... 34% 181M 0s\n 3750K .......... .......... .......... .......... .......... 34% 133M 0s\n 3800K .......... .......... .......... .......... .......... 35% 145M 0s\n 3850K .......... .......... .......... .......... .......... 35% 167M 0s\n 3900K .......... .......... .......... .......... .......... 36% 141M 0s\n 3950K .......... .......... .......... .......... .......... 36% 141M 0s\n 4000K .......... .......... .......... .......... .......... 37% 125M 0s\n 4050K .......... .......... 
.......... .......... .......... 37% 156M 0s\n 4100K .......... .......... .......... .......... .......... 38% 180M 0s\n 4150K .......... .......... .......... .......... .......... 38% 126M 0s\n 4200K .......... .......... .......... .......... .......... 39% 216M 0s\n 4250K .......... .......... .......... .......... .......... 39% 144M 0s\n 4300K .......... .......... .......... .......... .......... 40% 171M 0s\n 4350K .......... .......... .......... .......... .......... 40% 115M 0s\n 4400K .......... .......... .......... .......... .......... 40% 155M 0s\n 4450K .......... .......... .......... .......... .......... 41% 185M 0s\n 4500K .......... .......... .......... .......... .......... 41% 139M 0s\n 4550K .......... .......... .......... .......... .......... 42% 125M 0s\n 4600K .......... .......... .......... .......... .......... 42% 181M 0s\n 4650K .......... .......... .......... .......... .......... 43% 114M 0s\n 4700K .......... .......... .......... .......... .......... 43% 146M 0s\n 4750K .......... .......... .......... .......... .......... 44% 107M 0s\n 4800K .......... .......... .......... .......... .......... 44% 153M 0s\n 4850K .......... .......... .......... .......... .......... 45% 126M 0s\n 4900K .......... .......... .......... .......... .......... 45% 152M 0s\n 4950K .......... .......... .......... .......... .......... 46% 111M 0s\n 5000K .......... .......... .......... .......... .......... 46% 142M 0s\n 5050K .......... .......... .......... .......... .......... 46% 143M 0s\n 5100K .......... .......... .......... .......... .......... 47% 119M 0s\n 5150K .......... .......... .......... .......... .......... 47% 123M 0s\n 5200K .......... .......... .......... .......... .......... 48% 119M 0s\n 5250K .......... .......... .......... .......... .......... 48% 123M 0s\n 5300K .......... .......... .......... .......... .......... 49% 117M 0s\n 5350K .......... .......... .......... .......... .......... 
49% 125M 0s\n 5400K .......... .......... .......... .......... .......... 50% 161M 0s\n 5450K .......... .......... .......... .......... .......... 50% 163M 0s\n 5500K .......... .......... .......... .......... .......... 51% 162M 0s\n 5550K .......... .......... .......... .......... .......... 51% 96.6M 0s\n 5600K .......... .......... .......... .......... .......... 52% 181M 0s\n 5650K .......... .......... .......... .......... .......... 52% 116M 0s\n 5700K .......... .......... .......... .......... .......... 52% 123M 0s\n 5750K .......... .......... .......... .......... .......... 53% 164M 0s\n 5800K .......... .......... .......... .......... .......... 53% 138M 0s\n 5850K .......... .......... .......... .......... .......... 54% 158M 0s\n 5900K .......... .......... .......... .......... .......... 54% 140M 0s\n 5950K .......... .......... .......... .......... .......... 55% 128M 0s\n 6000K .......... .......... .......... .......... .......... 55% 119M 0s\n 6050K .......... .......... .......... .......... .......... 56% 121M 0s\n 6100K .......... .......... .......... .......... .......... 56% 108M 0s\n 6150K .......... .......... .......... .......... .......... 57% 128M 0s\n 6200K .......... .......... .......... .......... .......... 57% 82.8M 0s\n 6250K .......... .......... .......... .......... .......... 57% 175M 0s\n 6300K .......... .......... .......... .......... .......... 58% 109M 0s\n 6350K .......... .......... .......... .......... .......... 58% 86.4M 0s\n 6400K .......... .......... .......... .......... .......... 59% 123M 0s\n 6450K .......... .......... .......... .......... .......... 59% 180M 0s\n 6500K .......... .......... .......... .......... .......... 60% 181M 0s\n 6550K .......... .......... .......... .......... .......... 60% 129M 0s\n 6600K .......... .......... .......... .......... .......... 61% 128M 0s\n 6650K .......... .......... .......... .......... .......... 61% 153M 0s\n 6700K .......... .......... 
.......... .......... .......... 62% 162M 0s\n 6750K .......... .......... .......... .......... .......... 62% 87.5M 0s\n 6800K .......... .......... .......... .......... .......... 63% 174M 0s\n 6850K .......... .......... .......... .......... .......... 63% 132M 0s\n 6900K .......... .......... .......... .......... .......... 63% 142M 0s\n 6950K .......... .......... .......... .......... .......... 64% 113M 0s\n 7000K .......... .......... .......... .......... .......... 64% 137M 0s\n 7050K .......... .......... .......... .......... .......... 65% 149M 0s\n 7100K .......... .......... .......... .......... .......... 65% 115M 0s\n 7150K .......... .......... .......... .......... .......... 66% 107M 0s\n 7200K .......... .......... .......... .......... .......... 66% 131M 0s\n 7250K .......... .......... .......... .......... .......... 67% 120M 0s\n 7300K .......... .......... .......... .......... .......... 67% 136M 0s\n 7350K .......... .......... .......... .......... .......... 68% 126M 0s\n 7400K .......... .......... .......... .......... .......... 68% 159M 0s\n 7450K .......... .......... .......... .......... .......... 69% 132M 0s\n 7500K .......... .......... .......... .......... .......... 69% 135M 0s\n 7550K .......... .......... .......... .......... .......... 69% 124M 0s\n 7600K .......... .......... .......... .......... .......... 70% 176M 0s\n 7650K .......... .......... .......... .......... .......... 70% 137M 0s\n 7700K .......... .......... .......... .......... .......... 71% 129M 0s\n 7750K .......... .......... .......... .......... .......... 71% 116M 0s\n 7800K .......... .......... .......... .......... .......... 72% 115M 0s\n 7850K .......... .......... .......... .......... .......... 72% 130M 0s\n 7900K .......... .......... .......... .......... .......... 73% 126M 0s\n 7950K .......... .......... .......... .......... .......... 73% 134M 0s\n 8000K .......... .......... .......... .......... .......... 
74% 178M 0s\n 8050K .......... .......... .......... .......... .......... 74% 158M 0s\n 8100K .......... .......... .......... .......... .......... 75% 185M 0s\n 8150K .......... .......... .......... .......... .......... 75% 164M 0s\n 8200K .......... .......... .......... .......... .......... 75% 179M 0s\n 8250K .......... .......... .......... .......... .......... 76% 189M 0s\n 8300K .......... .......... .......... .......... .......... 76% 187M 0s\n 8350K .......... .......... .......... .......... .......... 77% 170M 0s\n 8400K .......... .......... .......... .......... .......... 77% 193M 0s\n 8450K .......... .......... .......... .......... .......... 78% 190M 0s\n 8500K .......... .......... .......... .......... .......... 78% 191M 0s\n 8550K .......... .......... .......... .......... .......... 79% 162M 0s\n 8600K .......... .......... .......... .......... .......... 79% 184M 0s\n 8650K .......... .......... .......... .......... .......... 80% 166M 0s\n 8700K .......... .......... .......... .......... .......... 80% 200M 0s\n 8750K .......... .......... .......... .......... .......... 81% 150M 0s\n 8800K .......... .......... .......... .......... .......... 81% 174M 0s\n 8850K .......... .......... .......... .......... .......... 81% 193M 0s\n 8900K .......... .......... .......... .......... .......... 82% 186M 0s\n 8950K .......... .......... .......... .......... .......... 82% 166M 0s\n 9000K .......... .......... .......... .......... .......... 83% 201M 0s\n 9050K .......... .......... .......... .......... .......... 83% 188M 0s\n 9100K .......... .......... .......... .......... .......... 84% 186M 0s\n 9150K .......... .......... .......... .......... .......... 84% 156M 0s\n 9200K .......... .......... .......... .......... .......... 85% 186M 0s\n 9250K .......... .......... .......... .......... .......... 85% 208M 0s\n 9300K .......... .......... .......... .......... .......... 86% 198M 0s\n 9350K .......... .......... 
.......... .......... .......... 86% 166M 0s\n 9400K .......... .......... .......... .......... .......... 86% 201M 0s\n 9450K .......... .......... .......... .......... .......... 87% 187M 0s\n 9500K .......... .......... .......... .......... .......... 87% 185M 0s\n 9550K .......... .......... .......... .......... .......... 88% 157M 0s\n 9600K .......... .......... .......... .......... .......... 88% 187M 0s\n 9650K .......... .......... .......... .......... .......... 89% 190M 0s\n 9700K .......... .......... .......... .......... .......... 89% 203M 0s\n 9750K .......... .......... .......... .......... .......... 90% 177M 0s\n 9800K .......... .......... .......... .......... .......... 90% 189M 0s\n 9850K .......... .......... .......... .......... .......... 91% 195M 0s\n 9900K .......... .......... .......... .......... .......... 91% 185M 0s\n 9950K .......... .......... .......... .......... .......... 92% 145M 0s\n 10000K .......... .......... .......... .......... .......... 92% 187M 0s\n 10050K .......... .......... .......... .......... .......... 92% 195M 0s\n 10100K .......... .......... .......... .......... .......... 93% 196M 0s\n 10150K .......... .......... .......... .......... .......... 93% 155M 0s\n 10200K .......... .......... .......... .......... .......... 94% 197M 0s\n 10250K .......... .......... .......... .......... .......... 94% 186M 0s\n 10300K .......... .......... .......... .......... .......... 95% 188M 0s\n 10350K .......... .......... .......... .......... .......... 95% 156M 0s\n 10400K .......... .......... .......... .......... .......... 96% 192M 0s\n 10450K .......... .......... .......... .......... .......... 96% 210M 0s\n 10500K .......... .......... .......... .......... .......... 97% 216M 0s\n 10550K .......... .......... .......... .......... .......... 97% 162M 0s\n 10600K .......... .......... .......... .......... .......... 98% 188M 0s\n 10650K .......... .......... .......... .......... .......... 
98% 203M 0s\n 10700K .......... .......... .......... .......... .......... 98% 176M 0s\n 10750K .......... .......... .......... .......... .......... 99% 184M 0s\n 10800K .......... .......... .......... .......... .......... 99% 227M 0s\n 10850K .......... ... 100% 228M\u003d0.09s\n\n2020-04-16 12:08:06 (113 MB/s) - ‘apache.access.log.PROJECT.gz’ saved [11124290/11124290]\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857811_-62042733", + "id": "20160725-204705_200637156", + "dateCreated": "2020-04-16 12:07:37.811", + "dateStarted": "2020-04-16 12:08:04.821", + "dateFinished": "2020-04-16 12:08:06.852", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sh\n\nhdfs dfs -rm -f -skipTrash /tmp/apache.access.log.PROJECT\nhdfs dfs -put apache.access.log.PROJECT /tmp\nhdfs dfs -ls /tmp/apache.access.log.PROJECT\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:06.923", + "config": { + "editorSetting": { + "language": "sh", + "editOnDblClick": false, + "completionSupport": false + }, + "colWidth": 12.0, + "editorMode": "ace/mode/sh", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "20/04/16 12:08:08 INFO fs.DCEMount: MAINCLASS::::[zeppelin:org.apache.hadoop.fs.FsShell:[4518]:[1]:::] Created sparkfs context singleton object: 140627850098688 HEX PTR: 7fe6790cf800 references: 1 is_dce_shuffle_instance: false\n20/04/16 12:08:08 INFO fs.DCEMount: Started DCEMount. 
application_name: dcefs fsConfFilePath: /mnt/gluster_mounts/meta-volume/spark_config/7c34c7b6-7803-11ea-ae2c-005056829778/7c34c7b6-7803-11ea-ae2c-005056829778_NO_DCEVU.yaml is_dce_shuffle_instance:false sparkfs_ctx_ptr: 140627850098688 sparkfs_shufflectx_ptr: 0\n20/04/16 12:08:08 ERROR dce.DCEFSTalker: DCEMount initialized for application_name\u003ddcefs .is_dce_shuffle_instance\u003dfalse .readBufferSize\u003d131072 .writeBufferSize\u003d131072 .\nDeleted /tmp/apache.access.log.PROJECT\n20/04/16 12:08:10 INFO fs.DCEMount: MAINCLASS::::[zeppelin:org.apache.hadoop.fs.FsShell:[4589]:[1]:::] Created sparkfs context singleton object: 140251369373872 HEX PTR: 7f8ed10d00b0 references: 1 is_dce_shuffle_instance: false\n20/04/16 12:08:10 INFO fs.DCEMount: Started DCEMount. application_name: dcefs fsConfFilePath: /mnt/gluster_mounts/meta-volume/spark_config/7c34c7b6-7803-11ea-ae2c-005056829778/7c34c7b6-7803-11ea-ae2c-005056829778_NO_DCEVU.yaml is_dce_shuffle_instance:false sparkfs_ctx_ptr: 140251369373872 sparkfs_shufflectx_ptr: 0\n20/04/16 12:08:10 ERROR dce.DCEFSTalker: DCEMount initialized for application_name\u003ddcefs .is_dce_shuffle_instance\u003dfalse .readBufferSize\u003d131072 .writeBufferSize\u003d131072 .\n20/04/16 12:08:13 INFO fs.DCEMount: MAINCLASS::::[zeppelin:org.apache.hadoop.fs.FsShell:[4514]:[1]:::] Created sparkfs context singleton object: 139695371538144 HEX PTR: 7f0d5cff4ee0 references: 1 is_dce_shuffle_instance: false\n20/04/16 12:08:13 INFO fs.DCEMount: Started DCEMount. 
application_name: dcefs fsConfFilePath: /mnt/gluster_mounts/meta-volume/spark_config/7c34c7b6-7803-11ea-ae2c-005056829778/7c34c7b6-7803-11ea-ae2c-005056829778_NO_DCEVU.yaml is_dce_shuffle_instance:false sparkfs_ctx_ptr: 139695371538144 sparkfs_shufflectx_ptr: 0\n20/04/16 12:08:13 ERROR dce.DCEFSTalker: DCEMount initialized for application_name\u003ddcefs .is_dce_shuffle_instance\u003dfalse .readBufferSize\u003d131072 .writeBufferSize\u003d131072 .\n-rw-r--r--+ 1 root root 111503503 2020-04-16 12:08 /tmp/apache.access.log.PROJECT\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857813_285620653", + "id": "20160721-202134_1390334492", + "dateCreated": "2020-04-16 12:07:37.813", + "dateStarted": "2020-04-16 12:08:07.255", + "dateFinished": "2020-04-16 12:08:13.271", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\nsc\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:13.360", + "config": { + "tableHide": false, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "res6: org.apache.spark.SparkContext \u003d org.apache.spark.SparkContext@4318a7ae\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857814_1569344482", + "id": "20160724-010528_127523330", + "dateCreated": "2020-04-16 12:07:37.815", + "dateStarted": "2020-04-16 12:08:13.638", + "dateFinished": "2020-04-16 12:08:13.862", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%spark\nsqlContext\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 
12:08:13.938", + "config": { + "colWidth": 12.0, + "fontSize": 9.0, + "results": {}, + "enabled": true, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "res7: org.apache.spark.sql.SQLContext \u003d org.apache.spark.sql.SQLContext@1a2a90ef\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857816_-477208723", + "id": "20200414-113153_1855927640", + "dateCreated": "2020-04-16 12:07:37.816", + "dateStarted": "2020-04-16 12:08:14.213", + "dateFinished": "2020-04-16 12:08:14.377", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\nimport datetime\n\n# Quick test of the datetime library\nprint \u0027This was last run on: {0}\u0027.format(datetime.datetime.now())", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:14.412", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "This was last run on: 2020-04-16 12:08:14.695199\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857817_1096423428", + "id": "20160718-191223_2111119137", + "dateCreated": "2020-04-16 12:07:37.817", + "dateStarted": "2020-04-16 12:08:14.688", + "dateFinished": "2020-04-16 12:08:14.698", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n# Quick test of regexp library\nimport 
re\n\n# Quick test of the regular expression library\nm \u003d re.search(\u0027(?\u003c\u003dabc)def\u0027, \u0027abcdef\u0027)\n\nprint m.group(0)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:14.787", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "def\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857818_-2106901498", + "id": "20160718-190935_95514384", + "dateCreated": "2020-04-16 12:07:37.818", + "dateStarted": "2020-04-16 12:08:15.003", + "dateFinished": "2020-04-16 12:08:15.011", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n## Part 2: Exploratory Data Analysis\n\nLet\u0027s begin looking at our data. For this lab, we will use a data set from NASA Kennedy Space Center web server in Florida. The full data set is freely available at \u003chttp://ita.ee.lbl.gov/html/contrib/NASA-HTTP.html\u003e, and it contains all HTTP requests for two months. We are using a subset that only contains several days\u0027 worth of requests. 
The log file has already been downloaded for you.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:15.102", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch2\u003ePart 2: Exploratory Data Analysis\u003c/h2\u003e\n\u003cp\u003eLet\u0026rsquo;s begin looking at our data. For this lab, we will use a data set from NASA Kennedy Space Center web server in Florida. The full data set is freely available at \u003ca href\u003d\"http://ita.ee.lbl.gov/html/contrib/NASA-HTTP.html\"\u003ehttp://ita.ee.lbl.gov/html/contrib/NASA-HTTP.html\u003c/a\u003e, and it contains all HTTP requests for two months. We are using a subset that only contains several days\u0026rsquo; worth of requests. The log file has already been downloaded for you.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857819_1342710068", + "id": "20160721-210530_1635089066", + "dateCreated": "2020-04-16 12:07:37.819", + "dateStarted": "2020-04-16 12:08:15.377", + "dateFinished": "2020-04-16 12:08:15.389", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (2a) Loading the log file\n\nNow that we have the path to the file, let\u0027s load it into a DataFrame. We\u0027ll do this in steps. First, we\u0027ll use sqlContext.read.text() to read the text file. 
This will produce a DataFrame with a single string column called value.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:15.476", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(2a) Loading the log file\u003c/h3\u003e\n\u003cp\u003eNow that we have the path to the file, let\u0026rsquo;s load it into a DataFrame. We\u0026rsquo;ll do this in steps. First, we\u0026rsquo;ll use sqlContext.read.text() to read the text file. This will produce a DataFrame with a single string column called value.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857820_-1201232176", + "id": "20160721-210741_1305148056", + "dateCreated": "2020-04-16 12:07:37.820", + "dateStarted": "2020-04-16 12:08:15.684", + "dateFinished": "2020-04-16 12:08:15.691", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n# Specify path to downloaded log file\nimport sys\nimport os\n\nlog_file_path \u003d \u0027dce:///\u0027 + os.path.join(\u0027tmp\u0027, \u0027apache.access.log.PROJECT\u0027)\n\nprint log_file_path\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:15.782", + "config": { + "editorSetting": { + "language": "python", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "colWidth": 12.0, + "editorMode": "ace/mode/python", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + 
} + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "dce:///tmp/apache.access.log.PROJECT\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857821_1696313949", + "id": "20160718-191015_2140118453", + "dateCreated": "2020-04-16 12:07:37.821", + "dateStarted": "2020-04-16 12:08:15.991", + "dateFinished": "2020-04-16 12:08:15.998", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nbase_df \u003d sqlContext.read.text(log_file_path)\n# Let\u0027s look at the schema\nbase_df.printSchema()\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:16.090", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "root\n |-- value: string (nullable \u003d true)\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857822_1278752749", + "id": "20160718-195018_1400250065", + "dateCreated": "2020-04-16 12:07:37.822", + "dateStarted": "2020-04-16 12:08:16.309", + "dateFinished": "2020-04-16 12:08:16.344", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nLet\u0027s take a look at some of the data", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:16.414", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + 
"params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eLet\u0026rsquo;s take a look at some of the data\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857822_2143615579", + "id": "20160721-210826_262995281", + "dateCreated": "2020-04-16 12:07:37.822", + "dateStarted": "2020-04-16 12:08:16.658", + "dateFinished": "2020-04-16 12:08:16.665", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nbase_df.show(truncate\u003dFalse)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:16.757", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+-------------------------------------------------------------------------------------------------------------------------------+\n|value |\n+-------------------------------------------------------------------------------------------------------------------------------+\n|in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] \"GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0\" 200 1839 |\n|uplherc.upl.com - - [01/Aug/1995:00:00:07 -0400] \"GET / HTTP/1.0\" 304 0 |\n|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] \"GET /images/ksclogo-medium.gif HTTP/1.0\" 304 0 |\n|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] \"GET /images/MOSAIC-logosmall.gif HTTP/1.0\" 304 0 |\n|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] \"GET /images/USA-logosmall.gif HTTP/1.0\" 304 0 |\n|ix-esc-ca2-07.ix.netcom.com - - [01/Aug/1995:00:00:09 -0400] \"GET 
/images/launch-logo.gif HTTP/1.0\" 200 1713 |\n|uplherc.upl.com - - [01/Aug/1995:00:00:10 -0400] \"GET /images/WORLD-logosmall.gif HTTP/1.0\" 304 0 |\n|slppp6.intermind.net - - [01/Aug/1995:00:00:10 -0400] \"GET /history/skylab/skylab.html HTTP/1.0\" 200 1687 |\n|piweba4y.prodigy.com - - [01/Aug/1995:00:00:10 -0400] \"GET /images/launchmedium.gif HTTP/1.0\" 200 11853 |\n|slppp6.intermind.net - - [01/Aug/1995:00:00:11 -0400] \"GET /history/skylab/skylab-small.gif HTTP/1.0\" 200 9202 |\n|slppp6.intermind.net - - [01/Aug/1995:00:00:12 -0400] \"GET /images/ksclogosmall.gif HTTP/1.0\" 200 3635 |\n|ix-esc-ca2-07.ix.netcom.com - - [01/Aug/1995:00:00:12 -0400] \"GET /history/apollo/images/apollo-logo1.gif HTTP/1.0\" 200 1173 |\n|slppp6.intermind.net - - [01/Aug/1995:00:00:13 -0400] \"GET /history/apollo/images/apollo-logo.gif HTTP/1.0\" 200 3047 |\n|uplherc.upl.com - - [01/Aug/1995:00:00:14 -0400] \"GET /images/NASA-logosmall.gif HTTP/1.0\" 304 0 |\n|133.43.96.45 - - [01/Aug/1995:00:00:16 -0400] \"GET /shuttle/missions/sts-69/mission-sts-69.html HTTP/1.0\" 200 10566 |\n|kgtyk4.kj.yamagata-u.ac.jp - - [01/Aug/1995:00:00:17 -0400] \"GET / HTTP/1.0\" 200 7280 |\n|kgtyk4.kj.yamagata-u.ac.jp - - [01/Aug/1995:00:00:18 -0400] \"GET /images/ksclogo-medium.gif HTTP/1.0\" 200 5866 |\n|d0ucr6.fnal.gov - - [01/Aug/1995:00:00:19 -0400] \"GET /history/apollo/apollo-16/apollo-16.html HTTP/1.0\" 200 2743 |\n|ix-esc-ca2-07.ix.netcom.com - - [01/Aug/1995:00:00:19 -0400] \"GET /shuttle/resources/orbiters/discovery.html HTTP/1.0\" 200 6849|\n|d0ucr6.fnal.gov - - [01/Aug/1995:00:00:20 -0400] \"GET /history/apollo/apollo-16/apollo-16-patch-small.gif HTTP/1.0\" 200 14897 |\n+-------------------------------------------------------------------------------------------------------------------------------+\nonly showing top 20 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857823_352568890", + "id": "20160718-195215_169877856", + "dateCreated": "2020-04-16 12:07:37.823", + 
"dateStarted": "2020-04-16 12:08:16.995", + "dateFinished": "2020-04-16 12:08:17.320", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (2b) Parsing the log file", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:17.394", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(2b) Parsing the log file\u003c/h3\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857824_143787995", + "id": "20160721-212802_988981671", + "dateCreated": "2020-04-16 12:07:37.824", + "dateStarted": "2020-04-16 12:08:17.650", + "dateFinished": "2020-04-16 12:08:17.657", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nIf you\u0027re familiar with web servers at all, you\u0027ll recognize that this is in\n[Common Log Format](https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format). \n\nThe fields are:\n\nremotehost, rfc931, authuser, [date], \"request\", status, bytes\n\n\n | field | meaning |\n |-------------|------------------------------------------------------------------------|\n | remotehost | Remote hostname (or IP number if DNS hostname is not available). |\n | rfc931 | The remote logname of the user. We don\u0027t really care about this field. |\n | authuser | The username of the remote user, as authenticated by the HTTP server. |\n | [date] | The date and time of the request. |\n | \"request\" | The request, exactly as it came from the browser or client. 
|\n | status | The HTTP status code the server sent back to the client. |\n | bytes | The number of bytes (`Content-Length`) transferred to the client. |\n\n\n\nNext, we have to parse it into individual columns. We\u0027ll use the special built-in [regexp\\_extract()](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.regexp_extract)\nfunction to do the parsing. This function matches a column against a regular expression with one or more [capture groups](http://regexone.com/lesson/capturing_groups) and allows you to extract one of the matched groups. We\u0027ll use one regular expression for each field we wish to extract.\n\nIf you can\u0027t read these regular expressions, don\u0027t worry. Trust us: They work. If you find regular expressions confusing (and they certainly _can_ be), and you want to learn more about them, start with the\n[RegexOne web site](http://regexone.com/). You might also find [_Regular Expressions Cookbook_](http://shop.oreilly.com/product/0636920023630.do), by Jan Goyvaerts and Steven Levithan, to be helpful.\n\n_Some people, when confronted with a problem, think \"I know, I\u0027ll use regular expressions.\" Now they have two problems._ (attributed to Jamie Zawinski)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:17.749", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eIf you\u0026rsquo;re familiar with web servers at all, you\u0026rsquo;ll recognize that this is in\u003cbr/\u003e\u003ca 
href\u003d\"https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format\"\u003eCommon Log Format\u003c/a\u003e. \u003c/p\u003e\n\u003cp\u003eThe fields are:\u003c/p\u003e\n\u003cp\u003eremotehost, rfc931, authuser, [date], \u0026ldquo;request\u0026rdquo;, status, bytes\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e| field | meaning |\n|-------------|------------------------------------------------------------------------|\n| remotehost | Remote hostname (or IP number if DNS hostname is not available). |\n| rfc931 | The remote logname of the user. We don\u0026#39;t really care about this field. |\n| authuser | The username of the remote user, as authenticated by the HTTP server. |\n| [date] | The date and time of the request. |\n| \u0026quot;request\u0026quot; | The request, exactly as it came from the browser or client. |\n| status | The HTTP status code the server sent back to the client. |\n| bytes | The number of bytes (`Content-Length`) transferred to the client. |\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eNext, we have to parse it into individual columns. We\u0026rsquo;ll use the special built-in \u003ca href\u003d\"http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.regexp_extract\"\u003eregexp_extract()\u003c/a\u003e\u003cbr/\u003efunction to do the parsing. This function matches a column against a regular expression with one or more \u003ca href\u003d\"http://regexone.com/lesson/capturing_groups\"\u003ecapture groups\u003c/a\u003e and allows you to extract one of the matched groups. We\u0026rsquo;ll use one regular expression for each field we wish to extract.\u003c/p\u003e\n\u003cp\u003eIf you can\u0026rsquo;t read these regular expressions, don\u0026rsquo;t worry. Trust us: They work. 
If you find regular expressions confusing (and they certainly \u003cem\u003ecan\u003c/em\u003e be), and you want to learn more about them, start with the\u003cbr/\u003e\u003ca href\u003d\"http://regexone.com/\"\u003eRegexOne web site\u003c/a\u003e. You might also find \u003ca href\u003d\"http://shop.oreilly.com/product/0636920023630.do\"\u003e\u003cem\u003eRegular Expressions Cookbook\u003c/em\u003e\u003c/a\u003e, by Jan Goyvaerts and Steven Levithan, to be helpful.\u003c/p\u003e\n\u003cp\u003e\u003cem\u003eSome people, when confronted with a problem, think \u0026ldquo;I know, I\u0026rsquo;ll use regular expressions.\u0026rdquo; Now they have two problems.\u003c/em\u003e (attributed to Jamie Zawinski)\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857825_1083081291", + "id": "20160721-210934_867340648", + "dateCreated": "2020-04-16 12:07:37.825", + "dateStarted": "2020-04-16 12:08:17.979", + "dateFinished": "2020-04-16 12:08:17.996", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nfrom pyspark.sql.functions import split, regexp_extract\nsplit_df \u003d base_df.select(regexp_extract(\u0027value\u0027, r\u0027^([^\\s]+\\s)\u0027, 1).alias(\u0027host\u0027),\n regexp_extract(\u0027value\u0027, r\u0027^.*\\[(\\d\\d/\\w{3}/\\d{4}:\\d{2}:\\d{2}:\\d{2} -\\d{4})]\u0027, 1).alias(\u0027timestamp\u0027),\n regexp_extract(\u0027value\u0027, r\u0027^.*\"\\w+\\s+([^\\s]+)\\s+HTTP.*\"\u0027, 1).alias(\u0027path\u0027),\n regexp_extract(\u0027value\u0027, r\u0027^.*\"\\s+([^\\s]+)\u0027, 1).cast(\u0027integer\u0027).alias(\u0027status\u0027),\n regexp_extract(\u0027value\u0027, r\u0027^.*\\s+(\\d+)$\u0027, 1).cast(\u0027integer\u0027).alias(\u0027content_size\u0027))\nsplit_df.show(truncate\u003dFalse)\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:18.078", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ 
+ { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+----------------------------+--------------------------+---------------------------------------------------+------+------------+\n|host |timestamp |path |status|content_size|\n+----------------------------+--------------------------+---------------------------------------------------+------+------------+\n|in24.inetnebr.com |01/Aug/1995:00:00:01 -0400|/shuttle/missions/sts-68/news/sts-68-mcc-05.txt |200 |1839 |\n|uplherc.upl.com |01/Aug/1995:00:00:07 -0400|/ |304 |0 |\n|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/ksclogo-medium.gif |304 |0 |\n|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/MOSAIC-logosmall.gif |304 |0 |\n|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/USA-logosmall.gif |304 |0 |\n|ix-esc-ca2-07.ix.netcom.com |01/Aug/1995:00:00:09 -0400|/images/launch-logo.gif |200 |1713 |\n|uplherc.upl.com |01/Aug/1995:00:00:10 -0400|/images/WORLD-logosmall.gif |304 |0 |\n|slppp6.intermind.net |01/Aug/1995:00:00:10 -0400|/history/skylab/skylab.html |200 |1687 |\n|piweba4y.prodigy.com |01/Aug/1995:00:00:10 -0400|/images/launchmedium.gif |200 |11853 |\n|slppp6.intermind.net |01/Aug/1995:00:00:11 -0400|/history/skylab/skylab-small.gif |200 |9202 |\n|slppp6.intermind.net |01/Aug/1995:00:00:12 -0400|/images/ksclogosmall.gif |200 |3635 |\n|ix-esc-ca2-07.ix.netcom.com |01/Aug/1995:00:00:12 -0400|/history/apollo/images/apollo-logo1.gif |200 |1173 |\n|slppp6.intermind.net |01/Aug/1995:00:00:13 -0400|/history/apollo/images/apollo-logo.gif |200 |3047 |\n|uplherc.upl.com |01/Aug/1995:00:00:14 -0400|/images/NASA-logosmall.gif |304 |0 |\n|133.43.96.45 |01/Aug/1995:00:00:16 -0400|/shuttle/missions/sts-69/mission-sts-69.html |200 |10566 
|\n|kgtyk4.kj.yamagata-u.ac.jp |01/Aug/1995:00:00:17 -0400|/ |200 |7280 |\n|kgtyk4.kj.yamagata-u.ac.jp |01/Aug/1995:00:00:18 -0400|/images/ksclogo-medium.gif |200 |5866 |\n|d0ucr6.fnal.gov |01/Aug/1995:00:00:19 -0400|/history/apollo/apollo-16/apollo-16.html |200 |2743 |\n|ix-esc-ca2-07.ix.netcom.com |01/Aug/1995:00:00:19 -0400|/shuttle/resources/orbiters/discovery.html |200 |6849 |\n|d0ucr6.fnal.gov |01/Aug/1995:00:00:20 -0400|/history/apollo/apollo-16/apollo-16-patch-small.gif|200 |14897 |\n+----------------------------+--------------------------+---------------------------------------------------+------+------------+\nonly showing top 20 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857826_-1295842938", + "id": "20160718-195552_1528005254", + "dateCreated": "2020-04-16 12:07:37.826", + "dateStarted": "2020-04-16 12:08:18.279", + "dateFinished": "2020-04-16 12:08:18.717", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (2c) Data Cleaning\n\nLet\u0027s see how well our parsing logic worked. First, let\u0027s verify that there are no null rows in the original data set.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:18.779", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(2c) Data Cleaning\u003c/h3\u003e\n\u003cp\u003eLet\u0026rsquo;s see how well our parsing logic worked. 
First, let\u0026rsquo;s verify that there are no null rows in the original data set.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857827_-1830491143", + "id": "20160721-212703_1932014155", + "dateCreated": "2020-04-16 12:07:37.827", + "dateStarted": "2020-04-16 12:08:19.044", + "dateFinished": "2020-04-16 12:08:19.051", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nprint base_df.filter(base_df[\u0027value\u0027].isNull()).count()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:19.143", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "0\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857828_-731100291", + "id": "20160718-200153_1865911463", + "dateCreated": "2020-04-16 12:07:37.828", + "dateStarted": "2020-04-16 12:08:19.385", + "dateFinished": "2020-04-16 12:08:19.900", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nIf our parsing worked properly, we\u0027ll have no rows with null column values. 
Let\u0027s check.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:19.984", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eIf our parsing worked properly, we\u0026rsquo;ll have no rows with null column values. Let\u0026rsquo;s check.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857829_1259382530", + "id": "20160721-212848_1283299327", + "dateCreated": "2020-04-16 12:07:37.829", + "dateStarted": "2020-04-16 12:08:20.238", + "dateFinished": "2020-04-16 12:08:20.247", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nbad_rows_df \u003d split_df.filter(split_df[\u0027host\u0027].isNull() |\n split_df[\u0027timestamp\u0027].isNull() |\n split_df[\u0027path\u0027].isNull() |\n split_df[\u0027status\u0027].isNull() |\n split_df[\u0027content_size\u0027].isNull())\n\nprint bad_rows_df.count()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:20.336", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "8756\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857830_2115216399", + "id": "20160718-200217_1205719967", + 
"dateCreated": "2020-04-16 12:07:37.830", + "dateStarted": "2020-04-16 12:08:20.574", + "dateFinished": "2020-04-16 12:08:25.372", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nNot perfect. We have some null values. We have more cleaning to do. Which columns are affected?", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:25.376", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eNot perfect. We have some null values. We have more cleaning to do. Which columns are affected?\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857831_1420760720", + "id": "20160721-212919_864646186", + "dateCreated": "2020-04-16 12:07:37.831", + "dateStarted": "2020-04-16 12:08:25.681", + "dateFinished": "2020-04-16 12:08:25.690", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nfrom pyspark.sql.functions import col, sum\n\ndef count_null(col_name):\n return sum(col(col_name).isNull().cast(\u0027integer\u0027)).alias(col_name)\n\n# Build up a list of column expressions, one per column", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:25.791", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857833_-261001759", + "id": "20160718-200250_1860290921", 
+ "dateCreated": "2020-04-16 12:07:37.833", + "dateStarted": "2020-04-16 12:08:26.174", + "dateFinished": "2020-04-16 12:08:26.180", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nexprs \u003d []\nfor col_name in split_df.columns:\n exprs.append(count_null(col_name))\n \nprint exprs\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:26.273", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "[Column\u003csum(CAST((host IS NULL) AS INT)) AS `host`\u003e, Column\u003csum(CAST((timestamp IS NULL) AS INT)) AS `timestamp`\u003e, Column\u003csum(CAST((path IS NULL) AS INT)) AS `path`\u003e, Column\u003csum(CAST((status IS NULL) AS INT)) AS `status`\u003e, Column\u003csum(CAST((content_size IS NULL) AS INT)) AS `content_size`\u003e]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857834_-735136779", + "id": "20160718-200341_292449305", + "dateCreated": "2020-04-16 12:07:37.834", + "dateStarted": "2020-04-16 12:08:26.522", + "dateFinished": "2020-04-16 12:08:26.559", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nsplit_df.agg(*exprs).show()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:26.621", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + 
"msg": [ + { + "type": "TEXT", + "data": "+----+---------+----+------+------------+\n|host|timestamp|path|status|content_size|\n+----+---------+----+------+------------+\n| 0| 0| 0| 0| 8756|\n+----+---------+----+------+------------+\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857835_-404419615", + "id": "20160718-200402_1743180176", + "dateCreated": "2020-04-16 12:07:37.835", + "dateStarted": "2020-04-16 12:08:26.860", + "dateFinished": "2020-04-16 12:08:32.050", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nOkay, they\u0027re all in the `content_size` column. Let\u0027s see if we can figure out what\u0027s wrong. Our original parsing regular expression for that column was:\n\n```\nregexp_extract(\u0027value\u0027, r\u0027^.*\\s+(\\d+)$\u0027, 1).cast(\u0027integer\u0027).alias(\u0027content_size\u0027)\n```\n\nThe `\\d+` selects one or more digits at the end of the input line. Is it possible there are lines without a valid content size? Or is there something wrong with our regular expression? Let\u0027s see if there are any lines that do not end with one or more digits.\n\n**Note**: In the expression below, `~` means \"not\".", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:32.065", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eOkay, they\u0026rsquo;re all in the \u003ccode\u003econtent_size\u003c/code\u003e column. Let\u0026rsquo;s see if we can figure out what\u0026rsquo;s wrong. 
Our original parsing regular expression for that column was:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003eregexp_extract(\u0026#39;value\u0026#39;, r\u0026#39;^.*\\s+(\\d+)$\u0026#39;, 1).cast(\u0026#39;integer\u0026#39;).alias(\u0026#39;content_size\u0026#39;)\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eThe \u003ccode\u003e\\d+\u003c/code\u003e selects one or more digits at the end of the input line. Is it possible there are lines without a valid content size? Or is there something wrong with our regular expression? Let\u0026rsquo;s see if there are any lines that do not end with one or more digits.\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003eNote\u003c/strong\u003e: In the expression below, \u003ccode\u003e~\u003c/code\u003e means \u0026ldquo;not\u0026rdquo;.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857836_-1853839059", + "id": "20160721-213021_496415433", + "dateCreated": "2020-04-16 12:07:37.836", + "dateStarted": "2020-04-16 12:08:32.397", + "dateFinished": "2020-04-16 12:08:32.409", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nbad_content_size_df \u003d base_df.filter(~ base_df[\u0027value\u0027].rlike(r\u0027\\d+$\u0027))\n\nprint bad_content_size_df.count()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:32.496", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "8756\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857838_1949422515", + "id": "20160718-200415_1670913707", + "dateCreated": "2020-04-16 12:07:37.838", + "dateStarted": "2020-04-16 12:08:32.948", 
+ "dateFinished": "2020-04-16 12:08:34.252", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nThat\u0027s it! The count matches the number of rows in `bad_rows_df` exactly.\n\nLet\u0027s take a look at some of the bad column values. Since it\u0027s possible that the rows end in extra white space, we\u0027ll tack a marker character onto the end of each line, to make it easier to see trailing white space.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:34.348", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eThat\u0026rsquo;s it! The count matches the number of rows in \u003ccode\u003ebad_rows_df\u003c/code\u003e exactly.\u003c/p\u003e\n\u003cp\u003eLet\u0026rsquo;s take a look at some of the bad column values. 
Since it\u0026rsquo;s possible that the rows end in extra white space, we\u0026rsquo;ll tack a marker character onto the end of each line, to make it easier to see trailing white space.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857839_583251534", + "id": "20160721-213051_622513747", + "dateCreated": "2020-04-16 12:07:37.839", + "dateStarted": "2020-04-16 12:08:34.596", + "dateFinished": "2020-04-16 12:08:34.606", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nfrom pyspark.sql.functions import lit, concat\n\nbad_content_size_df.select(concat(bad_content_size_df[\u0027value\u0027], lit(\u0027$\u0027))).show(truncate\u003dFalse)\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:34.695", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+----------------------------------------------------------------------------------------------------------------------------+\n|concat(value, $) |\n+----------------------------------------------------------------------------------------------------------------------------+\n|gw1.att.com - - [01/Aug/1995:00:03:53 -0400] \"GET /shuttle/missions/sts-73/news HTTP/1.0\" 302 -$ |\n|js002.cc.utsunomiya-u.ac.jp - - [01/Aug/1995:00:07:33 -0400] \"GET /shuttle/resources/orbiters/discovery.gif HTTP/1.0\" 404 -$|\n|tia1.eskimo.com - - [01/Aug/1995:00:28:41 -0400] \"GET /pub/winvn/release.txt HTTP/1.0\" 404 -$ |\n|itws.info.eng.niigata-u.ac.jp - - [01/Aug/1995:00:38:01 -0400] \"GET /ksc.html/facts/about_ksc.html HTTP/1.0\" 403 -$ |\n|grimnet23.idirect.com - - 
[01/Aug/1995:00:50:12 -0400] \"GET /www/software/winvn/winvn.html HTTP/1.0\" 404 -$ |\n|miriworld.its.unimelb.edu.au - - [01/Aug/1995:01:04:54 -0400] \"GET /history/history.htm HTTP/1.0\" 404 -$ |\n|ras38.srv.net - - [01/Aug/1995:01:05:14 -0400] \"GET /elv/DELTA/uncons.htm HTTP/1.0\" 404 -$ |\n|cs1-06.leh.ptd.net - - [01/Aug/1995:01:17:38 -0400] \"GET /sts-71/launch/\" 404 -$ |\n|www-b2.proxy.aol.com - - [01/Aug/1995:01:22:07 -0400] \"GET /shuttle/countdown HTTP/1.0\" 302 -$ |\n|maui56.maui.net - - [01/Aug/1995:01:31:56 -0400] \"GET /shuttle HTTP/1.0\" 302 -$ |\n|dialip-24.athenet.net - - [01/Aug/1995:01:33:02 -0400] \"GET /history/apollo/apollo-13.html HTTP/1.0\" 404 -$ |\n|h96-158.ccnet.com - - [01/Aug/1995:01:35:50 -0400] \"GET /history/apollo/a-001/a-001-patch-small.gif HTTP/1.0\" 404 -$ |\n|h96-158.ccnet.com - - [01/Aug/1995:01:36:23 -0400] \"GET /history/apollo/a-001/movies/ HTTP/1.0\" 404 -$ |\n|h96-158.ccnet.com - - [01/Aug/1995:01:36:30 -0400] \"GET /history/apollo/a-001/a-001-patch-small.gif HTTP/1.0\" 404 -$ |\n|h96-158.ccnet.com - - [01/Aug/1995:01:36:38 -0400] \"GET /history/apollo/a-001/movies/ HTTP/1.0\" 404 -$ |\n|h96-158.ccnet.com - - [01/Aug/1995:01:36:42 -0400] \"GET /history/apollo/a-001/a-001-patch-small.gif HTTP/1.0\" 404 -$ |\n|h96-158.ccnet.com - - [01/Aug/1995:01:36:44 -0400] \"GET /history/apollo/a-001/images/ HTTP/1.0\" 404 -$ |\n|h96-158.ccnet.com - - [01/Aug/1995:01:36:47 -0400] \"GET /history/apollo/a-001/a-001-patch-small.gif HTTP/1.0\" 404 -$ |\n|h96-158.ccnet.com - - [01/Aug/1995:01:37:04 -0400] \"GET /history/apollo/a-004/a-004-patch-small.gif HTTP/1.0\" 404 -$ |\n|h96-158.ccnet.com - - [01/Aug/1995:01:37:05 -0400] \"GET /history/apollo/a-004/movies/ HTTP/1.0\" 404 -$ |\n+----------------------------------------------------------------------------------------------------------------------------+\nonly showing top 20 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857840_-835320090", + "id": 
"20160718-200445_212592333", + "dateCreated": "2020-04-16 12:07:37.840", + "dateStarted": "2020-04-16 12:08:34.959", + "dateFinished": "2020-04-16 12:08:35.988", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (2d) Fix the rows with null content\\_size\n\nThe easiest solution is to replace the null values in `split_df` with 0. The DataFrame API provides a set of functions and fields specifically designed for working with null values, among them:\n\n* [fillna()](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.fillna), which fills null values with specified non-null values.\n* [na](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.na), which returns a [DataFrameNaFunctions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameNaFunctions) object with many functions for operating on null columns.\n\nWe\u0027ll use `fillna()`, because it\u0027s simple. There are several ways to invoke this function. The easiest is just to replace _all_ null columns with known values. But, for safety, it\u0027s better to pass a Python dictionary containing (column\\_name, value) mappings. 
That\u0027s what we\u0027ll do.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:36.059", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(2d) Fix the rows with null content_size\u003c/h3\u003e\n\u003cp\u003eThe easiest solution is to replace the null values in \u003ccode\u003esplit_df\u003c/code\u003e with 0. The DataFrame API provides a set of functions and fields specifically designed for working with null values, among them:\u003c/p\u003e\n\u003cul\u003e\n \u003cli\u003e\u003ca href\u003d\"http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.fillna\"\u003efillna()\u003c/a\u003e, which fills null values with specified non-null values.\u003c/li\u003e\n \u003cli\u003e\u003ca href\u003d\"http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.na\"\u003ena\u003c/a\u003e, which returns a \u003ca href\u003d\"http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameNaFunctions\"\u003eDataFrameNaFunctions\u003c/a\u003e object with many functions for operating on null columns.\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eWe\u0026rsquo;ll use \u003ccode\u003efillna()\u003c/code\u003e, because it\u0026rsquo;s simple. There are several ways to invoke this function. The easiest is just to replace \u003cem\u003eall\u003c/em\u003e null columns with known values. But, for safety, it\u0026rsquo;s better to pass a Python dictionary containing (column_name, value) mappings. 
That\u0026rsquo;s what we\u0026rsquo;ll do.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857842_-39991921", + "id": "20160721-213152_1800712122", + "dateCreated": "2020-04-16 12:07:37.842", + "dateStarted": "2020-04-16 12:08:36.306", + "dateFinished": "2020-04-16 12:08:36.325", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n# Replace all null content_size values with 0.\ncleaned_df \u003d split_df.fillna({\u0027content_size\u0027: 0})", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:36.405", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857843_1284781406", + "id": "20160718-200611_314088170", + "dateCreated": "2020-04-16 12:07:37.843", + "dateStarted": "2020-04-16 12:08:36.638", + "dateFinished": "2020-04-16 12:08:36.654", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n# Ensure that there are no nulls left.\nexprs \u003d []\nfor col_name in cleaned_df.columns:\n exprs.append(count_null(col_name))\n\ncleaned_df.agg(*exprs).show()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:36.738", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+----+---------+----+------+------------+\n|host|timestamp|path|status|content_size|\n+----+---------+----+------+------------+\n| 0| 0| 0| 0| 
0|\n+----+---------+----+------+------------+\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857844_785852184", + "id": "20160718-200631_2138789122", + "dateCreated": "2020-04-16 12:07:37.844", + "dateStarted": "2020-04-16 12:08:36.988", + "dateFinished": "2020-04-16 12:08:41.261", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (2e) Parsing the timestamp.\n\nOkay, now that we have a clean, parsed DataFrame, we have to parse the timestamp field into an actual timestamp. The Common Log Format time is somewhat non-standard. A User-Defined Function (UDF) is the most straightforward way to parse it.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:41.292", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(2e) Parsing the timestamp.\u003c/h3\u003e\n\u003cp\u003eOkay, now that we have a clean, parsed DataFrame, we have to parse the timestamp field into an actual timestamp. The Common Log Format time is somewhat non-standard. 
A User-Defined Function (UDF) is the most straightforward way to parse it.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857844_-483421829", + "id": "20160721-213247_360874576", + "dateCreated": "2020-04-16 12:07:37.844", + "dateStarted": "2020-04-16 12:08:41.564", + "dateFinished": "2020-04-16 12:08:41.574", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nfrom pyspark.sql.functions import udf\n\nmonth_map \u003d {\n \u0027Jan\u0027: 1, \u0027Feb\u0027: 2, \u0027Mar\u0027:3, \u0027Apr\u0027:4, \u0027May\u0027:5, \u0027Jun\u0027:6, \u0027Jul\u0027:7,\n \u0027Aug\u0027:8, \u0027Sep\u0027: 9, \u0027Oct\u0027:10, \u0027Nov\u0027: 11, \u0027Dec\u0027: 12\n}\n\ndef parse_clf_time(s):\n \"\"\" Convert Common Log time format into a timestamp string\n Args:\n s (str): date and time in Apache time format [dd/mmm/yyyy:hh:mm:ss (+/-)zzzz]\n Returns:\n a string suitable for passing to CAST(\u0027timestamp\u0027)\n \"\"\"\n # NOTE: We\u0027re ignoring time zone here. 
In a production application, you\u0027d want to handle that.\n return \"{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}\".format(\n int(s[7:11]),\n month_map[s[3:6]],\n int(s[0:2]),\n int(s[12:14]),\n int(s[15:17]),\n int(s[18:20])\n )\n\nu_parse_time \u003d udf(parse_clf_time)\n\nlogs_df \u003d cleaned_df.select(\u0027*\u0027, u_parse_time(cleaned_df[\u0027timestamp\u0027]).cast(\u0027timestamp\u0027).alias(\u0027time\u0027)).drop(\u0027timestamp\u0027)\ntotal_log_entries \u003d logs_df.count()\n\nprint total_log_entries", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:41.663", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "1043177\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857845_860270154", + "id": "20160718-200648_780067911", + "dateCreated": "2020-04-16 12:07:37.845", + "dateStarted": "2020-04-16 12:08:41.855", + "dateFinished": "2020-04-16 12:08:42.288", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nlogs_df.printSchema()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:42.355", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "root\n |-- host: string (nullable \u003d true)\n |-- path: string (nullable \u003d true)\n |-- 
status: integer (nullable \u003d true)\n |-- content_size: integer (nullable \u003d false)\n |-- time: timestamp (nullable \u003d true)\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857846_1735475434", + "id": "20160718-200730_1083966913", + "dateCreated": "2020-04-16 12:07:37.846", + "dateStarted": "2020-04-16 12:08:42.582", + "dateFinished": "2020-04-16 12:08:42.589", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nlogs_df.show(truncate \u003d False)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:42.682", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+----------------------------+---------------------------------------------------+------+------------+-------------------+\n|host |path |status|content_size|time |\n+----------------------------+---------------------------------------------------+------+------------+-------------------+\n|in24.inetnebr.com |/shuttle/missions/sts-68/news/sts-68-mcc-05.txt |200 |1839 |1995-08-01 00:00:01|\n|uplherc.upl.com |/ |304 |0 |1995-08-01 00:00:07|\n|uplherc.upl.com |/images/ksclogo-medium.gif |304 |0 |1995-08-01 00:00:08|\n|uplherc.upl.com |/images/MOSAIC-logosmall.gif |304 |0 |1995-08-01 00:00:08|\n|uplherc.upl.com |/images/USA-logosmall.gif |304 |0 |1995-08-01 00:00:08|\n|ix-esc-ca2-07.ix.netcom.com |/images/launch-logo.gif |200 |1713 |1995-08-01 00:00:09|\n|uplherc.upl.com |/images/WORLD-logosmall.gif |304 |0 |1995-08-01 00:00:10|\n|slppp6.intermind.net |/history/skylab/skylab.html |200 |1687 |1995-08-01 00:00:10|\n|piweba4y.prodigy.com |/images/launchmedium.gif |200 |11853 
|1995-08-01 00:00:10|\n|slppp6.intermind.net |/history/skylab/skylab-small.gif |200 |9202 |1995-08-01 00:00:11|\n|slppp6.intermind.net |/images/ksclogosmall.gif |200 |3635 |1995-08-01 00:00:12|\n|ix-esc-ca2-07.ix.netcom.com |/history/apollo/images/apollo-logo1.gif |200 |1173 |1995-08-01 00:00:12|\n|slppp6.intermind.net |/history/apollo/images/apollo-logo.gif |200 |3047 |1995-08-01 00:00:13|\n|uplherc.upl.com |/images/NASA-logosmall.gif |304 |0 |1995-08-01 00:00:14|\n|133.43.96.45 |/shuttle/missions/sts-69/mission-sts-69.html |200 |10566 |1995-08-01 00:00:16|\n|kgtyk4.kj.yamagata-u.ac.jp |/ |200 |7280 |1995-08-01 00:00:17|\n|kgtyk4.kj.yamagata-u.ac.jp |/images/ksclogo-medium.gif |200 |5866 |1995-08-01 00:00:18|\n|d0ucr6.fnal.gov |/history/apollo/apollo-16/apollo-16.html |200 |2743 |1995-08-01 00:00:19|\n|ix-esc-ca2-07.ix.netcom.com |/shuttle/resources/orbiters/discovery.html |200 |6849 |1995-08-01 00:00:19|\n|d0ucr6.fnal.gov |/history/apollo/apollo-16/apollo-16-patch-small.gif|200 |14897 |1995-08-01 00:00:20|\n+----------------------------+---------------------------------------------------+------+------------+-------------------+\nonly showing top 20 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857847_900593409", + "id": "20160718-201656_1316433772", + "dateCreated": "2020-04-16 12:07:37.847", + "dateStarted": "2020-04-16 12:08:42.921", + "dateFinished": "2020-04-16 12:08:47.288", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nLet\u0027s cache logs_df. 
We\u0027re going to be using it quite a bit from here forward.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:47.325", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eLet\u0026rsquo;s cache logs_df. We\u0026rsquo;re going to be using it quite a bit from here forward.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857848_202037001", + "id": "20160721-213319_1843703416", + "dateCreated": "2020-04-16 12:07:37.848", + "dateStarted": "2020-04-16 12:08:47.616", + "dateFinished": "2020-04-16 12:08:47.624", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nlogs_df.cache()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:47.715", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "DataFrame[host: string, path: string, status: int, content_size: int, time: timestamp]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857849_288796413", + "id": "20160718-201706_792557368", + "dateCreated": "2020-04-16 12:07:37.849", + "dateStarted": "2020-04-16 12:08:47.979", + "dateFinished": "2020-04-16 12:08:48.076", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n## Part 3: Analysis Walk-Through on the Web Server Log File\n\nNow that we have a DataFrame 
containing the parsed log file as a set of Row objects, we can perform various analyses.\n\n### (3a) Example: Content Size Statistics\n\nLet\u0027s compute some statistics about the sizes of content being returned by the web server. In particular, we\u0027d like to know what are the average, minimum, and maximum content sizes.\n\nWe can compute the statistics by calling `.describe()` on the `content_size` column of `logs_df`. The `.describe()` function returns the count, mean, stddev, min, and max of a given column.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:48.078", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch2\u003ePart 3: Analysis Walk-Through on the Web Server Log File\u003c/h2\u003e\n\u003cp\u003eNow that we have a DataFrame containing the parsed log file as a set of Row objects, we can perform various analyses.\u003c/p\u003e\n\u003ch3\u003e(3a) Example: Content Size Statistics\u003c/h3\u003e\n\u003cp\u003eLet\u0026rsquo;s compute some statistics about the sizes of content being returned by the web server. In particular, we\u0026rsquo;d like to know what are the average, minimum, and maximum content sizes.\u003c/p\u003e\n\u003cp\u003eWe can compute the statistics by calling \u003ccode\u003e.describe()\u003c/code\u003e on the \u003ccode\u003econtent_size\u003c/code\u003e column of \u003ccode\u003elogs_df\u003c/code\u003e. 
The \u003ccode\u003e.describe()\u003c/code\u003e function returns the count, mean, stddev, min, and max of a given column.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857850_-1282084411", + "id": "20160721-213338_215734708", + "dateCreated": "2020-04-16 12:07:37.850", + "dateStarted": "2020-04-16 12:08:48.414", + "dateFinished": "2020-04-16 12:08:48.423", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n# Calculate statistics based on the content size.\ncontent_size_summary_df \u003d logs_df.describe([\u0027content_size\u0027])\ncontent_size_summary_df.show()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:48.513", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+-------+------------------+\n|summary| content_size|\n+-------+------------------+\n| count| 1043177|\n| mean|17531.555702435926|\n| stddev| 68561.9990626423|\n| min| 0|\n| max| 3421948|\n+-------+------------------+\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857851_1700381734", + "id": "20160718-202001_878461975", + "dateCreated": "2020-04-16 12:07:37.851", + "dateStarted": "2020-04-16 12:08:48.746", + "dateFinished": "2020-04-16 12:08:59.411", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nAlternatively, we can use SQL to directly calculate these statistics. 
You can explore the many useful functions within the `pyspark.sql.functions` module in the [documentation](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions).\n\nAfter we apply the `.agg()` function, we call `.first()` to extract the first value, which is equivalent to `.take(1)[0]`.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:59.457", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eAlternatively, we can use SQL to directly calculate these statistics. You can explore the many useful functions within the \u003ccode\u003epyspark.sql.functions\u003c/code\u003e module in the \u003ca href\u003d\"https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions\"\u003edocumentation\u003c/a\u003e.\u003c/p\u003e\n\u003cp\u003eAfter we apply the \u003ccode\u003e.agg()\u003c/code\u003e function, we call \u003ccode\u003e.first()\u003c/code\u003e to extract the first value, which is equivalent to \u003ccode\u003e.take(1)[0]\u003c/code\u003e.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857851_875068948", + "id": "20160721-213401_647137257", + "dateCreated": "2020-04-16 12:07:37.851", + "dateStarted": "2020-04-16 12:08:59.745", + "dateFinished": "2020-04-16 12:08:59.756", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nfrom pyspark.sql import functions as sqlFunctions\n\nstats \u003d (logs_df\n 
.agg(sqlFunctions.min(logs_df[\u0027content_size\u0027]),\n sqlFunctions.avg(logs_df[\u0027content_size\u0027]),\n sqlFunctions.max(logs_df[\u0027content_size\u0027]))\n .first())\n\nprint \u0027Using SQL functions:\u0027\nprint \u0027Content Size Avg: %.02f Min: %.02f; Max: %.02f\u0027 % (stats[\u0027avg(content_size)\u0027], stats[\u0027min(content_size)\u0027], stats[\u0027max(content_size)\u0027])\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:08:59.842", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Using SQL functions:\nContent Size Avg: 17531.56 Min: 0.00; Max: 3421948.00\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857852_-1363691512", + "id": "20160718-202032_221945837", + "dateCreated": "2020-04-16 12:07:37.852", + "dateStarted": "2020-04-16 12:09:00.042", + "dateFinished": "2020-04-16 12:09:00.254", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (3b) Example: HTTP Status Analysis\n\nNext, let\u0027s look at the status values that appear in the log. We want to know which status values appear in the data and how many times. 
We again start with `logs_df`, then group by the `status` column, apply the `.count()` aggregation function, and sort by the `status` column.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:00.324", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(3b) Example: HTTP Status Analysis\u003c/h3\u003e\n\u003cp\u003eNext, let\u0026rsquo;s look at the status values that appear in the log. We want to know which status values appear in the data and how many times. We again start with \u003ccode\u003elogs_df\u003c/code\u003e, then group by the \u003ccode\u003estatus\u003c/code\u003e column, apply the \u003ccode\u003e.count()\u003c/code\u003e aggregation function, and sort by the \u003ccode\u003estatus\u003c/code\u003e column.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857852_-1140552962", + "id": "20160721-213426_1090267607", + "dateCreated": "2020-04-16 12:07:37.852", + "dateStarted": "2020-04-16 12:09:00.576", + "dateFinished": "2020-04-16 12:09:00.585", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nstatus_to_count_df \u003d(logs_df\n .groupBy(\u0027status\u0027)\n .count()\n .sort(\u0027status\u0027)\n .cache())\n\nstatus_to_count_length \u003d status_to_count_df.count()\nprint \u0027Found %d response codes\u0027 % status_to_count_length\nstatus_to_count_df.show()\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:00.675", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": 
"ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Found 7 response codes\n+------+------+\n|status| count|\n+------+------+\n| 200|940847|\n| 302| 16244|\n| 304| 79824|\n| 403| 58|\n| 404| 6185|\n| 500| 2|\n| 501| 17|\n+------+------+\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857853_957464334", + "id": "20160718-202106_1792027802", + "dateCreated": "2020-04-16 12:07:37.853", + "dateStarted": "2020-04-16 12:09:00.921", + "dateFinished": "2020-04-16 12:09:02.823", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nassert status_to_count_length \u003d\u003d 7\nassert status_to_count_df.take(100) \u003d\u003d [(200, 940847), (302, 16244), (304, 79824), (403, 58), (404, 6185), (500, 2), (501, 17)]\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:02.922", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857854_1657997585", + "id": "20160718-203631_1933692996", + "dateCreated": "2020-04-16 12:07:37.854", + "dateStarted": "2020-04-16 12:09:03.126", + "dateFinished": "2020-04-16 12:09:03.254", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (3c) Example: Status Graphing\n\nNow, let\u0027s visualize the results from the last example. We can seamlessly switch over to the \u0027%sql\u0027 interpreter to show a bar chart of the count for each response code. You can see that this is not a very effective plot. 
Due to the large number of \u0027200\u0027 codes, it is very hard to see the relative number of the others. We can alleviate this by taking the logarithm of the count, adding that as a column to our DataFrame and displaying the result.\n\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:03.326", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(3c) Example: Status Graphing\u003c/h3\u003e\n\u003cp\u003eNow, let\u0026rsquo;s visualize the results from the last example. We can seamlessly switch over to the \u0026lsquo;%sql\u0026rsquo; interpreter to show a bar chart of the count for each response code. You can see that this is not a very effective plot. Due to the large number of \u0026lsquo;200\u0026rsquo; codes, it is very hard to see the relative number of the others. 
We can alleviate this by taking the logarithm of the count, adding that as a column to our DataFrame and displaying the result.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857854_-1549449830", + "id": "20160721-213602_558712068", + "dateCreated": "2020-04-16 12:07:37.854", + "dateStarted": "2020-04-16 12:09:03.546", + "dateFinished": "2020-04-16 12:09:03.555", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nlog_status_to_count_df \u003d status_to_count_df.withColumn(\u0027log_count\u0027, sqlFunctions.log(status_to_count_df[\u0027count\u0027]))\nlog_status_to_count_df.show()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:03.646", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+------+------+------------------+\n|status| count| log_count|\n+------+------+------------------+\n| 200|940847| 13.75453581236166|\n| 302| 16244| 9.69547888880619|\n| 304| 79824|11.287579490100818|\n| 403| 58| 4.060443010546419|\n| 404| 6185| 8.729882284826589|\n| 500| 2|0.6931471805599453|\n| 501| 17| 2.833213344056216|\n+------+------+------------------+\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857855_-1548276301", + "id": "20160718-203658_1473188241", + "dateCreated": "2020-04-16 12:07:37.855", + "dateStarted": "2020-04-16 12:09:03.837", + "dateFinished": "2020-04-16 12:09:04.005", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nNext, we\u0027ll register the contents of the DataFrame as a temp table, backed by Hive metastore, so we can write sql queries 
against the data. \n\nAfter running the SQL \u0027select\u0027 cell below, choose one or more of the display options available, and then open the \"settings\" tab and drag `status` to the key entry field and drag `log_count` to the value entry field. See the diagram, below, for an example.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:04.037", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eNext, we\u0026rsquo;ll register the contents of the DataFrame as a temp table, backed by Hive metastore, so we can write sql queries against the data. \u003c/p\u003e\n\u003cp\u003eAfter running the SQL \u0026lsquo;select\u0026rsquo; cell below, choose one or more of the display options available, and then open the \u0026ldquo;settings\u0026rdquo; tab and drag \u003ccode\u003estatus\u003c/code\u003e to the key entry field and drag \u003ccode\u003elog_count\u003c/code\u003e to the value entry field. 
See the diagram, below, for an example.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857881_-1550898881", + "id": "20160722-191107_1130696257", + "dateCreated": "2020-04-16 12:07:37.882", + "dateStarted": "2020-04-16 12:09:04.287", + "dateFinished": "2020-04-16 12:09:04.297", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nprint log_status_to_count_df.columns\nlog_status_to_count_df.registerTempTable(\"logstatus\")\nprint sqlContext.read.table(\u0027logstatus\u0027).dtypes", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:04.386", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "[\u0027status\u0027, \u0027count\u0027, \u0027log_count\u0027]\n[(\u0027status\u0027, \u0027int\u0027), (\u0027count\u0027, \u0027bigint\u0027), (\u0027log_count\u0027, \u0027double\u0027)]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857883_-429067246", + "id": "20160722-154558_417576869", + "dateCreated": "2020-04-16 12:07:37.883", + "dateStarted": "2020-04-16 12:09:04.611", + "dateFinished": "2020-04-16 12:09:04.643", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\n\nselect status, count, log_count from logstatus", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:04.710", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/sql", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "multiBarChart", + "height": 300.0, + "optionOpen": false, + "keys": [ + { + "name": "status", + "index": 0.0, + "aggr": "sum" + } + ], + 
"values": [ + { + "name": "log_count", + "index": 2.0, + "aggr": "sum" + } + ], + "groups": [], + "scatter": { + "xAxis": { + "name": "status", + "index": 0.0, + "aggr": "sum" + } + }, + "setting": { + "multiBarChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + } + }, + "commonSetting": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TABLE", + "data": "status\tcount\tlog_count\n200\t940847\t13.75453581236166\n302\t16244\t9.69547888880619\n304\t79824\t11.287579490100818\n403\t58\t4.060443010546419\n404\t6185\t8.729882284826589\n500\t2\t0.6931471805599453\n501\t17\t2.833213344056216\n" + }, + { + "type": "TEXT", + "data": "" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857884_290082634", + "id": "20160722-154539_189658824", + "dateCreated": "2020-04-16 12:07:37.884", + "dateStarted": "2020-04-16 12:09:04.940", + "dateFinished": "2020-04-16 12:09:05.102", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (3d) Example: Frequent Hosts\n\nLet\u0027s look at hosts that have accessed the server frequently (e.g., more than ten times). As with the response code analysis in (3b), we create a new DataFrame by grouping `logs_df` by the \u0027host\u0027 column and aggregating by count.\n\nWe then filter the result based on the count of accesses by each host being greater than ten. 
Then, we select the \u0027host\u0027 column and show 20 elements from the result.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:05.140", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(3d) Example: Frequent Hosts\u003c/h3\u003e\n\u003cp\u003eLet\u0026rsquo;s look at hosts that have accessed the server frequently (e.g., more than ten times). As with the response code analysis in (3b), we create a new DataFrame by grouping \u003ccode\u003elogs_df\u003c/code\u003e by the \u0026lsquo;host\u0026rsquo; column and aggregating by count.\u003c/p\u003e\n\u003cp\u003eWe then filter the result based on the count of accesses by each host being greater than ten. 
Then, we select the \u0026lsquo;host\u0026rsquo; column and show 20 elements from the result.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857885_-314140241", + "id": "20160722-191832_137511351", + "dateCreated": "2020-04-16 12:07:37.885", + "dateStarted": "2020-04-16 12:09:05.407", + "dateFinished": "2020-04-16 12:09:05.415", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n# Any hosts that has accessed the server more than 10 times.\nhost_sum_df \u003d(logs_df\n .groupBy(\u0027host\u0027)\n .count())\n\nhost_more_than_10_df \u003d (host_sum_df\n .filter(host_sum_df[\u0027count\u0027] \u003e 10)\n .select(host_sum_df[\u0027host\u0027]))\n\nprint \u0027Any 20 hosts that have accessed more then 10 times:\\n\u0027\nhost_more_than_10_df.show(truncate\u003dFalse)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:05.507", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Any 20 hosts that have accessed more then 10 times:\n\n+-------------------------------+\n|host |\n+-------------------------------+\n|prakinf2.prakinf.tu-ilmenau.de |\n|alpha2.csd.uwm.edu |\n|cjc07992.slip.digex.net |\n|n1377004.ksc.nasa.gov |\n|163.205.2.134 |\n|huge.oso.chalmers.se |\n|163.205.44.27 |\n|shark.ksc.nasa.gov |\n|etc5.etechcorp.com |\n|dd07-029.compuserve.com |\n|131.182.101.161 |\n|134.95.100.201 |\n|vab08.larc.nasa.gov |\n|ip11.iac.net |\n|ad11-012.compuserve.com |\n|ad053.du.pipex.com |\n|204.184.6.19 |\n|p8.denver1.dialup.csn.net |\n|gate2.gdc.com |\n|alcott.acsu.buffalo.edu 
|\n+-------------------------------+\nonly showing top 20 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857885_1030962059", + "id": "20160718-203737_92678960", + "dateCreated": "2020-04-16 12:07:37.885", + "dateStarted": "2020-04-16 12:09:05.722", + "dateFinished": "2020-04-16 12:09:06.753", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (3e) Example: Visualizing Paths\n\nNow, let\u0027s visualize the number of hits to paths (URIs) in the log. To perform this task, we start with our `logs_df` and group by the `path` column, aggregate by count, and sort in descending order.\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:06.823", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(3e) Example: Visualizing Paths\u003c/h3\u003e\n\u003cp\u003eNow, let\u0026rsquo;s visualize the number of hits to paths (URIs) in the log. 
To perform this task, we start with our \u003ccode\u003elogs_df\u003c/code\u003e and group by the \u003ccode\u003epath\u003c/code\u003e column, aggregate by count, and sort in descending order.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857886_891969335", + "id": "20160722-191924_1596254266", + "dateCreated": "2020-04-16 12:07:37.886", + "dateStarted": "2020-04-16 12:09:07.072", + "dateFinished": "2020-04-16 12:09:07.079", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\npaths_df \u003d (logs_df\n .groupBy(\u0027path\u0027)\n .count()\n .sort(\u0027count\u0027, ascending\u003dFalse))\n \n#paths_df.printSchema()\n#paths_df.collect()\n\npaths_counts \u003d (paths_df\n .select(\u0027path\u0027, \u0027count\u0027)\n .rdd #mukul \n .map(lambda r: (r[0], r[1]))\n .collect())\n\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:07.171", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857887_2064835322", + "id": "20160718-211843_299092782", + "dateCreated": "2020-04-16 12:07:37.887", + "dateStarted": "2020-04-16 12:09:07.387", + "dateFinished": "2020-04-16 12:09:10.457", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nOptional: We can extract the paths and the counts, and unpack the resulting list of `Rows` using a `map` function and `lambda` expression. 
This would allow us to find particular paths, such as those that describe the fateful Apollo 13 mission.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:10.490", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eOptional: We can extract the paths and the counts, and unpack the resulting list of \u003ccode\u003eRows\u003c/code\u003e using a \u003ccode\u003emap\u003c/code\u003e function and \u003ccode\u003elambda\u003c/code\u003e expression. This would allow us to find particular paths, such as those that describe the fateful Apollo 13 mission.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857887_1375134033", + "id": "20160722-193401_1732241902", + "dateCreated": "2020-04-16 12:07:37.887", + "dateStarted": "2020-04-16 12:09:10.785", + "dateFinished": "2020-04-16 12:09:10.792", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\npaths, counts \u003d zip(*paths_counts)\n\nfor p in paths:\n if p.startswith(\u0027/history/apollo/apollo13\u0027):\n print p", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:10.892", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + 
"type": "TEXT", + "data": "/history/apollo/apollo13/apollo-13.html\n/history/apollo/apollo13/apollo-13-info.html\n/history/apollo/apollo13/movies/apo13home.mpg\n/history/apollo/apollo13\n/history/apollo/apollo13/apollo13.html\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857888_-1020959682", + "id": "20160722-193156_1307759391", + "dateCreated": "2020-04-16 12:07:37.888", + "dateStarted": "2020-04-16 12:09:11.172", + "dateFinished": "2020-04-16 12:09:11.189", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (3f) Example: Top Paths\n\nFor the final example, we\u0027ll find the top paths (URIs) in the log. Because we sorted `paths_df` for plotting, all we need to do is call `.show()` and pass in `n\u003d10` and `truncate\u003dFalse` as the parameters to show the top ten paths without truncating.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:11.272", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(3f) Example: Top Paths\u003c/h3\u003e\n\u003cp\u003eFor the final example, we\u0026rsquo;ll find the top paths (URIs) in the log. 
Because we sorted \u003ccode\u003epaths_df\u003c/code\u003e for plotting, all we need to do is call \u003ccode\u003e.show()\u003c/code\u003e and pass in \u003ccode\u003en\u003d10\u003c/code\u003e and \u003ccode\u003etruncate\u003dFalse\u003c/code\u003e as the parameters to show the top ten paths without truncating.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857888_-1210227761", + "id": "20160722-193447_577294938", + "dateCreated": "2020-04-16 12:07:37.888", + "dateStarted": "2020-04-16 12:09:11.485", + "dateFinished": "2020-04-16 12:09:11.491", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nprint \u0027Top Ten Paths:\u0027\npaths_df.show(n\u003d10, truncate\u003dFalse)\n\nexpected \u003d [\n (u\u0027/images/NASA-logosmall.gif\u0027, 59666),\n (u\u0027/images/KSC-logosmall.gif\u0027, 50420),\n (u\u0027/images/MOSAIC-logosmall.gif\u0027, 43831),\n (u\u0027/images/USA-logosmall.gif\u0027, 43604),\n (u\u0027/images/WORLD-logosmall.gif\u0027, 43217),\n (u\u0027/images/ksclogo-medium.gif\u0027, 41267),\n (u\u0027/ksc.html\u0027, 28536),\n (u\u0027/history/apollo/images/apollo-logo1.gif\u0027, 26766),\n (u\u0027/images/launch-logo.gif\u0027, 24742),\n (u\u0027/\u0027, 20173)\n]\nassert paths_df.take(10) \u003d\u003d expected, \u0027incorrect Top Ten Paths\u0027\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:11.585", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Top Ten Paths:\n+---------------------------------------+-----+\n|path 
|count|\n+---------------------------------------+-----+\n|/images/NASA-logosmall.gif |59666|\n|/images/KSC-logosmall.gif |50420|\n|/images/MOSAIC-logosmall.gif |43831|\n|/images/USA-logosmall.gif |43604|\n|/images/WORLD-logosmall.gif |43217|\n|/images/ksclogo-medium.gif |41267|\n|/ksc.html |28536|\n|/history/apollo/images/apollo-logo1.gif|26766|\n|/images/launch-logo.gif |24742|\n|/ |20173|\n+---------------------------------------+-----+\nonly showing top 10 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857889_1119520986", + "id": "20160718-211914_242228510", + "dateCreated": "2020-04-16 12:07:37.889", + "dateStarted": "2020-04-16 12:09:11.822", + "dateFinished": "2020-04-16 12:09:13.096", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n## Part 4: Analyzing Web Server Log File\n\nNow it is your turn to perform analyses on the web server log files.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:13.123", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch2\u003ePart 4: Analyzing Web Server Log File\u003c/h2\u003e\n\u003cp\u003eNow it is your turn to perform analyses on the web server log files.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857890_527870634", + "id": "20160722-193742_1517160612", + "dateCreated": "2020-04-16 12:07:37.890", + "dateStarted": "2020-04-16 12:09:13.408", + "dateFinished": "2020-04-16 12:09:13.414", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": 
"%md\n\n**(4a) Exercise: Top Ten Error Paths**\n\nWhat are the top ten paths which did not have return code 200? Create a sorted list containing the paths and the number of times that they were accessed with a non-200 return code and show the top ten.\n\nThink about the steps that you need to perform to determine which paths did not have a 200 return code, how you will uniquely count those paths and sort the list.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:13.507", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e\u003cstrong\u003e(4a) Exercise: Top Ten Error Paths\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eWhat are the top ten paths which did not have return code 200? 
Create a sorted list containing the paths and the number of times that they were accessed with a non-200 return code and show the top ten.\u003c/p\u003e\n\u003cp\u003eThink about the steps that you need to perform to determine which paths did not have a 200 return code, how you will uniquely count those paths and sort the list.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857890_-81515652", + "id": "20160722-193809_1481595131", + "dateCreated": "2020-04-16 12:07:37.890", + "dateStarted": "2020-04-16 12:09:13.717", + "dateFinished": "2020-04-16 12:09:13.726", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n# DataFrame containing all accesses that did not return a code 200\nfrom pyspark.sql.functions import desc, log\n\nnot200DF \u003d logs_df.filter(logs_df[\u0027status\u0027] !\u003d 200)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:13.816", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857891_-337527593", + "id": "20160718-212016_221401993", + "dateCreated": "2020-04-16 12:07:37.891", + "dateStarted": "2020-04-16 12:09:14.100", + "dateFinished": "2020-04-16 12:09:14.114", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n# Sorted DataFrame containing all paths and the number of times they were accessed with non-200 return code\nlogs_sum_df \u003d not200DF.groupBy(\u0027path\u0027).count().sort(\u0027count\u0027, ascending\u003dFalse)\n\nprint \u0027Top Ten failed URLs:\u0027\nlogs_sum_df.show(10, False)\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:14.199", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 
9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Top Ten failed URLs:\n+---------------------------------------+-----+\n|path |count|\n+---------------------------------------+-----+\n|/images/NASA-logosmall.gif |8761 |\n|/images/KSC-logosmall.gif |7236 |\n|/images/MOSAIC-logosmall.gif |5197 |\n|/images/USA-logosmall.gif |5157 |\n|/images/WORLD-logosmall.gif |5020 |\n|/images/ksclogo-medium.gif |4728 |\n|/history/apollo/images/apollo-logo1.gif|2907 |\n|/images/launch-logo.gif |2811 |\n|/ |2199 |\n|/images/ksclogosmall.gif |1622 |\n+---------------------------------------+-----+\nonly showing top 10 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857892_112062135", + "id": "20160718-212055_410692090", + "dateCreated": "2020-04-16 12:07:37.892", + "dateStarted": "2020-04-16 12:09:14.527", + "dateFinished": "2020-04-16 12:09:15.377", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\ntop_10_err_urls \u003d [(row[0], row[1]) for row in logs_sum_df.take(10)]\ntop_10_err_expected \u003d [\n (u\u0027/images/NASA-logosmall.gif\u0027, 8761),\n (u\u0027/images/KSC-logosmall.gif\u0027, 7236),\n (u\u0027/images/MOSAIC-logosmall.gif\u0027, 5197),\n (u\u0027/images/USA-logosmall.gif\u0027, 5157),\n (u\u0027/images/WORLD-logosmall.gif\u0027, 5020),\n (u\u0027/images/ksclogo-medium.gif\u0027, 4728),\n (u\u0027/history/apollo/images/apollo-logo1.gif\u0027, 2907),\n (u\u0027/images/launch-logo.gif\u0027, 2811),\n (u\u0027/\u0027, 2199),\n (u\u0027/images/ksclogosmall.gif\u0027, 1622)\n]\n\nassert logs_sum_df.count() \u003d\u003d 7675, \u0027incorrect count for logs_sum_df\u0027\nassert top_10_err_urls \u003d\u003d top_10_err_expected, \u0027incorrect 
Top Ten failed URLs\u0027\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:15.428", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857892_1235190427", + "id": "20160718-212113_968538624", + "dateCreated": "2020-04-16 12:07:37.892", + "dateStarted": "2020-04-16 12:09:15.683", + "dateFinished": "2020-04-16 12:09:17.615", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (4b) Exercise: Number of Unique Hosts\n\nHow many unique hosts are there in the entire log?\n\nThere are multiple ways to find this. Try to find a more optimal way than grouping by \u0027host\u0027.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:17.684", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(4b) Exercise: Number of Unique Hosts\u003c/h3\u003e\n\u003cp\u003eHow many unique hosts are there in the entire log?\u003c/p\u003e\n\u003cp\u003eThere are multiple ways to find this. 
Try to find a more optimal way than grouping by \u0026lsquo;host\u0026rsquo;.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857893_1300321604", + "id": "20160722-194321_2111382895", + "dateCreated": "2020-04-16 12:07:37.893", + "dateStarted": "2020-04-16 12:09:18.002", + "dateFinished": "2020-04-16 12:09:18.009", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nunique_host_count \u003d logs_df.select(\u0027host\u0027).distinct().count()\nprint \u0027Unique hosts: {0}\u0027.format(unique_host_count)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:18.102", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Unique hosts: 54507\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857894_1046390480", + "id": "20160718-212129_286712384", + "dateCreated": "2020-04-16 12:07:37.894", + "dateStarted": "2020-04-16 12:09:18.355", + "dateFinished": "2020-04-16 12:09:19.230", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nassert unique_host_count \u003d\u003d 54507, \u0027incorrect unique_host_count\u0027", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:19.255", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857895_-1894214203", + "id": "20160718-212331_1613083931", + "dateCreated": "2020-04-16 
12:07:37.895", + "dateStarted": "2020-04-16 12:09:19.549", + "dateFinished": "2020-04-16 12:09:19.555", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (4c) Exercise: Number of Unique Daily Hosts\n\nFor an advanced exercise, let\u0027s determine the number of unique hosts in the entire log on a day-by-day basis. This computation will give us counts of the number of unique daily hosts. We\u0027d like a DataFrame sorted by increasing day of the month which includes the day of the month and the associated number of unique hosts for that day. Make sure you cache the resulting DataFrame `daily_hosts_df` so that we can reuse it in the next exercise.\n\nThink about the steps that you need to perform to count the number of different hosts that make requests *each* day.\n*Since the log only covers a single month, you can ignore the month.* You may want to use the [`dayofmonth` function](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.dayofmonth) in the `pyspark.sql.functions` module.\n\n**Description of each variable**\n\n**`day_to_host_pair_df`**\n\nA DataFrame with two columns\n\n | column | explanation |\n | ------ | -------------------- |\n | `host` | the host name |\n | `day` | the day of the month |\n\nThere will be one row in this DataFrame for each row in `logs_df`. Essentially, you\u0027re just trimming and transforming each row of `logs_df`. 
For example, for this row in `logs_df`:\n\n```\ngw1.att.com - - [23/Aug/1995:00:03:53 -0400] \"GET /shuttle/missions/sts-73/news HTTP/1.0\" 302 -\n```\n\nyour `day_to_host_pair_df` should have:\n\n```\ngw1.att.com 23\n```\n\n**`day_group_hosts_df`**\n\nThis DataFrame has the same columns as `day_to_host_pair_df`, but with duplicate (`day`, `host`) rows removed.\n\n**`daily_hosts_df`**\n\nA DataFrame with two columns:\n\n | column | explanation |\n | ------- | -------------------------------------------------- |\n | `day` | the day of the month |\n | `count` | the number of unique requesting hosts for that day |\n \n ", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:19.649", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(4c) Exercise: Number of Unique Daily Hosts\u003c/h3\u003e\n\u003cp\u003eFor an advanced exercise, let\u0026rsquo;s determine the number of unique hosts in the entire log on a day-by-day basis. This computation will give us counts of the number of unique daily hosts. We\u0026rsquo;d like a DataFrame sorted by increasing day of the month which includes the day of the month and the associated number of unique hosts for that day. 
Make sure you cache the resulting DataFrame \u003ccode\u003edaily_hosts_df\u003c/code\u003e so that we can reuse it in the next exercise.\u003c/p\u003e\n\u003cp\u003eThink about the steps that you need to perform to count the number of different hosts that make requests \u003cem\u003eeach\u003c/em\u003e day.\u003cbr/\u003e\u003cem\u003eSince the log only covers a single month, you can ignore the month.\u003c/em\u003e You may want to use the \u003ca href\u003d\"https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.dayofmonth\"\u003e\u003ccode\u003edayofmonth\u003c/code\u003e function\u003c/a\u003e in the \u003ccode\u003epyspark.sql.functions\u003c/code\u003e module.\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003eDescription of each variable\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003e\u003ccode\u003eday_to_host_pair_df\u003c/code\u003e\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eA DataFrame with two columns\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e| column | explanation |\n| ------ | -------------------- |\n| `host` | the host name |\n| `day` | the day of the month |\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eThere will be one row in this DataFrame for each row in \u003ccode\u003elogs_df\u003c/code\u003e. Essentially, you\u0026rsquo;re just trimming and transforming each row of \u003ccode\u003elogs_df\u003c/code\u003e. 
For example, for this row in \u003ccode\u003elogs_df\u003c/code\u003e:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003egw1.att.com - - [23/Aug/1995:00:03:53 -0400] \u0026quot;GET /shuttle/missions/sts-73/news HTTP/1.0\u0026quot; 302 -\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eyour \u003ccode\u003eday_to_host_pair_df\u003c/code\u003e should have:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003egw1.att.com 23\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003e\u003cstrong\u003e\u003ccode\u003eday_group_hosts_df\u003c/code\u003e\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eThis DataFrame has the same columns as \u003ccode\u003eday_to_host_pair_df\u003c/code\u003e, but with duplicate (\u003ccode\u003eday\u003c/code\u003e, \u003ccode\u003ehost\u003c/code\u003e) rows removed.\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003e\u003ccode\u003edaily_hosts_df\u003c/code\u003e\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eA DataFrame with two columns:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e| column | explanation |\n| ------- | -------------------------------------------------- |\n| `day` | the day of the month |\n| `count` | the number of unique requesting hosts for that day |\n\u003c/code\u003e\u003c/pre\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857896_-1121238663", + "id": "20160722-194417_206107074", + "dateCreated": "2020-04-16 12:07:37.896", + "dateStarted": "2020-04-16 12:09:19.860", + "dateFinished": "2020-04-16 12:09:19.876", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nfrom pyspark.sql.functions import dayofmonth\n\nday_to_host_pair_df \u003d logs_df.select(\u0027host\u0027, dayofmonth(\u0027time\u0027).alias(\u0027day\u0027))\n\nday_group_hosts_df \u003d day_to_host_pair_df.distinct()\n\ndaily_hosts_df \u003d day_group_hosts_df.groupBy(\u0027day\u0027).count()\ndaily_hosts_df.cache()\n\nprint \u0027Unique hosts per day:\u0027\ndaily_hosts_df.show(30, False)\n\n", + "user": 
"anonymous", + "dateUpdated": "2020-04-16 12:09:19.959", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Unique hosts per day:\n+---+-----+\n|day|count|\n+---+-----+\n|12 |2864 |\n|22 |4456 |\n|1 |2582 |\n|13 |2650 |\n|6 |2537 |\n|16 |4340 |\n|3 |3222 |\n|20 |2560 |\n|5 |2502 |\n|19 |2550 |\n|15 |4214 |\n|9 |4317 |\n|17 |4385 |\n|4 |4190 |\n|8 |4406 |\n|7 |4106 |\n|10 |4523 |\n|21 |4134 |\n|11 |4346 |\n|14 |4454 |\n|18 |4168 |\n+---+-----+\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857896_-1288196643", + "id": "20160718-212353_745338793", + "dateCreated": "2020-04-16 12:07:37.897", + "dateStarted": "2020-04-16 12:09:20.174", + "dateFinished": "2020-04-16 12:09:22.457", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nLet\u0027s make sure our data matches known good values.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:22.476", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eLet\u0026rsquo;s make sure our data matches known good values.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857897_-791752752", + "id": 
"20160723-180050_1724926025", + "dateCreated": "2020-04-16 12:07:37.897", + "dateStarted": "2020-04-16 12:09:22.782", + "dateFinished": "2020-04-16 12:09:22.792", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\ndaily_hosts_list \u003d (daily_hosts_df\n .rdd #mukul: https://forums.wikitechy.com/question/attributeerror-dataframe-object-has-no-attribute-map/\n .map(lambda r: (r[0], r[1]))\n .take(30))\n \n#print daily_hosts_list\n \nassert day_to_host_pair_df.count() \u003d\u003d total_log_entries, \u0027incorrect row count for day_to_host_pair_df\u0027\nassert daily_hosts_df.count() \u003d\u003d 21, \u0027incorrect daily_hosts_df.count()\u0027\n#assert daily_hosts_list \u003d\u003d [(1, 2582), (3, 3222), (4, 4190), (5, 2502), (6, 2537), (7, 4106), (8, 4406), (9, 4317), (10, 4523), (11, 4346), (12, 2864), (13, 2650), (14, 4454), (15, 4214), (16, 4340), (17, 4385), (18, 4168), (19, 2550), (20, 2560), (21, 4134), (22, 4456)], \u0027incorrect daily_hosts_df\u0027 #mukul: commented out - it fails, presumably because daily_hosts_df is not sorted by day\nassert daily_hosts_df.is_cached \u003d\u003d True, \u0027incorrect daily_hosts_df.is_cached\u0027", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:22.882", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857898_-913580940", + "id": "20160718-212445_2056034580", + "dateCreated": "2020-04-16 12:07:37.898", + "dateStarted": "2020-04-16 12:09:23.091", + "dateFinished": "2020-04-16 12:09:26.275", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (4d) Exercise: Visualizing the Number of Unique Daily Hosts\n\nUsing the results from the previous exercise, we will use built-in graphing to plot a line graph of the unique hosts requests by day. 
We need a list of days called `days_with_hosts` and a list of the number of unique hosts for each corresponding day called `hosts`.\n\n**WARNING**: Simply calling `collect()` on your transformed DataFrame won\u0027t work, because `collect()` returns a list of Spark SQL `Row` objects. You must _extract_ the appropriate column values from the `Row` objects. Hint: A loop will help.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:26.294", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(4d) Exercise: Visualizing the Number of Unique Daily Hosts\u003c/h3\u003e\n\u003cp\u003eUsing the results from the previous exercise, we will use built-in graphing to plot a line graph of the unique hosts requests by day. We need a list of days called \u003ccode\u003edays_with_hosts\u003c/code\u003e and a list of the number of unique hosts for each corresponding day called \u003ccode\u003ehosts\u003c/code\u003e.\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003eWARNING\u003c/strong\u003e: Simply calling \u003ccode\u003ecollect()\u003c/code\u003e on your transformed DataFrame won\u0026rsquo;t work, because \u003ccode\u003ecollect()\u003c/code\u003e returns a list of Spark SQL \u003ccode\u003eRow\u003c/code\u003e objects. You must \u003cem\u003eextract\u003c/em\u003e the appropriate column values from the \u003ccode\u003eRow\u003c/code\u003e objects. 
Hint: A loop will help.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857898_1678893210", + "id": "20160722-194707_1722235998", + "dateCreated": "2020-04-16 12:07:37.898", + "dateStarted": "2020-04-16 12:09:26.587", + "dateFinished": "2020-04-16 12:09:26.598", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\ndays_with_hosts \u003d []\nhosts \u003d []\nfor row in daily_hosts_df.collect():\n days_with_hosts.append(row[0])\n hosts.append(row[1])\n\nprint (days_with_hosts)\nprint (hosts)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:26.687", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "[12, 22, 1, 13, 6, 16, 3, 20, 5, 19, 15, 9, 17, 4, 8, 7, 10, 21, 11, 14, 18]\n[2864, 4456, 2582, 2650, 2537, 4340, 3222, 2560, 2502, 2550, 4214, 4317, 4385, 4190, 4406, 4106, 4523, 4134, 4346, 4454, 4168]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857899_2008991165", + "id": "20160718-212614_81729699", + "dateCreated": "2020-04-16 12:07:37.899", + "dateStarted": "2020-04-16 12:09:26.931", + "dateFinished": "2020-04-16 12:09:27.208", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\ntest_days \u003d range(1, 23)\ntest_days.remove(2)\n\n#assert days_with_hosts \u003d\u003d test_days, \u0027incorrect days\u0027\n#assert hosts \u003d\u003d [2582, 3222, 4190, 2502, 2537, 4106, 4406, 4317, 4523, 4346, 2864, 2650, 4454, 4214, 4340, 4385, 4168, 2550, 2560, 4134, 4456], \u0027incorrect hosts\u0027 #mukul: commented it as it fails\n", + "user": "anonymous", + 
"dateUpdated": "2020-04-16 12:09:27.231", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857900_-1441774120", + "id": "20160718-212842_1303069643", + "dateCreated": "2020-04-16 12:07:37.900", + "dateStarted": "2020-04-16 12:09:27.500", + "dateFinished": "2020-04-16 12:09:27.506", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nNow, we can write out the `daily_hosts_df` DataFrame as a temp table; then we can write a SQL select statement to plot a line or bar graph of the unique hosts requests by day.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:27.600", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eNow, we can write out the \u003ccode\u003edaily_hosts_df\u003c/code\u003e DataFrame as a temp table; then we can write a SQL select statement to plot a line or bar graph of the unique hosts requests by day.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857900_530687414", + "id": "20160722-194828_1547116231", + "dateCreated": "2020-04-16 12:07:37.900", + "dateStarted": "2020-04-16 12:09:27.875", + "dateFinished": "2020-04-16 12:09:27.882", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": 
"%pyspark\n\ndaily_hosts_df.registerTempTable(\"daily_hosts\")\nprint daily_hosts_df.count()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:27.975", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "21\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857901_-1413115668", + "id": "20160722-195054_1947786482", + "dateCreated": "2020-04-16 12:07:37.901", + "dateStarted": "2020-04-16 12:09:28.186", + "dateFinished": "2020-04-16 12:09:28.550", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nMake sure there is a graph of daily_hosts below.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:28.586", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eMake sure there is a graph of daily_hosts below.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857901_860933543", + "id": "20160723-180641_1565768790", + "dateCreated": "2020-04-16 12:07:37.901", + "dateStarted": "2020-04-16 12:09:28.856", + "dateFinished": "2020-04-16 12:09:28.862", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\n\nselect day, 
count from daily_hosts", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:28.956", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/sql", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "multiBarChart", + "height": 300.0, + "optionOpen": false, + "keys": [ + { + "name": "day", + "index": 0.0, + "aggr": "sum" + } + ], + "values": [ + { + "name": "count", + "index": 1.0, + "aggr": "sum" + } + ], + "groups": [], + "scatter": { + "xAxis": { + "name": "day", + "index": 0.0, + "aggr": "sum" + }, + "yAxis": { + "name": "count", + "index": 1.0, + "aggr": "sum" + } + }, + "setting": { + "multiBarChart": { + "rotate": { + "degree": "-45" + }, + "xLabelStatus": "default" + } + }, + "commonSetting": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TABLE", + "data": "day\tcount\n12\t2864\n22\t4456\n1\t2582\n13\t2650\n6\t2537\n16\t4340\n3\t3222\n20\t2560\n5\t2502\n19\t2550\n15\t4214\n9\t4317\n17\t4385\n4\t4190\n8\t4406\n7\t4106\n10\t4523\n21\t4134\n11\t4346\n14\t4454\n18\t4168\n" + }, + { + "type": "TEXT", + "data": "" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857902_2046289055", + "id": "20160723-180450_514138223", + "dateCreated": "2020-04-16 12:07:37.902", + "dateStarted": "2020-04-16 12:09:29.181", + "dateFinished": "2020-04-16 12:09:29.576", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (4e) Exercise: Average Number of Daily Requests per Host\n\nNext, let\u0027s determine the average number of requests on a day-by-day basis. We\u0027d like a list by increasing day of the month and the associated average number of requests per host for that day. 
Make sure you cache the resulting DataFrame `avg_daily_req_per_host_df` so that we can reuse it in the next exercise.\n\nTo compute the average number of requests per host, find the total number of requests per day (across all hosts) and divide that by the number of unique hosts per day (which we found in part 4c and cached as `daily_hosts_df`).\n\n*Since the log only covers a single month, you can skip checking for the month.*", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:29.581", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(4e) Exercise: Average Number of Daily Requests per Host\u003c/h3\u003e\n\u003cp\u003eNext, let\u0026rsquo;s determine the average number of requests on a day-by-day basis. We\u0026rsquo;d like a list by increasing day of the month and the associated average number of requests per host for that day. 
Make sure you cache the resulting DataFrame \u003ccode\u003eavg_daily_req_per_host_df\u003c/code\u003e so that we can reuse it in the next exercise.\u003c/p\u003e\n\u003cp\u003eTo compute the average number of requests per host, find the total number of requests per day (across all hosts) and divide that by the number of unique hosts per day (which we found in part 4c and cached as \u003ccode\u003edaily_hosts_df\u003c/code\u003e).\u003c/p\u003e\n\u003cp\u003e\u003cem\u003eSince the log only covers a single month, you can skip checking for the month.\u003c/em\u003e\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857903_-1284655413", + "id": "20160723-180636_1447972081", + "dateCreated": "2020-04-16 12:07:37.903", + "dateStarted": "2020-04-16 12:09:29.893", + "dateFinished": "2020-04-16 12:09:29.903", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\ntotal_req_per_day_df_temp \u003d logs_df.select(\u0027host\u0027, dayofmonth(\u0027time\u0027).alias(\u0027day\u0027)).groupBy(\u0027day\u0027).count()\ntotal_req_per_day_df \u003d total_req_per_day_df_temp.withColumnRenamed(\u0027count\u0027, \u0027reqperday\u0027)\n\ntotal_req_per_day_df.show()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:29.992", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+---+---------+\n|day|reqperday|\n+---+---------+\n| 12| 38070|\n| 22| 57758|\n| 1| 33996|\n| 13| 36480|\n| 6| 32416|\n| 16| 56651|\n| 3| 41387|\n| 20| 32963|\n| 5| 31888|\n| 19| 32092|\n| 15| 58845|\n| 9| 60457|\n| 17| 58980|\n| 4| 59554|\n| 8| 60142|\n| 
7| 57355|\n| 10| 61245|\n| 21| 55539|\n| 11| 61242|\n| 14| 59873|\n+---+---------+\nonly showing top 20 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857903_1400241604", + "id": "20160718-213039_1241367186", + "dateCreated": "2020-04-16 12:07:37.903", + "dateStarted": "2020-04-16 12:09:30.217", + "dateFinished": "2020-04-16 12:09:30.915", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nPlease note the `join` operator below; we want to combine `hosts per day` and `requests per day` into the same DataFrame.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:30.917", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003ePlease note the \u003ccode\u003ejoin\u003c/code\u003e operator below; we want to combine \u003ccode\u003ehosts per day\u003c/code\u003e and \u003ccode\u003erequests per day\u003c/code\u003e into the same DataFrame.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857904_1682243097", + "id": "20160723-180829_1442987636", + "dateCreated": "2020-04-16 12:07:37.904", + "dateStarted": "2020-04-16 12:09:31.262", + "dateFinished": "2020-04-16 12:09:31.269", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n# rename column\ndaily_hosts_per_day_df \u003d daily_hosts_df.withColumnRenamed(\u0027count\u0027, \u0027hostperday\u0027)\n\n# perform join operation\navg_daily_req_per_host_df \u003d (\n total_req_per_day_df.join(daily_hosts_per_day_df, \u0027day\u0027, 
\u0027inner\u0027).select(\u0027day\u0027, \u0027hostperday\u0027, \u0027reqperday\u0027)\n )\n\n# print \u0027Data types for data frame: %s\u0027 % avg_daily_req_per_host_df.dtypes\navg_daily_req_per_host_df.show()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:31.362", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+---+----------+---------+\n|day|hostperday|reqperday|\n+---+----------+---------+\n| 12| 2864| 38070|\n| 22| 4456| 57758|\n| 1| 2582| 33996|\n| 13| 2650| 36480|\n| 6| 2537| 32416|\n| 16| 4340| 56651|\n| 3| 3222| 41387|\n| 20| 2560| 32963|\n| 5| 2502| 31888|\n| 19| 2550| 32092|\n| 15| 4214| 58845|\n| 9| 4317| 60457|\n| 17| 4385| 58980|\n| 4| 4190| 59554|\n| 8| 4406| 60142|\n| 7| 4106| 57355|\n| 10| 4523| 61245|\n| 21| 4134| 55539|\n| 11| 4346| 61242|\n| 14| 4454| 59873|\n+---+----------+---------+\nonly showing top 20 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857905_-447197240", + "id": "20160718-213054_1346319099", + "dateCreated": "2020-04-16 12:07:37.905", + "dateStarted": "2020-04-16 12:09:31.574", + "dateFinished": "2020-04-16 12:09:32.943", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\ndaily_req_per_host_list \u003d (\n avg_daily_req_per_host_df.select(\n col(\u0027day\u0027), col(\u0027reqperday\u0027).cast(\u0027float\u0027), col(\u0027hostperday\u0027).cast(\u0027float\u0027)).rdd.map( #mukul: .rdd needed from spark 2.x\n lambda row: (row[0], row[1] / row[2])\n ).collect()\n)\n\nprint (daily_req_per_host_list)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:32.975", + 
"config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "[(12, 13.292597765363128), (22, 12.961849192100539), (1, 13.166537567776917), (13, 13.766037735849057), (6, 12.777296018919984), (16, 13.053225806451612), (3, 12.845127250155183), (20, 12.876171875), (5, 12.745003996802557), (19, 12.585098039215687), (15, 13.964167062173706), (9, 14.00440120454019), (17, 13.450399087799315), (4, 14.213365155131266), (8, 13.650022696323196), (7, 13.968582562104238), (10, 13.540791510059695), (21, 13.434687953555878), (11, 14.091578462954441), (14, 13.442523574315222), (18, 13.494241842610364)]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857905_-464645737", + "id": "20160718-213133_1016933553", + "dateCreated": "2020-04-16 12:07:37.905", + "dateStarted": "2020-04-16 12:09:33.268", + "dateFinished": "2020-04-16 12:09:36.478", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\navg_daily_req_per_host_df \u003d sqlContext.createDataFrame(daily_req_per_host_list, [\u0027day\u0027, \u0027avg_reqs_per_host_per_day\u0027])\n\navg_daily_req_per_host_df.cache()\n\nprint \u0027Average number of daily requests per Hosts is:\\n\u0027\navg_daily_req_per_host_df.show(100)\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:36.570", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + 
"forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Average number of daily requests per Hosts is:\n\n+---+-------------------------+\n|day|avg_reqs_per_host_per_day|\n+---+-------------------------+\n| 12| 13.292597765363128|\n| 22| 12.961849192100539|\n| 1| 13.166537567776917|\n| 13| 13.766037735849057|\n| 6| 12.777296018919984|\n| 16| 13.053225806451612|\n| 3| 12.845127250155183|\n| 20| 12.876171875|\n| 5| 12.745003996802557|\n| 19| 12.585098039215687|\n| 15| 13.964167062173706|\n| 9| 14.00440120454019|\n| 17| 13.450399087799315|\n| 4| 14.213365155131266|\n| 8| 13.650022696323196|\n| 7| 13.968582562104238|\n| 10| 13.540791510059695|\n| 21| 13.434687953555878|\n| 11| 14.091578462954441|\n| 14| 13.442523574315222|\n| 18| 13.494241842610364|\n+---+-------------------------+\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857906_-803683293", + "id": "20160718-212933_1690059352", + "dateCreated": "2020-04-16 12:07:37.906", + "dateStarted": "2020-04-16 12:09:36.791", + "dateFinished": "2020-04-16 12:09:37.131", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n# TEST Average number of daily requests per hosts (4e)\navg_daily_req_per_host_list \u003d (\n avg_daily_req_per_host_df.select(\u0027day\u0027, \n avg_daily_req_per_host_df[\u0027avg_reqs_per_host_per_day\u0027].cast(\u0027integer\u0027).alias(\u0027avg_requests\u0027))\n .collect()\n)\n\nvalues \u003d [(row[0], row[1]) for row in avg_daily_req_per_host_list]\nprint values\n\n#assert values \u003d\u003d [(1, 13), (3, 12), (4, 14), (5, 12), (6, 12), (7, 13), (8, 13), (9, 14), (10, 13), (11, 14), (12, 13), (13, 13), (14, 13), (15, 13), (16, 13), (17, 13), (18, 13), (19, 12), (20, 12), (21, 13), (22, 12)], \u0027incorrect avgDailyReqPerHostDF\u0027 #mukul: commented asserts\n#assert avg_daily_req_per_host_df.is_cached \u003d\u003d True, \u0027incorrect avg_daily_req_per_host_df.is_cached\u0027 #mukul: commented 
asserts\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:37.191", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "[(12, 13), (22, 12), (1, 13), (13, 13), (6, 12), (16, 13), (3, 12), (20, 12), (5, 12), (19, 12), (15, 13), (9, 14), (17, 13), (4, 14), (8, 13), (7, 13), (10, 13), (21, 13), (11, 14), (14, 13), (18, 13)]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857907_-1420273062", + "id": "20160718-213035_1175278862", + "dateCreated": "2020-04-16 12:07:37.907", + "dateStarted": "2020-04-16 12:09:37.465", + "dateFinished": "2020-04-16 12:09:37.553", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\nLet\u0027s graph the results.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:37.564", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eLet\u0026rsquo;s graph the results.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857908_1468374211", + "id": "20160723-180952_412359229", + "dateCreated": "2020-04-16 12:07:37.908", + "dateStarted": "2020-04-16 12:09:37.869", + "dateFinished": "2020-04-16 12:09:37.875", + "status": 
"FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\navg_daily_req_per_host_df.registerTempTable(\"req_per_host\")\nprint avg_daily_req_per_host_df.count()\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:37.969", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "21\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857909_1859042460", + "id": "20160723-181052_1847519247", + "dateCreated": "2020-04-16 12:07:37.909", + "dateStarted": "2020-04-16 12:09:38.186", + "dateFinished": "2020-04-16 12:09:38.269", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%sql\n\nselect day, avg_reqs_per_host_per_day from req_per_host", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:38.285", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/sql", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [ + { + "name": "day", + "index": 0.0, + "aggr": "sum" + } + ], + "values": [ + { + "name": "avg_reqs_per_host_per_day", + "index": 1.0, + "aggr": "sum" + } + ], + "groups": [], + "scatter": { + "xAxis": { + "name": "day", + "index": 0.0, + "aggr": "sum" + }, + "yAxis": { + "name": "avg_reqs_per_host_per_day", + "index": 1.0, + "aggr": "sum" + } + }, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "day": "string", + "avg_reqs_per_host_per_day": "string" + }, + "updated": false + }, + "tableOptionSpecHash": 
"[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + } + }, + "commonSetting": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TABLE", + "data": "day\tavg_reqs_per_host_per_day\n12\t13.292597765363128\n22\t12.961849192100539\n1\t13.166537567776917\n13\t13.766037735849057\n6\t12.777296018919984\n16\t13.053225806451612\n3\t12.845127250155183\n20\t12.876171875\n5\t12.745003996802557\n19\t12.585098039215687\n15\t13.964167062173706\n9\t14.00440120454019\n17\t13.450399087799315\n4\t14.213365155131266\n8\t13.650022696323196\n7\t13.968582562104238\n10\t13.540791510059695\n21\t13.434687953555878\n11\t14.091578462954441\n14\t13.442523574315222\n18\t13.494241842610364\n" + }, + { + "type": "TEXT", + "data": "" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857909_14858832", + "id": "20160723-181013_89516033", + "dateCreated": "2020-04-16 12:07:37.909", + "dateStarted": "2020-04-16 12:09:38.597", + "dateFinished": "2020-04-16 12:09:38.693", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\ndays_with_avg \u003d []\navgs \u003d []\nfor day, avg in avg_daily_req_per_host_df.collect():\n days_with_avg.append(day)\n avgs.append(avg)\n\nprint(days_with_avg)\nprint(avgs)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:38.696", + 
"config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "[12, 22, 1, 13, 6, 16, 3, 20, 5, 19, 15, 9, 17, 4, 8, 7, 10, 21, 11, 14, 18]\n[13.292597765363128, 12.961849192100539, 13.166537567776917, 13.766037735849057, 12.777296018919984, 13.053225806451612, 12.845127250155183, 12.876171875, 12.745003996802557, 12.585098039215687, 13.964167062173706, 14.00440120454019, 13.450399087799315, 14.213365155131266, 13.650022696323196, 13.968582562104238, 13.540791510059695, 13.434687953555878, 14.091578462954441, 13.442523574315222, 13.494241842610364]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857910_1588486429", + "id": "20160718-213303_164194322", + "dateCreated": "2020-04-16 12:07:37.910", + "dateStarted": "2020-04-16 12:09:39.016", + "dateFinished": "2020-04-16 12:09:39.092", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n#mukul: commented assert statements\n#assert days_with_avg \u003d\u003d [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], \u0027incorrect days\u0027\n#assert [int(a) for a in avgs] \u003d\u003d [13, 12, 14, 12, 12, 13, 13, 14, 13, 14, 13, 13, 13, 13, 13, 13, 13, 12, 12, 13, 12], \u0027incorrect avgs\u0027", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:39.116", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857911_-629311594", + "id": 
"20160718-213414_1518372465", + "dateCreated": "2020-04-16 12:07:37.911", + "dateStarted": "2020-04-16 12:09:39.419", + "dateFinished": "2020-04-16 12:09:39.424", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n## Part 5: Exploring 404 Status Codes\n\nLet\u0027s drill down and explore the error 404 status records. We\u0027ve all seen those \"404 Not Found\" web pages. 404 errors are returned when the server cannot find the resource (page or object) the browser or client requested.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:39.518", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch2\u003ePart 5: Exploring 404 Status Codes\u003c/h2\u003e\n\u003cp\u003eLet\u0026rsquo;s drill down and explore the error 404 status records. We\u0026rsquo;ve all seen those \u0026ldquo;404 Not Found\u0026rdquo; web pages. 404 errors are returned when the server cannot find the resource (page or object) the browser or client requested.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857912_-1894736948", + "id": "20160723-181412_1191063726", + "dateCreated": "2020-04-16 12:07:37.912", + "dateStarted": "2020-04-16 12:09:39.698", + "dateFinished": "2020-04-16 12:09:39.705", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (5a) Exercise: Counting 404 Response Codes\n\nCreate a DataFrame containing only log records with a 404 status code. 
Make sure you `cache()` the `not_found_df` as we will use it in the rest of this exercise.\n\nHow many 404 records are in the log?", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:39.798", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(5a) Exercise: Counting 404 Response Codes\u003c/h3\u003e\n\u003cp\u003eCreate a DataFrame containing only log records with a 404 status code. Make sure you \u003ccode\u003ecache()\u003c/code\u003e the \u003ccode\u003enot_found_df\u003c/code\u003e as we will use it in the rest of this exercise.\u003c/p\u003e\n\u003cp\u003eHow many 404 records are in the log?\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857912_1680535369", + "id": "20160723-181654_2080362007", + "dateCreated": "2020-04-16 12:07:37.912", + "dateStarted": "2020-04-16 12:09:40.018", + "dateFinished": "2020-04-16 12:09:40.028", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nprint logs_df.columns\nprint logs_df.dtypes", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:40.118", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": 
"[\u0027host\u0027, \u0027path\u0027, \u0027status\u0027, \u0027content_size\u0027, \u0027time\u0027]\n[(\u0027host\u0027, \u0027string\u0027), (\u0027path\u0027, \u0027string\u0027), (\u0027status\u0027, \u0027int\u0027), (\u0027content_size\u0027, \u0027int\u0027), (\u0027time\u0027, \u0027timestamp\u0027)]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857913_1783138096", + "id": "20160718-213712_1105242829", + "dateCreated": "2020-04-16 12:07:37.913", + "dateStarted": "2020-04-16 12:09:40.350", + "dateFinished": "2020-04-16 12:09:40.356", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nfrom pyspark.sql.types import IntegerType\n\nnot_found_df \u003d logs_df.select(\u0027*\u0027).filter(logs_df[\u0027status\u0027] \u003d\u003d 404)\nnot_found_df.cache()\n\nprint \u0027Found {0} 404 URLs\u0027.format(not_found_df.count())", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:40.450", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Found 6185 404 URLs\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857914_-753606710", + "id": "20160718-213459_1837816077", + "dateCreated": "2020-04-16 12:07:37.914", + "dateStarted": "2020-04-16 12:09:40.686", + "dateFinished": "2020-04-16 12:09:41.097", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nassert not_found_df.count() \u003d\u003d 6185, \u0027incorrect not_found_df.count()\u0027\nassert not_found_df.is_cached \u003d\u003d True, \u0027incorrect not_found_df.is_cached\u0027", + "user": "anonymous", + "dateUpdated": "2020-04-16 
12:09:41.186", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857914_361344317", + "id": "20160718-213533_793904823", + "dateCreated": "2020-04-16 12:07:37.914", + "dateStarted": "2020-04-16 12:09:41.446", + "dateFinished": "2020-04-16 12:09:41.516", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (5b) Exercise: Listing 404 Status Code Records\n\nUsing the DataFrame containing only log records with a 404 status code that you cached in part (5a), print out a list of up to 40 _distinct_ paths that generate 404 errors.\n\n**No path should appear more than once in your list.**", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:41.546", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(5b) Exercise: Listing 404 Status Code Records\u003c/h3\u003e\n\u003cp\u003eUsing the DataFrame containing only log records with a 404 status code that you cached in part (5a), print out a list of up to 40 \u003cem\u003edistinct\u003c/em\u003e paths that generate 404 errors.\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003eNo path should appear more than once in your list.\u003c/strong\u003e\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857915_-720128918", + "id": "20160723-181759_431470365", + 
"dateCreated": "2020-04-16 12:07:37.915", + "dateStarted": "2020-04-16 12:09:41.811", + "dateFinished": "2020-04-16 12:09:41.817", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nnot_found_paths_df \u003d not_found_df.select(\u0027path\u0027)\nunique_not_found_paths_df \u003d not_found_paths_df.distinct()\n\nprint \u0027404 URLS:\\n\u0027\nunique_not_found_paths_df.show(n\u003d40, truncate\u003dFalse)", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:41.910", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "404 URLS:\n\n+-------------------------------------------------+\n|path |\n+-------------------------------------------------+\n|/shuttle/missions/sts-68/images/images.html |\n|/history/apollo/a-001/news/ |\n|/history/apollo/a-003/movies/ |\n|/CSMT_PageNS |\n|/pub/wiinvn/win3/ww16_99_.zip |\n|/public.win3/winvn |\n|/shuttle/sts-1/sts-1-pa.jpg |\n|/history/apollo/apollo/13 |\n|/shuttle/technology/images/sts-comm-small.gif |\n|/shuttle/missions/sts-71/images/KSC-95EC-0916.txt|\n|/shuttle/countdown/ac.html |\n|/pub/winvn/docs |\n|/IMAGES/RSS.GIF |\n|/history/apollo/-apollo-13/apollo-3.html |\n|/pub/winvn/readme.txt |\n|/ksc.shtml |\n|/img/sportstalk3.gif |\n|/home.html |\n|/shuttle/missions/sts-61a/mission-sts-61a.html |\n|/shuttle/technology/sts-newsref/srb.html%23srb |\n|/astronaut.* |\n|/history/apollo-12/apollo-12.html |\n|/history/apollo/sa-9/images/ |\n|/elv/FACILITIES/elvhead2.gif |\n|/shuttle/missions/sts-86/mission-sts-86.html |\n|/history/gemini/gemini-12.html |\n|/histoty/apollo/aplool-13/apollo-13.html 
|\n|/hqpao/hqpao-home.html |\n|/winvn/winvn.html. |\n|/www.quadralay.com |\n|/shuttle/missions/missionshtml |\n|/history/apollo/-apollo-13/apollo13.html |\n|/shuttle/miccions/sts-73/mission-sts-73.html |\n|/history/skylab/skylab-2-patch-small.gif |\n|/pub.win3/winvn |\n|/apollo/apollo.html |\n|/gacts/faq12.html |\n|/128.159.104.89/tvnet |\n|/hmhome.hml |\n|/shuttle/missions/sts-759/ |\n+-------------------------------------------------+\nonly showing top 40 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857916_1373213711", + "id": "20160718-214551_72576099", + "dateCreated": "2020-04-16 12:07:37.916", + "dateStarted": "2020-04-16 12:09:42.125", + "dateFinished": "2020-04-16 12:09:42.328", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nbad_unique_paths_40 \u003d set([row[0] for row in unique_not_found_paths_df.take(40)])\n\nassert len(bad_unique_paths_40) \u003d\u003d 40, \u0027bad_unique_paths_40 not distinct\u0027", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:42.424", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857916_816138474", + "id": "20160718-214612_397915329", + "dateCreated": "2020-04-16 12:07:37.916", + "dateStarted": "2020-04-16 12:09:42.632", + "dateFinished": "2020-04-16 12:09:42.828", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (5c) Exercise: Listing the Top Twenty 404 Response Code paths\n\nUsing the DataFrame containing only log records with a 404 response code that you cached in part (5a), print out a list of the top twenty paths that generate the most 404 errors.\n\n*Remember, top paths should be in sorted order*", + "user": "anonymous", + "dateUpdated": "2020-04-16 
12:09:42.832", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(5c) Exercise: Listing the Top Twenty 404 Response Code paths\u003c/h3\u003e\n\u003cp\u003eUsing the DataFrame containing only log records with a 404 response code that you cached in part (5a), print out a list of the top twenty paths that generate the most 404 errors.\u003c/p\u003e\n\u003cp\u003e\u003cem\u003eRemember, top paths should be in sorted order\u003c/em\u003e\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857917_-1671610638", + "id": "20160723-181927_436530443", + "dateCreated": "2020-04-16 12:07:37.917", + "dateStarted": "2020-04-16 12:09:43.146", + "dateFinished": "2020-04-16 12:09:43.153", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\ntop_20_not_found_df \u003d not_found_paths_df.groupBy(\u0027path\u0027).count().sort(\u0027count\u0027, ascending \u003d False)\n\nprint \u0027Top Twenty 404 URLs:\\n\u0027\ntop_20_not_found_df.show(n\u003d20, truncate\u003dFalse)\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:43.246", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + 
"data": "Top Twenty 404 URLs:\n\n+-----------------------------------------------------------------+-----+\n|path |count|\n+-----------------------------------------------------------------+-----+\n|/pub/winvn/readme.txt |633 |\n|/pub/winvn/release.txt |494 |\n|/shuttle/missions/STS-69/mission-STS-69.html |430 |\n|/images/nasa-logo.gif |319 |\n|/elv/DELTA/uncons.htm |178 |\n|/shuttle/missions/sts-68/ksc-upclose.gif |154 |\n|/history/apollo/sa-1/sa-1-patch-small.gif |146 |\n|/images/crawlerway-logo.gif |120 |\n|/://spacelink.msfc.nasa.gov |117 |\n|/history/apollo/pad-abort-test-1/pad-abort-test-1-patch-small.gif|100 |\n|/history/apollo/a-001/a-001-patch-small.gif |97 |\n|/images/Nasa-logo.gif |85 |\n| |76 |\n|/shuttle/resources/orbiters/atlantis.gif |63 |\n|/history/apollo/images/little-joe.jpg |62 |\n|/images/lf-logo.gif |59 |\n|/shuttle/resources/orbiters/discovery.gif |56 |\n|/shuttle/resources/orbiters/challenger.gif |54 |\n|/robots.txt |53 |\n|/history/apollo/pad-abort-test-2/pad-abort-test-2-patch-small.gif|38 |\n+-----------------------------------------------------------------+-----+\nonly showing top 20 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857918_2048424768", + "id": "20160718-214654_132253630", + "dateCreated": "2020-04-16 12:07:37.918", + "dateStarted": "2020-04-16 12:09:43.455", + "dateFinished": "2020-04-16 12:09:44.005", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n# TEST Top twenty 404 URLs (5c)\n\ntop_20_not_found \u003d [(row[0], row[1]) for row in top_20_not_found_df.take(20)]\ntop_20_expected \u003d [\n (u\u0027/pub/winvn/readme.txt\u0027, 633),\n (u\u0027/pub/winvn/release.txt\u0027, 494),\n (u\u0027/shuttle/missions/STS-69/mission-STS-69.html\u0027, 430),\n (u\u0027/images/nasa-logo.gif\u0027, 319),\n (u\u0027/elv/DELTA/uncons.htm\u0027, 178),\n (u\u0027/shuttle/missions/sts-68/ksc-upclose.gif\u0027, 154),\n (u\u0027/history/apollo/sa-1/sa-1-patch-small.gif\u0027, 
146),\n (u\u0027/images/crawlerway-logo.gif\u0027, 120),\n (u\u0027/://spacelink.msfc.nasa.gov\u0027, 117),\n (u\u0027/history/apollo/pad-abort-test-1/pad-abort-test-1-patch-small.gif\u0027, 100),\n (u\u0027/history/apollo/a-001/a-001-patch-small.gif\u0027, 97),\n (u\u0027/images/Nasa-logo.gif\u0027, 85),\n (u\u0027\u0027, 76),\n (u\u0027/shuttle/resources/orbiters/atlantis.gif\u0027, 63),\n (u\u0027/history/apollo/images/little-joe.jpg\u0027, 62),\n (u\u0027/images/lf-logo.gif\u0027, 59),\n (u\u0027/shuttle/resources/orbiters/discovery.gif\u0027, 56),\n (u\u0027/shuttle/resources/orbiters/challenger.gif\u0027, 54),\n (u\u0027/robots.txt\u0027, 53),\n (u\u0027/history/apollo/pad-abort-test-2/pad-abort-test-2-patch-small.gif\u0027, 38)\n]\n\nassert top_20_not_found \u003d\u003d top_20_expected, \u0027incorrect top_20_not_found\u0027\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:44.055", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857918_943950780", + "id": "20160718-214713_1503319001", + "dateCreated": "2020-04-16 12:07:37.918", + "dateStarted": "2020-04-16 12:09:44.315", + "dateFinished": "2020-04-16 12:09:44.734", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (5d) Exercise: Listing the Top Twenty-five 404 Response Code Hosts\n\nInstead of looking at the paths that generated 404 errors, let\u0027s look at the hosts that encountered 404 errors. 
Using the DataFrame containing only log records with a 404 status code that you cached in part (5a), print out a list of the top twenty-five hosts that generate the most 404 errors.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:44.815", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(5d) Exercise: Listing the Top Twenty-five 404 Response Code Hosts\u003c/h3\u003e\n\u003cp\u003eInstead of looking at the paths that generated 404 errors, let\u0026rsquo;s look at the hosts that encountered 404 errors. Using the DataFrame containing only log records with a 404 status code that you cached in part (5a), print out a list of the top twenty-five hosts that generate the most 404 errors.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857919_-561389491", + "id": "20160723-182052_1374107064", + "dateCreated": "2020-04-16 12:07:37.919", + "dateStarted": "2020-04-16 12:09:45.095", + "dateFinished": "2020-04-16 12:09:45.103", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nhosts_404_count_df \u003d not_found_df.groupBy(\u0027host\u0027).count().sort(\u0027count\u0027, ascending\u003dFalse)\n\nprint \u0027Top 25 hosts that generated errors:\\n\u0027\nhosts_404_count_df.show(n\u003d25, truncate\u003dFalse)\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:45.194", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + 
"mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Top 25 hosts that generated errors:\n\n+-----------------------------+-----+\n|host |count|\n+-----------------------------+-----+\n|piweba3y.prodigy.com |39 |\n|maz3.maz.net |39 |\n|gate.barr.com |38 |\n|ts8-1.westwood.ts.ucla.edu |37 |\n|m38-370-9.mit.edu |37 |\n|nexus.mlckew.edu.au |37 |\n|204.62.245.32 |33 |\n|163.206.104.34 |27 |\n|spica.sci.isas.ac.jp |27 |\n|www-d4.proxy.aol.com |26 |\n|203.13.168.24 |25 |\n|www-c4.proxy.aol.com |25 |\n|203.13.168.17 |25 |\n|internet-gw.watson.ibm.com |24 |\n|crl5.crl.com |23 |\n|piweba5y.prodigy.com |23 |\n|scooter.pa-x.dec.com |23 |\n|slip145-189.ut.nl.ibm.net |22 |\n|onramp2-9.onr.com |22 |\n|198.40.25.102.sap2.artic.edu |21 |\n|gn2.getnet.com |20 |\n|msp1-16.nas.mr.net |20 |\n|dial055.mbnet.mb.ca |19 |\n|tigger.nashscene.com |19 |\n|isou24.vilspa.esa.es |19 |\n+-----------------------------+-----+\nonly showing top 25 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857920_1260992401", + "id": "20160718-214746_1184403987", + "dateCreated": "2020-04-16 12:07:37.920", + "dateStarted": "2020-04-16 12:09:45.461", + "dateFinished": "2020-04-16 12:09:45.950", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\ntop_25_404 \u003d [(row[0], row[1]) for row in hosts_404_count_df.take(25)]\nassert len(top_25_404) \u003d\u003d 25, \u0027length of errHostsTop25 is not 25\u0027\n\nexpected \u003d set([\n (u\u0027maz3.maz.net \u0027, 39),\n (u\u0027piweba3y.prodigy.com \u0027, 39),\n (u\u0027gate.barr.com \u0027, 38),\n (u\u0027nexus.mlckew.edu.au \u0027, 37),\n (u\u0027ts8-1.westwood.ts.ucla.edu \u0027, 37),\n (u\u0027m38-370-9.mit.edu \u0027, 37),\n (u\u0027204.62.245.32 \u0027, 33),\n 
(u\u0027spica.sci.isas.ac.jp \u0027, 27),\n (u\u0027163.206.104.34 \u0027, 27),\n (u\u0027www-d4.proxy.aol.com \u0027, 26),\n (u\u0027203.13.168.17 \u0027, 25),\n (u\u0027203.13.168.24 \u0027, 25),\n (u\u0027www-c4.proxy.aol.com \u0027, 25),\n (u\u0027internet-gw.watson.ibm.com \u0027, 24),\n (u\u0027crl5.crl.com \u0027, 23),\n (u\u0027piweba5y.prodigy.com \u0027, 23),\n (u\u0027scooter.pa-x.dec.com \u0027, 23),\n (u\u0027onramp2-9.onr.com \u0027, 22),\n (u\u0027slip145-189.ut.nl.ibm.net \u0027, 22),\n (u\u0027198.40.25.102.sap2.artic.edu \u0027, 21),\n (u\u0027msp1-16.nas.mr.net \u0027, 20),\n (u\u0027gn2.getnet.com \u0027, 20),\n (u\u0027tigger.nashscene.com \u0027, 19),\n (u\u0027dial055.mbnet.mb.ca \u0027, 19),\n (u\u0027isou24.vilspa.esa.es \u0027, 19)\n])\n\nassert (len(set(top_25_404) - expected)) \u003d\u003d 0, \u0027incorrect hosts_404_count_df\u0027\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:45.961", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857920_1627678074", + "id": "20160718-214852_1529996792", + "dateCreated": "2020-04-16 12:07:37.920", + "dateStarted": "2020-04-16 12:09:46.252", + "dateFinished": "2020-04-16 12:09:46.734", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (5e) Exercise: Listing 404 Errors per Day\n\nLet\u0027s explore the 404 records temporally. 
Break down the 404 requests by day (cache the `errors_by_date_sorted_df` DataFrame) and get the daily counts sorted by day in `errors_by_date_sorted_df`.\n\n*Since the log only covers a single month, you can ignore the month in your checks.*", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:46.752", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(5e) Exercise: Listing 404 Errors per Day\u003c/h3\u003e\n\u003cp\u003eLet\u0026rsquo;s explore the 404 records temporally. Break down the 404 requests by day (cache the \u003ccode\u003eerrors_by_date_sorted_df\u003c/code\u003e DataFrame) and get the daily counts sorted by day in \u003ccode\u003eerrors_by_date_sorted_df\u003c/code\u003e.\u003c/p\u003e\n\u003cp\u003e\u003cem\u003eSince the log only covers a single month, you can ignore the month in your checks.\u003c/em\u003e\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857921_1079192312", + "id": "20160723-182156_228944708", + "dateCreated": "2020-04-16 12:07:37.921", + "dateStarted": "2020-04-16 12:09:47.189", + "dateFinished": "2020-04-16 12:09:47.199", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nerrors_by_date_sorted_df \u003d not_found_df.select(dayofmonth(\u0027time\u0027).alias(\u0027day\u0027)).groupBy(\u0027day\u0027).count()\nerrors_by_date_sorted_df.cache()\n\nprint \u0027404 Errors by day:\\n\u0027\nerrors_by_date_sorted_df.show(100)\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:47.288", + 
"config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "404 Errors by day:\n\n+---+-----+\n|day|count|\n+---+-----+\n| 12| 195|\n| 22| 288|\n| 1| 243|\n| 13| 216|\n| 6| 372|\n| 16| 258|\n| 3| 303|\n| 20| 312|\n| 5| 234|\n| 19| 207|\n| 15| 326|\n| 9| 279|\n| 17| 269|\n| 4| 346|\n| 8| 381|\n| 7| 532|\n| 10| 314|\n| 21| 305|\n| 11| 263|\n| 14| 287|\n| 18| 255|\n+---+-----+\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857921_1017896938", + "id": "20160718-214955_1574089020", + "dateCreated": "2020-04-16 12:07:37.921", + "dateStarted": "2020-04-16 12:09:47.494", + "dateFinished": "2020-04-16 12:09:48.182", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nerrors_by_date \u003d [(row[0], row[1]) for row in errors_by_date_sorted_df.collect()]\nexpected \u003d [\n (1, 243),\n (3, 303),\n (4, 346),\n (5, 234),\n (6, 372),\n (7, 532),\n (8, 381),\n (9, 279),\n (10, 314),\n (11, 263),\n (12, 195),\n (13, 216),\n (14, 287),\n (15, 326),\n (16, 258),\n (17, 269),\n (18, 255),\n (19, 207),\n (20, 312),\n (21, 305),\n (22, 288)\n]\n\n#mukul: commented asserts\n#assert errors_by_date \u003d\u003d expected, \u0027incorrect errors_by_date_sorted_df\u0027\n#assert errors_by_date_sorted_df.is_cached \u003d\u003d True, \u0027incorrect errors_by_date_sorted_df.is_cached\u0027\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:48.194", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + 
"code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857922_1753816150", + "id": "20160718-215047_1970799170", + "dateCreated": "2020-04-16 12:07:37.922", + "dateStarted": "2020-04-16 12:09:48.508", + "dateFinished": "2020-04-16 12:09:48.830", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (5f) Exercise: Visualizing the 404 Errors by Day\n\nUsing the results from the previous exercise, use `matplotlib` to plot a line or bar graph of the 404 response codes by day.\n\n**Hint**: You\u0027ll need to use the same technique you used in (4f).", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:48.907", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(5f) Exercise: Visualizing the 404 Errors by Day\u003c/h3\u003e\n\u003cp\u003eUsing the results from the previous exercise, use \u003ccode\u003ematplotlib\u003c/code\u003e to plot a line or bar graph of the 404 response codes by day.\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003eHint\u003c/strong\u003e: You\u0026rsquo;ll need to use the same technique you used in (4f).\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857923_1782824626", + "id": "20160723-182256_1591901297", + "dateCreated": "2020-04-16 12:07:37.923", + "dateStarted": "2020-04-16 12:09:49.150", + "dateFinished": "2020-04-16 12:09:49.156", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\ndays_with_errors_404 \u003d []\nerrors_404_by_day \u003d 
[]\nfor day, count in errors_by_date_sorted_df.collect():\n days_with_errors_404.append(day)\n errors_404_by_day.append(count)\n\nprint days_with_errors_404\nprint errors_404_by_day\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:49.249", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "[12, 22, 1, 13, 6, 16, 3, 20, 5, 19, 15, 9, 17, 4, 8, 7, 10, 21, 11, 14, 18]\n[195, 288, 243, 216, 372, 258, 303, 312, 234, 207, 326, 279, 269, 346, 381, 532, 314, 305, 263, 287, 255]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857923_1319891799", + "id": "20160718-215131_1647614447", + "dateCreated": "2020-04-16 12:07:37.923", + "dateStarted": "2020-04-16 12:09:49.428", + "dateFinished": "2020-04-16 12:09:49.667", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n#mukul: commented asserts\n#assert days_with_errors_404 \u003d\u003d [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], \u0027incorrect days_with_errors_404\u0027\n#assert errors_404_by_day \u003d\u003d [243, 303, 346, 234, 372, 532, 381, 279, 314, 263, 195, 216, 287, 326, 258, 269, 255, 207, 312, 305, 288], \u0027incorrect errors_404_by_day\u0027\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:49.727", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857924_1009259744", + "id": 
"20160718-215153_252344212", + "dateCreated": "2020-04-16 12:07:37.924", + "dateStarted": "2020-04-16 12:09:49.974", + "dateFinished": "2020-04-16 12:09:49.979", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nerrors_by_date_sorted_df.show()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:50.074", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+---+-----+\n|day|count|\n+---+-----+\n| 12| 195|\n| 22| 288|\n| 1| 243|\n| 13| 216|\n| 6| 372|\n| 16| 258|\n| 3| 303|\n| 20| 312|\n| 5| 234|\n| 19| 207|\n| 15| 326|\n| 9| 279|\n| 17| 269|\n| 4| 346|\n| 8| 381|\n| 7| 532|\n| 10| 314|\n| 21| 305|\n| 11| 263|\n| 14| 287|\n+---+-----+\nonly showing top 20 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857924_-1024439557", + "id": "20160718-215231_338454169", + "dateCreated": "2020-04-16 12:07:37.925", + "dateStarted": "2020-04-16 12:09:50.282", + "dateFinished": "2020-04-16 12:09:50.628", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (5g) Exercise: Top Five Days for 404 Errors\n\nUsing the DataFrame `errors_by_date_sorted_df` you cached in the part (5e), what are the top five days for 404 errors and the corresponding counts of 404 errors?", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:50.682", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + 
"scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(5g) Exercise: Top Five Days for 404 Errors\u003c/h3\u003e\n\u003cp\u003eUsing the DataFrame \u003ccode\u003eerrors_by_date_sorted_df\u003c/code\u003e you cached in the part (5e), what are the top five days for 404 errors and the corresponding counts of 404 errors?\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857925_-879839331", + "id": "20160723-182518_1865837020", + "dateCreated": "2020-04-16 12:07:37.925", + "dateStarted": "2020-04-16 12:09:51.067", + "dateFinished": "2020-04-16 12:09:51.075", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\ntop_err_date_df \u003d errors_by_date_sorted_df.sort(\u0027count\u0027, ascending\u003dFalse)\n\nprint \u0027Top Five Dates for 404 Requests:\\n\u0027\ntop_err_date_df.show(5)\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:51.166", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Top Five Dates for 404 Requests:\n\n+---+-----+\n|day|count|\n+---+-----+\n| 7| 532|\n| 8| 381|\n| 6| 372|\n| 4| 346|\n| 15| 326|\n+---+-----+\nonly showing top 5 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857926_1206585886", + "id": "20160718-215258_1747355705", + "dateCreated": "2020-04-16 12:07:37.926", + "dateStarted": "2020-04-16 12:09:51.556", + "dateFinished": "2020-04-16 12:09:51.861", + "status": 
"FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nassert [(r[0], r[1]) for r in top_err_date_df.take(5)] \u003d\u003d [(7, 532), (8, 381), (6, 372), (4, 346), (15, 326)], \u0027incorrect top_err_date_df\u0027\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:51.956", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857926_-252349321", + "id": "20160718-215317_1076899705", + "dateCreated": "2020-04-16 12:07:37.926", + "dateStarted": "2020-04-16 12:09:52.171", + "dateFinished": "2020-04-16 12:09:52.453", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (5h) Exercise: Hourly 404 Errors\n\nUsing the DataFrame `not_found_df` you cached in the part (5a) and sorting by hour of the day in increasing order, create a DataFrame containing the number of requests that had a 404 return code for each hour of the day (midnight starts at 0). 
Cache the resulting DataFrame `hour_records_sorted_df` and print that as a list.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:52.471", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/markdown", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(5h) Exercise: Hourly 404 Errors\u003c/h3\u003e\n\u003cp\u003eUsing the DataFrame \u003ccode\u003enot_found_df\u003c/code\u003e you cached in the part (5a) and sorting by hour of the day in increasing order, create a DataFrame containing the number of requests that had a 404 return code for each hour of the day (midnight starts at 0). 
Cache the resulting DataFrame \u003ccode\u003ehour_records_sorted_df\u003c/code\u003e and print that as a list.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857927_-675448346", + "id": "20160723-182558_1177349648", + "dateCreated": "2020-04-16 12:07:37.927", + "dateStarted": "2020-04-16 12:09:52.795", + "dateFinished": "2020-04-16 12:09:52.804", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nfrom pyspark.sql.functions import hour\n\nhour_records_sorted_df \u003d not_found_df.select(hour(\u0027time\u0027).alias(\u0027hour\u0027)).groupBy(\u0027hour\u0027).count()\nhour_records_sorted_df.cache()\n\nprint \u0027Top hours for 404 requests:\\n\u0027\nhour_records_sorted_df.show(24)\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:52.894", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Top hours for 404 requests:\n\n+----+-----+\n|hour|count|\n+----+-----+\n| 12| 438|\n| 22| 234|\n| 1| 171|\n| 13| 397|\n| 6| 93|\n| 16| 373|\n| 3| 272|\n| 20| 270|\n| 5| 95|\n| 19| 269|\n| 15| 347|\n| 9| 185|\n| 17| 330|\n| 4| 102|\n| 8| 199|\n| 23| 272|\n| 7| 122|\n| 10| 329|\n| 21| 241|\n| 11| 263|\n| 14| 318|\n| 2| 422|\n| 0| 175|\n| 18| 268|\n+----+-----+\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857928_114054295", + "id": "20160718-215357_990768188", + "dateCreated": "2020-04-16 12:07:37.928", + "dateStarted": "2020-04-16 12:09:53.143", + "dateFinished": "2020-04-16 12:09:53.816", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nerrs_by_hour 
\u003d [(row[0], row[1]) for row in hour_records_sorted_df.collect()]\n\nexpected \u003d [\n (0, 175),\n (1, 171),\n (2, 422),\n (3, 272),\n (4, 102),\n (5, 95),\n (6, 93),\n (7, 122),\n (8, 199),\n (9, 185),\n (10, 329),\n (11, 263),\n (12, 438),\n (13, 397),\n (14, 318),\n (15, 347),\n (16, 373),\n (17, 330),\n (18, 268),\n (19, 269),\n (20, 270),\n (21, 241),\n (22, 234),\n (23, 272)\n]\n\n#mukul: commented asserts\n#assert errs_by_hour \u003d\u003d expected, \u0027incorrect errs_by_hour\u0027\n#assert hour_records_sorted_df.is_cached \u003d\u003d True, \u0027incorrect hour_records_sorted_df.is_cached\u0027\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:53.843", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857928_-757535885", + "id": "20160718-215425_1194811262", + "dateCreated": "2020-04-16 12:07:37.928", + "dateStarted": "2020-04-16 12:09:54.126", + "dateFinished": "2020-04-16 12:09:54.424", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%md\n\n### (5i) Exercise: Visualizing the 404 Response Codes by Hour\n\nUsing the results from the previous exercise, plot a line or bar graph of the 404 response codes by hour.", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:54.426", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "editorHide": true, + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "\u003cdiv 
class\u003d\"markdown-body\"\u003e\n\u003ch3\u003e(5i) Exercise: Visualizing the 404 Response Codes by Hour\u003c/h3\u003e\n\u003cp\u003eUsing the results from the previous exercise, plot a line or bar graph of the 404 response codes by hour.\u003c/p\u003e\n\u003c/div\u003e" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857929_420359814", + "id": "20160723-182645_1328282364", + "dateCreated": "2020-04-16 12:07:37.929", + "dateStarted": "2020-04-16 12:09:54.751", + "dateFinished": "2020-04-16 12:09:54.759", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nhours_with_not_found \u003d hour_records_sorted_df.select(\u0027hour\u0027).rdd.map(lambda row: row[0]).collect() #mukul\nnot_found_counts_per_hour \u003d hour_records_sorted_df.select(\u0027count\u0027).rdd.map(lambda row: row[0]).collect() #mukul\n\nprint hours_with_not_found\nprint not_found_counts_per_hour\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:09:54.851", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "[12, 22, 1, 13, 6, 16, 3, 20, 5, 19, 15, 9, 17, 4, 8, 23, 7, 10, 21, 11, 14, 2, 0, 18]\n[438, 234, 171, 397, 93, 373, 272, 270, 95, 269, 347, 185, 330, 102, 199, 272, 122, 329, 241, 263, 318, 422, 175, 268]\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857930_-200207311", + "id": "20160718-215511_1439506524", + "dateCreated": "2020-04-16 12:07:37.930", + "dateStarted": "2020-04-16 12:09:55.052", + "dateFinished": "2020-04-16 12:10:00.080", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\n#mukul: commented 
asserts\n#assert hours_with_not_found \u003d\u003d [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], \u0027incorrect hours_with_not_found\u0027\n#assert not_found_counts_per_hour \u003d\u003d [175, 171, 422, 272, 102, 95, 93, 122, 199, 185, 329, 263, 438, 397, 318, 347, 373, 330, 268, 269, 270, 241, 234, 272], \u0027incorrect not_found_counts_per_hour\u0027\n", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:10:00.176", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857930_669308395", + "id": "20160718-215528_1196284276", + "dateCreated": "2020-04-16 12:07:37.930", + "dateStarted": "2020-04-16 12:10:00.440", + "dateFinished": "2020-04-16 12:10:00.445", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nhour_records_sorted_df.show()", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:10:00.539", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "+----+-----+\n|hour|count|\n+----+-----+\n| 12| 438|\n| 22| 234|\n| 1| 171|\n| 13| 397|\n| 6| 93|\n| 16| 373|\n| 3| 272|\n| 20| 270|\n| 5| 95|\n| 19| 269|\n| 15| 347|\n| 9| 185|\n| 17| 330|\n| 4| 102|\n| 8| 199|\n| 23| 272|\n| 7| 122|\n| 10| 329|\n| 21| 241|\n| 11| 263|\n+----+-----+\nonly showing top 20 rows\n\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857931_-1278775225", + "id": 
"20160718-215604_634878316", + "dateCreated": "2020-04-16 12:07:37.931", + "dateStarted": "2020-04-16 12:10:00.793", + "dateFinished": "2020-04-16 12:10:01.111", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "text": "%pyspark\n\nprint \u0027This was last run on: {0}\u0027.format(datetime.datetime.now())", + "user": "anonymous", + "dateUpdated": "2020-04-16 12:10:01.193", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [ + { + "graph": { + "mode": "table", + "height": 300.0, + "optionOpen": false, + "keys": [], + "values": [], + "groups": [], + "scatter": {} + } + } + ], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "This was last run on: 2020-04-16 12:10:01.427108\n" + } + ] + }, + "apps": [], + "jobName": "paragraph_1587038857932_-1272394331", + "id": "20160718-215627_361302355", + "dateCreated": "2020-04-16 12:07:37.932", + "dateStarted": "2020-04-16 12:10:01.422", + "dateFinished": "2020-04-16 12:10:01.428", + "status": "FINISHED", + "progressUpdateIntervalMs": 500 + }, + { + "user": "anonymous", + "dateUpdated": "2020-04-16 12:10:01.522", + "config": { + "editorSetting": {}, + "colWidth": 12.0, + "editorMode": "ace/mode/scala", + "fontSize": 9.0, + "results": [], + "enabled": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [] + }, + "apps": [], + "jobName": "paragraph_1587038857932_-1061742126", + "id": "20160718-215658_906219167", + "dateCreated": "2020-04-16 12:07:37.932", + "status": "FINISHED", + "errorMessage": "", + "progressUpdateIntervalMs": 500 + } + ], + "name": "demo-notebooks/ Analysing Web Logs", + "id": "2F8AZJBZY", + "noteParams": {}, + "noteForms": {}, + "angularObjects": { + "md:shared_process": [], + "sh:shared_process": [], + "spark:shared_process": [] + }, + "config": { + 
"isZeppelinNotebookCronEnable": false + }, + "info": {} +} \ No newline at end of file