
Commit 8ca1be7

Merge pull request #133 from JetBrains/exploring-streaming
Exploring streaming
2 parents aa6d3e5 + 4ece47e

23 files changed: +2140 −39 lines


.github/workflows/build.yml

Lines changed: 8 additions & 1 deletion
@@ -25,5 +25,12 @@ jobs:
         key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
         restore-keys: ${{ runner.os }}-m2
     - name: Build with Maven
-      run: ./mvnw -B package --file pom.xml -Pscala-2.12
+      run: ./mvnw -B package --file pom.xml -Pscala-2.12 -Dkotest.tags="!Kafka"
+  qodana:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: 'Qodana Scan'
+        uses: JetBrains/qodana-action@v5.0.2
+
 # vim: ts=2:sts=2:sw=2:expandtab
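The `-Dkotest.tags="!Kafka"` flag added above makes the Maven build skip Kotest tests tagged `Kafka`, which need a reachable broker that the CI runner doesn't have. A minimal sketch of how such a test can be tagged, assuming Kotest's standard tag API; the spec name and test body are hypothetical:

```kotlin
import io.kotest.core.Tag
import io.kotest.core.spec.style.ShouldSpec

// A Kotest tag's name defaults to its class name, so this object matches -Dkotest.tags="!Kafka"
object Kafka : Tag()

// Hypothetical spec: every test in it is skipped when the Kafka tag is excluded
class KafkaStreamingTest : ShouldSpec({
    tags(Kafka)

    should("count words consumed from a Kafka topic") {
        // requires a broker on localhost:9092, hence excluded on CI
    }
})
```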

README.md

Lines changed: 43 additions & 0 deletions
@@ -23,6 +23,7 @@ We have opened a Spark Project Improvement Proposal: [Kotlin support for Apache
 - [Column infix/operator functions](#column-infixoperator-functions)
 - [Overload Resolution Ambiguity](#overload-resolution-ambiguity)
 - [Tuples](#tuples)
+- [Streaming](#streaming)
 - [Examples](#examples)
 - [Reporting issues/Support](#reporting-issuessupport)
 - [Code of Conduct](#code-of-conduct)
@@ -267,6 +268,48 @@ Finally, all these tuple helper functions are also baked in:
 - `map`
 - `cast`
 
+### Streaming
+
+A popular Spark extension is [Spark Streaming](https://spark.apache.org/docs/latest/streaming-programming-guide.html).
+Of course, the Kotlin Spark API also introduces a more Kotlin-esque approach to writing your streaming programs.
+There are examples for use with a checkpoint, Kafka, and SQL in the [examples module](examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/streaming).
+
+Here is a quick example:
+```kotlin
+// Automatically provides ssc: JavaStreamingContext, which starts and awaits termination or timeout
+withSparkStreaming(batchDuration = Durations.seconds(1), timeout = 10_000) { // this: KSparkStreamingSession
+
+    // create an input stream from, for instance, Netcat: `$ nc -lk 9999`
+    val lines: JavaReceiverInputDStream<String> = ssc.socketTextStream("localhost", 9999)
+
+    // split the input stream on spaces
+    val words: JavaDStream<String> = lines.flatMap { it.split(" ").iterator() }
+
+    // perform an action on each RDD formed in the stream
+    words.foreachRDD { rdd: JavaRDD<String>, _: Time ->
+
+        // to convert the JavaRDD to a Dataset, we need a Spark session using the RDD's context
+        withSpark(rdd) { // this: KSparkSession
+            val dataframe: Dataset<TestRow> = rdd.map { TestRow(word = it) }.toDS()
+            dataframe
+                .groupByKey { it.word }
+                .count()
+                .show()
+            // +-----+--------+
+            // |  key|count(1)|
+            // +-----+--------+
+            // |hello|       1|
+            // |   is|       1|
+            // |    a|       1|
+            // | this|       1|
+            // | test|       3|
+            // +-----+--------+
+        }
+    }
+}
+```
+
+
 ## Examples
 
 For more, check out the [examples](https://github.com/JetBrains/kotlin-spark-api/tree/master/examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples) module.
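One note on the README snippet above: it uses a `TestRow` class that the diff never defines. Judging from the call `TestRow(word = it)`, a minimal definition would look like this (an assumption for illustration, not part of the commit):

```kotlin
// Row type for the word-count Dataset, matching the TestRow(word = it) call above
data class TestRow(val word: String)
```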

examples/pom-3.2_2.12.xml

Lines changed: 13 additions & 0 deletions
@@ -29,6 +29,11 @@
             <artifactId>spark-streaming_${scala.compat.version}</artifactId>
             <version>${spark3.version}</version>
         </dependency>
+        <dependency><!-- Only needed for Qodana -->
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-streaming-kafka-0-10_${scala.compat.version}</artifactId>
+            <version>${spark3.version}</version>
+        </dependency>
     </dependencies>
 
     <build>
@@ -90,6 +95,14 @@
                     <skipNexusStagingDeployMojo>true</skipNexusStagingDeployMojo>
                 </configuration>
             </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <configuration>
+                    <source>8</source>
+                    <target>8</target>
+                </configuration>
+            </plugin>
         </plugins>
     </build>
 </project>

examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/Broadcasting.kt

Lines changed: 0 additions & 1 deletion
@@ -21,7 +21,6 @@ package org.jetbrains.kotlinx.spark.examples
 
 import org.jetbrains.kotlinx.spark.api.broadcast
 import org.jetbrains.kotlinx.spark.api.map
-import org.jetbrains.kotlinx.spark.api.sparkContext
 import org.jetbrains.kotlinx.spark.api.withSpark
 import java.io.Serializable

examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/Main.kt

Lines changed: 3 additions & 2 deletions
@@ -19,13 +19,14 @@
  */
 package org.jetbrains.kotlinx.spark.examples
 
-import org.apache.spark.api.java.function.ReduceFunction
 import org.apache.spark.sql.Dataset
 import org.jetbrains.kotlinx.spark.api.*
 import org.jetbrains.kotlinx.spark.api.tuples.*
-import scala.*
+import scala.Tuple2
+import scala.Tuple3
 
 data class Q<T>(val id: Int, val text: T)
+
 @Suppress("RedundantLambdaArrow", "UsePropertyAccessSyntax")
 object Main {

examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/streaming/KotlinDirectKafkaWordCount.kt
Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+/*-
+ * =LICENSE=
+ * Kotlin Spark API: Examples for Spark 3.2+ (Scala 2.12)
+ * ----------
+ * Copyright (C) 2019 - 2022 JetBrains
+ * ----------
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =LICENSEEND=
+ */
+package org.jetbrains.kotlinx.spark.examples.streaming
+
+import org.apache.kafka.clients.consumer.ConsumerConfig.*
+import org.apache.kafka.clients.consumer.ConsumerRecord
+import org.apache.kafka.common.serialization.StringDeserializer
+import org.apache.spark.streaming.Durations
+import org.apache.spark.streaming.api.java.JavaDStream
+import org.apache.spark.streaming.api.java.JavaInputDStream
+import org.apache.spark.streaming.kafka010.ConsumerStrategies
+import org.apache.spark.streaming.kafka010.KafkaUtils
+import org.apache.spark.streaming.kafka010.LocationStrategies
+import org.jetbrains.kotlinx.spark.api.reduceByKey
+import org.jetbrains.kotlinx.spark.api.tuples.*
+import org.jetbrains.kotlinx.spark.api.withSparkStreaming
+import scala.Tuple2
+import java.io.Serializable
+import java.util.regex.Pattern
+import kotlin.system.exitProcess
+
+
+/**
+ * Src: https://github.com/apache/spark/blob/master/examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java
+ *
+ * Consumes messages from one or more topics in Kafka and does word count.
+ * Usage: KotlinDirectKafkaWordCount <brokers> <groupId> <topics>
+ *   <brokers> is a list of one or more Kafka brokers
+ *   <groupId> is a consumer group name to consume from topics
+ *   <topics> is a list of one or more Kafka topics to consume from
+ *
+ * Example:
+ *
+ * First make sure you have a Kafka producer running. For instance, when running locally:
+ * $ kafka-console-producer.sh --topic quickstart-events --bootstrap-server localhost:9092
+ *
+ * Then start the program normally or like this:
+ * $ bin/run-example streaming.JavaDirectKafkaWordCount broker1-host:port,broker2-host:port \
+ *     consumer-group topic1,topic2
+ */
+object KotlinDirectKafkaWordCount {
+
+    private val SPACE = Pattern.compile(" ")
+
+    private const val DEFAULT_BROKER = "localhost:9092"
+    private const val DEFAULT_GROUP_ID = "consumer-group"
+    private const val DEFAULT_TOPIC = "quickstart-events"
+
+    @JvmStatic
+    fun main(args: Array<String>) {
+        // Complain only when some, but not all, arguments are given; no arguments means "use the defaults"
+        if (args.size < 3 && args.isNotEmpty()) {
+            System.err.println(
+                """Usage: KotlinDirectKafkaWordCount <brokers> <groupId> <topics>
+                  <brokers> is a list of one or more Kafka brokers
+                  <groupId> is a consumer group name to consume from topics
+                  <topics> is a list of one or more Kafka topics to consume from
+                """.trimIndent()
+            )
+            exitProcess(1)
+        }
+
+        val brokers: String = args.getOrElse(0) { DEFAULT_BROKER }
+        val groupId: String = args.getOrElse(1) { DEFAULT_GROUP_ID }
+        val topics: String = args.getOrElse(2) { DEFAULT_TOPIC }
+
+        // Create a streaming context with a 2-second batch interval
+        withSparkStreaming(batchDuration = Durations.seconds(2), appName = "KotlinDirectKafkaWordCount") {
+
+            val topicsSet: Set<String> = topics.split(',').toSet()
+
+            val kafkaParams: Map<String, Serializable> = mapOf(
+                BOOTSTRAP_SERVERS_CONFIG to brokers,
+                GROUP_ID_CONFIG to groupId,
+                KEY_DESERIALIZER_CLASS_CONFIG to StringDeserializer::class.java,
+                VALUE_DESERIALIZER_CLASS_CONFIG to StringDeserializer::class.java,
+            )
+
+            // Create a direct Kafka stream with the given brokers and topics
+            val messages: JavaInputDStream<ConsumerRecord<String, String>> = KafkaUtils.createDirectStream(
+                ssc,
+                LocationStrategies.PreferConsistent(),
+                ConsumerStrategies.Subscribe(topicsSet, kafkaParams),
+            )
+
+            // Get the lines, split them into words, count the words, and print
+            val lines: JavaDStream<String> = messages.map { it.value() }
+            val words: JavaDStream<String> = lines.flatMap { it.split(SPACE).iterator() }
+
+            val wordCounts: JavaDStream<Tuple2<String, Int>> = words
+                .map { it X 1 }
+                .reduceByKey { a: Int, b: Int -> a + b }
+
+            wordCounts.print()
+        }
+    }
+}
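A side note on the `it X 1` call above: `X` is the tuple-builder infix function from the `org.jetbrains.kotlinx.spark.api.tuples` package imported at the top of the file, pairing its operands into a `scala.Tuple2`. A minimal sketch of the idea; treat the exact import and signature as assumptions:

```kotlin
import org.jetbrains.kotlinx.spark.api.tuples.X
import scala.Tuple2

fun main() {
    // `a X b` builds Tuple2(a, b) without the Tuple2 constructor noise
    val pair: Tuple2<String, Int> = "hello" X 1
    println(pair) // (hello,1)
}
```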
