diff --git a/README.md b/README.md index 8ac19a9..f58d8b3 100644 --- a/README.md +++ b/README.md @@ -62,8 +62,9 @@ $ scripts/grid start zookeeper You can run directly within the project using maven: ``` +$ mvn package $ mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.WordCount \ - -Dexec.args="--runner=SamzaRunner" -P samza-runner + -Dexec.args="--inputFile=pom.xml --output=counts --runner=SamzaRunner" -P samza-runner ``` ### Packaging Your Application @@ -77,15 +78,15 @@ After packaging, we deploy and explode the tgz in the deploy folder: ### Standalone Cluster with Zookeeper You can use the `run-beam-standalone.sh` script included in this repo to run an example -in standalone mode. The config file is provided as `config/standalone.properties`. Note by -default we create one single input partition for the whole input. To set the number of +in standalone mode. The config file is provided as `config/standalone.properties`. Note that by +default we create a single input partition for the whole input. To set the number of partitions, you can add "--maxSourceParallelism=" argument. For example, "--maxSourceParallelism=2" will create two partitions of the input file, based on size. ``` $ deploy/examples/bin/run-beam-standalone.sh org.apache.beam.examples.WordCount \ --configFilePath=$PWD/deploy/examples/config/standalone.properties \ - --inputFile=/Users/xiliu/opensource/samza-beam-examples/pom.xml --output=word-counts.txt \ + --inputFile=$PWD/pom.xml --output=word-counts.txt \ --maxSourceParallelism=2 ``` @@ -101,13 +102,15 @@ $ deploy/examples/bin/run-beam-standalone.sh org.apache.beam.examples.KafkaWordC ### Yarn Cluster Similar to running standalone, we can use the `run-beam-yarn.sh` to run the examples -in Yarn cluster. The config file is provided as `config/yarn.properties`. To run the -WordCount example in yarn: +in Yarn cluster. The config file is provided as `config/yarn.properties`. +Note that for yarn, we don't need to wait after submitting the job, so there is no need for `waitUntilFinish()`. +Please change `p.run().waitUtilFinish()` to `p.run()` in the `WordCount.java` class. +To run the WordCount example in yarn: ``` $ deploy/examples/bin/run-beam-yarn.sh org.apache.beam.examples.WordCount \ --configFilePath=$PWD/deploy/examples/config/yarn.properties \ - --inputFile=/Users/xiliu/opensource/samza-beam-examples/pom.xml \ + --inputFile=$PWD/pom.xml \ --output=/tmp/word-counts.txt --maxSourceParallelism=2 ``` diff --git a/src/main/java/org/apache/beam/examples/KafkaWordCount.java b/src/main/java/org/apache/beam/examples/KafkaWordCount.java index 7d1658d..02d4e5a 100644 --- a/src/main/java/org/apache/beam/examples/KafkaWordCount.java +++ b/src/main/java/org/apache/beam/examples/KafkaWordCount.java @@ -67,13 +67,16 @@ *
To run in standalone with zookeeper: * (large parallelism will enforce each partition in a task) *
{@code
- * $ deploy/examples/bin/run-beam-standalone.sh org.apache.beam.examples.KafkaWordCount --configFilePath=$PWD/deploy/examples/config/standalone.properties --maxSourceParallelism=1024
+ * $ ./deploy/examples/bin/run-beam-standalone.sh org.apache.beam.examples.KafkaWordCount --configFilePath=$PWD/deploy/examples/config/standalone.properties --maxSourceParallelism=1024
* }
*
* To run in yarn: + * For yarn, we don't need to wait after submitting the job, so there is no need for + * waitUntilFinish(). Please change p.run().waitUtilFinish() to p.run(). + * * (large parallelism will enforce each partition in a task) *
{@code
- * $ deploy/examples/bin/run-beam-yarn.sh org.apache.beam.examples.KafkaWordCount --configFilePath=$PWD/deploy/examples/config/yarn.properties --maxSourceParallelism=1024
+ * $ ./deploy/examples/bin/run-beam-yarn.sh org.apache.beam.examples.KafkaWordCount --configFilePath=$PWD/deploy/examples/config/yarn.properties --maxSourceParallelism=1024
* }
*
* To produce some test data: @@ -127,9 +130,9 @@ public static void main(String[] args) { .withKeySerializer(StringSerializer.class) .withValueSerializer(StringSerializer.class)); - //For yarn, we don't need to wait after submitting the job, - //so there is no need for waitUntilFinish(). Please use - //p.run() + // For yarn, we don't need to wait after submitting the job, + // so there is no need for waitUntilFinish(). Please use + // p.run() p.run().waitUntilFinish(); } } diff --git a/src/main/java/org/apache/beam/examples/WordCount.java b/src/main/java/org/apache/beam/examples/WordCount.java index a647c0a..dedc28f 100644 --- a/src/main/java/org/apache/beam/examples/WordCount.java +++ b/src/main/java/org/apache/beam/examples/WordCount.java @@ -70,18 +70,21 @@ *
To execute this example in standalone with zookeeper: * (split the input by 2) *
{@code
- * $ deploy/examples/bin/run-beam-standalone.sh org.apache.beam.examples.WordCount \
+ * $ ./deploy/examples/bin/run-beam-standalone.sh org.apache.beam.examples.WordCount \
* --configFilePath=$PWD/deploy/examples/config/standalone.properties \
- * --inputFile=/Users/xiliu/opensource/samza-beam-examples/pom.xml --output=word-counts.txt \
+ * --inputFile=$PWD/pom.xml --output=word-counts.txt \
* --maxSourceParallelism=2
* }
*
* To execute this example in yarn: + * For yarn, we don't need to wait after submitting the job, so there is no need for + * waitUntilFinish(). Please change p.run().waitUtilFinish() to p.run(). + * * (split the input by 2) *
{@code
- * $ deploy/examples/bin/run-beam-yarn.sh org.apache.beam.examples.WordCount \
+ * $ ./deploy/examples/bin/run-beam-yarn.sh org.apache.beam.examples.WordCount \
* --configFilePath=$PWD/deploy/examples/config/yarn.properties \
- * --inputFile=/Users/xiliu/opensource/samza-beam-examples/pom.xml \
+ * --inputFile=$PWD/pom.xml \
* --output=/tmp/word-counts.txt --maxSourceParallelism=2
* }
*/
@@ -187,10 +190,10 @@ static void runWordCount(WordCountOptions options) {
.apply(MapElements.via(new FormatAsTextFn()))
.apply("WriteCounts", TextIO.write().to(options.getOutput()).withoutSharding());
- //For yarn, we don't need to wait after submitting the job,
- //so there is no need for waitUntilFinish(). Please use
- //p.run()
- p.run().waitUntilFinish();
+ // For yarn, we don't need to wait after submitting the job,
+ // so there is no need for waitUntilFinish(). Please use
+ // p.run()
+ p.run();
}
public static void main(String[] args) {