From 2aec497a137c373d60069523212735e4419d284b Mon Sep 17 00:00:00 2001 From: Daniel Chen Date: Wed, 23 Jan 2019 16:46:46 -0800 Subject: [PATCH] Documentation updates --- README.md | 17 ++++++++++------- .../apache/beam/examples/KafkaWordCount.java | 13 ++++++++----- .../org/apache/beam/examples/WordCount.java | 19 +++++++++++-------- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 8ac19a9..f58d8b3 100644 --- a/README.md +++ b/README.md @@ -62,8 +62,9 @@ $ scripts/grid start zookeeper You can run directly within the project using maven: ``` +$ mvn package $ mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.WordCount \ - -Dexec.args="--runner=SamzaRunner" -P samza-runner + -Dexec.args="--inputFile=pom.xml --output=counts --runner=SamzaRunner" -P samza-runner ``` ### Packaging Your Application @@ -77,15 +78,15 @@ After packaging, we deploy and explode the tgz in the deploy folder: ### Standalone Cluster with Zookeeper You can use the `run-beam-standalone.sh` script included in this repo to run an example -in standalone mode. The config file is provided as `config/standalone.properties`. Note by -default we create one single input partition for the whole input. To set the number of +in standalone mode. The config file is provided as `config/standalone.properties`. Note that by +default we create a single input partition for the whole input. To set the number of partitions, you can add "--maxSourceParallelism=" argument. For example, "--maxSourceParallelism=2" will create two partitions of the input file, based on size. ``` $ deploy/examples/bin/run-beam-standalone.sh org.apache.beam.examples.WordCount \ --configFilePath=$PWD/deploy/examples/config/standalone.properties \ - --inputFile=/Users/xiliu/opensource/samza-beam-examples/pom.xml --output=word-counts.txt \ + --inputFile=$PWD/pom.xml --output=word-counts.txt \ --maxSourceParallelism=2 ``` @@ -101,13 +102,15 @@ $ deploy/examples/bin/run-beam-standalone.sh org.apache.beam.examples.KafkaWordC ### Yarn Cluster Similar to running standalone, we can use the `run-beam-yarn.sh` to run the examples -in Yarn cluster. The config file is provided as `config/yarn.properties`. To run the -WordCount example in yarn: +in Yarn cluster. The config file is provided as `config/yarn.properties`. +Note that for yarn, we don't need to wait after submitting the job, so there is no need for `waitUntilFinish()`. +Please change `p.run().waitUtilFinish()` to `p.run()` in the `WordCount.java` class. +To run the WordCount example in yarn: ``` $ deploy/examples/bin/run-beam-yarn.sh org.apache.beam.examples.WordCount \ --configFilePath=$PWD/deploy/examples/config/yarn.properties \ - --inputFile=/Users/xiliu/opensource/samza-beam-examples/pom.xml \ + --inputFile=$PWD/pom.xml \ --output=/tmp/word-counts.txt --maxSourceParallelism=2 ``` diff --git a/src/main/java/org/apache/beam/examples/KafkaWordCount.java b/src/main/java/org/apache/beam/examples/KafkaWordCount.java index 7d1658d..02d4e5a 100644 --- a/src/main/java/org/apache/beam/examples/KafkaWordCount.java +++ b/src/main/java/org/apache/beam/examples/KafkaWordCount.java @@ -67,13 +67,16 @@ *

To run in standalone with zookeeper: * (large parallelism will enforce each partition in a task) *

{@code
- * $ deploy/examples/bin/run-beam-standalone.sh org.apache.beam.examples.KafkaWordCount --configFilePath=$PWD/deploy/examples/config/standalone.properties --maxSourceParallelism=1024
+ * $ ./deploy/examples/bin/run-beam-standalone.sh org.apache.beam.examples.KafkaWordCount --configFilePath=$PWD/deploy/examples/config/standalone.properties --maxSourceParallelism=1024
  * }
* *

To run in yarn: + * For yarn, we don't need to wait after submitting the job, so there is no need for + * waitUntilFinish(). Please change p.run().waitUtilFinish() to p.run(). + * * (large parallelism will enforce each partition in a task) *

{@code
- * $ deploy/examples/bin/run-beam-yarn.sh org.apache.beam.examples.KafkaWordCount --configFilePath=$PWD/deploy/examples/config/yarn.properties --maxSourceParallelism=1024
+ * $ ./deploy/examples/bin/run-beam-yarn.sh org.apache.beam.examples.KafkaWordCount --configFilePath=$PWD/deploy/examples/config/yarn.properties --maxSourceParallelism=1024
  * }
* *

To produce some test data: @@ -127,9 +130,9 @@ public static void main(String[] args) { .withKeySerializer(StringSerializer.class) .withValueSerializer(StringSerializer.class)); - //For yarn, we don't need to wait after submitting the job, - //so there is no need for waitUntilFinish(). Please use - //p.run() + // For yarn, we don't need to wait after submitting the job, + // so there is no need for waitUntilFinish(). Please use + // p.run() p.run().waitUntilFinish(); } } diff --git a/src/main/java/org/apache/beam/examples/WordCount.java b/src/main/java/org/apache/beam/examples/WordCount.java index a647c0a..dedc28f 100644 --- a/src/main/java/org/apache/beam/examples/WordCount.java +++ b/src/main/java/org/apache/beam/examples/WordCount.java @@ -70,18 +70,21 @@ *

To execute this example in standalone with zookeeper: * (split the input by 2) *

{@code
- * $ deploy/examples/bin/run-beam-standalone.sh org.apache.beam.examples.WordCount \
+ * $ ./deploy/examples/bin/run-beam-standalone.sh org.apache.beam.examples.WordCount \
  *     --configFilePath=$PWD/deploy/examples/config/standalone.properties \
- *     --inputFile=/Users/xiliu/opensource/samza-beam-examples/pom.xml --output=word-counts.txt \
+ *     --inputFile=$PWD/pom.xml --output=word-counts.txt \
  *     --maxSourceParallelism=2
  * }
* *

To execute this example in yarn: + * For yarn, we don't need to wait after submitting the job, so there is no need for + * waitUntilFinish(). Please change p.run().waitUtilFinish() to p.run(). + * * (split the input by 2) *

{@code
- * $ deploy/examples/bin/run-beam-yarn.sh org.apache.beam.examples.WordCount \
+ * $ ./deploy/examples/bin/run-beam-yarn.sh org.apache.beam.examples.WordCount \
  *     --configFilePath=$PWD/deploy/examples/config/yarn.properties \
- *     --inputFile=/Users/xiliu/opensource/samza-beam-examples/pom.xml \
+ *     --inputFile=$PWD/pom.xml \
  *     --output=/tmp/word-counts.txt --maxSourceParallelism=2
  * }
*/ @@ -187,10 +190,10 @@ static void runWordCount(WordCountOptions options) { .apply(MapElements.via(new FormatAsTextFn())) .apply("WriteCounts", TextIO.write().to(options.getOutput()).withoutSharding()); - //For yarn, we don't need to wait after submitting the job, - //so there is no need for waitUntilFinish(). Please use - //p.run() - p.run().waitUntilFinish(); + // For yarn, we don't need to wait after submitting the job, + // so there is no need for waitUntilFinish(). Please use + // p.run() + p.run(); } public static void main(String[] args) {