From 2ff7e418d3ae64a1f6a67959bae708fddedf741e Mon Sep 17 00:00:00 2001 From: Jon Bringhurst Date: Thu, 1 May 2025 11:00:42 -0700 Subject: [PATCH 1/3] Add annotations for each line identified as having a potential issue. --- samza-shell/src/main/bash/run-class.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/samza-shell/src/main/bash/run-class.sh b/samza-shell/src/main/bash/run-class.sh index 1669332b52..040f4462ea 100755 --- a/samza-shell/src/main/bash/run-class.sh +++ b/samza-shell/src/main/bash/run-class.sh @@ -28,6 +28,9 @@ cd $base_dir base_dir=`pwd` cd $home_dir +# Note: When using samza-yarn, base_dir and pwd here looks something like: +# //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/__package + echo "Current time: $(date '+%Y-%m-%d %H:%M:%S')" echo home_dir=$home_dir @@ -78,15 +81,26 @@ fi # permissions for the classpath-related files when they are in their own directory. An example of where # this is helpful is when using container images which might have predefined permissions for certain # directories. + +# FIXME(SAMZA-2804): CLASSPATH_WORKSPACE_DIR is shared among all containers running on the host when using samza-yarn. +# Using the same path for all containers running on the host for manifest.txt and pathing.jar is a race condition. +# e.g. "//usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/__package/classpath_workspace/pathing.jar" CLASSPATH_WORKSPACE_DIR=$base_dir/classpath_workspace mkdir -p $CLASSPATH_WORKSPACE_DIR + +# FIXME(SAMZA-2804): This is a race condition when using samza-yarn. # file containing the classpath string; used to avoid passing long classpaths directly to the jar command PATHING_MANIFEST_FILE=$CLASSPATH_WORKSPACE_DIR/manifest.txt + +# FIXME(SAMZA-2804): This is a race condition when using samza-yarn. # jar file to include on the classpath for running the main class PATHING_JAR_FILE=$CLASSPATH_WORKSPACE_DIR/pathing.jar +# FIXME(SAMZA-2804): This is a race condition when using samza-yarn. # Newlines and spaces are intended to ensure proper parsing of manifest in pathing jar printf "Class-Path: \n $CLASSPATH \n" > $PATHING_MANIFEST_FILE + +# FIXME(SAMZA-2804): This is a race condition when using samza-yarn. # Creates a new archive and adds custom manifest information to pathing.jar eval "$JAR -cvmf $PATHING_MANIFEST_FILE $PATHING_JAR_FILE" @@ -96,10 +110,14 @@ else JAVA="$JAVA_HOME/bin/java" fi +# FIXME(SAMZA-2804): This log directory is shared among all containers running on the host when using samza-yarn. if [ -z "$SAMZA_LOG_DIR" ]; then SAMZA_LOG_DIR="$base_dir" fi +# FIXME(SAMZA-2804): This directory is shared among all containers running on the host when using samza-yarn. We should +# likely be using a per-container tmp directory instead. +# # add usercache directory mkdir -p $base_dir/tmp JAVA_TEMP_DIR=$base_dir/tmp From 942e0d16c211f3c67651f047ddc98e82337008dc Mon Sep 17 00:00:00 2001 From: Jon Bringhurst Date: Thu, 1 May 2025 13:35:42 -0700 Subject: [PATCH 2/3] Resolve multiple concurrency issues ## Race condition in pathing jar manifest creation A race condition exists when setting up the classpath during container launch. During container launch using samza-yarn, run-class.sh creates a pathing jar file (which holds the classpath for the container launch). However, during the creation of this pathing jar, temporary files, as well as the pathing jar itself is not placed in a location unique to the container. This results in multiple containers writing to the same pathing jar location and temporary file location, which results in a race condition. This race condition may show up in several ways, such as when Yarn removes jars from a finished container (other containers will point to a classpath which no longer exists) or when multiple run-class.sh scripts attempt to write the manifest.txt or pathing jar at the same time. Note that host affinity being enabled will make this problem worse. The pathing.jar is written to the usercache, so when the container which created the pathing.jar is finished and removed, any new container which launches on that host will point to jar files which do not exist anymore. When host affinity is enabled, it will not move to a new host and just keep failing. ## Container logging directory fallback is not unique for each container The fallback log directory is the same among all containers running on the same host. It should be unique per-container. ## Container tmp dir is not unique per-container The JAVA_TMP_DIR directory is the same for all containers. We should make sure that it's safe to use the same directory for all containers. --- samza-shell/src/main/bash/run-class.sh | 37 +++++++++---------- .../src/main/bash/run-framework-class.sh | 26 ++++++++++--- 2 files changed, 39 insertions(+), 24 deletions(-) mode change 100644 => 100755 samza-shell/src/main/bash/run-framework-class.sh diff --git a/samza-shell/src/main/bash/run-class.sh b/samza-shell/src/main/bash/run-class.sh index 040f4462ea..044d866232 100755 --- a/samza-shell/src/main/bash/run-class.sh +++ b/samza-shell/src/main/bash/run-class.sh @@ -28,12 +28,14 @@ cd $base_dir base_dir=`pwd` cd $home_dir -# Note: When using samza-yarn, base_dir and pwd here looks something like: -# //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/__package - echo "Current time: $(date '+%Y-%m-%d %H:%M:%S')" +# Note: When using samza-yarn, home_dir looks like: +# //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027 echo home_dir=$home_dir + +# Note: When using samza-yarn, base_dir looks like: +# //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/__package echo "framework base (location of this script). base_dir=$base_dir" if [ ! -d "$base_dir/lib" ]; then @@ -82,25 +84,20 @@ fi # this is helpful is when using container images which might have predefined permissions for certain # directories. -# FIXME(SAMZA-2804): CLASSPATH_WORKSPACE_DIR is shared among all containers running on the host when using samza-yarn. -# Using the same path for all containers running on the host for manifest.txt and pathing.jar is a race condition. -# e.g. "//usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/__package/classpath_workspace/pathing.jar" -CLASSPATH_WORKSPACE_DIR=$base_dir/classpath_workspace +# Note: When on samza-yarn, CLASSPATH_WORKSPACE_DIR looks like: +# //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/classpath_workspace +CLASSPATH_WORKSPACE_DIR=$home_dir/classpath_workspace mkdir -p $CLASSPATH_WORKSPACE_DIR -# FIXME(SAMZA-2804): This is a race condition when using samza-yarn. # file containing the classpath string; used to avoid passing long classpaths directly to the jar command PATHING_MANIFEST_FILE=$CLASSPATH_WORKSPACE_DIR/manifest.txt -# FIXME(SAMZA-2804): This is a race condition when using samza-yarn. # jar file to include on the classpath for running the main class PATHING_JAR_FILE=$CLASSPATH_WORKSPACE_DIR/pathing.jar -# FIXME(SAMZA-2804): This is a race condition when using samza-yarn. # Newlines and spaces are intended to ensure proper parsing of manifest in pathing jar printf "Class-Path: \n $CLASSPATH \n" > $PATHING_MANIFEST_FILE -# FIXME(SAMZA-2804): This is a race condition when using samza-yarn. # Creates a new archive and adds custom manifest information to pathing.jar eval "$JAR -cvmf $PATHING_MANIFEST_FILE $PATHING_JAR_FILE" @@ -110,17 +107,19 @@ else JAVA="$JAVA_HOME/bin/java" fi -# FIXME(SAMZA-2804): This log directory is shared among all containers running on the host when using samza-yarn. if [ -z "$SAMZA_LOG_DIR" ]; then - SAMZA_LOG_DIR="$base_dir" + # When on samza-yarn, SAMZA_LOG_DIR will point to the symlink located at: + # //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/logs + # + # When the symlink is resolved, this path will point to: + # //userlogs/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027 + SAMZA_LOG_DIR="$home_dir/logs" fi -# FIXME(SAMZA-2804): This directory is shared among all containers running on the host when using samza-yarn. We should -# likely be using a per-container tmp directory instead. -# -# add usercache directory -mkdir -p $base_dir/tmp -JAVA_TEMP_DIR=$base_dir/tmp +# When on samza-yarn, JAVA_TEMP_DIR will point to a path similar to: +# //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/tmp +mkdir -p $home_dir/tmp +JAVA_TEMP_DIR=$home_dir/tmp # Check whether the JVM supports GC Log rotation, and enable it if so. function check_and_enable_gc_log_rotation { diff --git a/samza-shell/src/main/bash/run-framework-class.sh b/samza-shell/src/main/bash/run-framework-class.sh old mode 100644 new mode 100755 index 342047c585..cb652521e7 --- a/samza-shell/src/main/bash/run-framework-class.sh +++ b/samza-shell/src/main/bash/run-framework-class.sh @@ -28,7 +28,12 @@ cd $base_dir base_dir=`pwd` cd $home_dir +# Note: When using samza-yarn, home_dir looks like: +# //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027 echo home_dir=$home_dir + +# Note: When using samza-yarn, base_dir looks like: +# //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/__package echo "framework base (location of this script). base_dir=$base_dir" if [ ! -d "$base_dir/lib" ]; then @@ -107,10 +112,15 @@ fi # permissions for the classpath-related files when they are in their own directory. An example of where # this is helpful is when using container images which might have predefined permissions for certain # directories. -CLASSPATH_WORKSPACE_DIR=$base_dir/classpath_workspace + +# Note: When on samza-yarn, CLASSPATH_WORKSPACE_DIR looks like: +# //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/classpath_workspace +CLASSPATH_WORKSPACE_DIR=$home_dir/classpath_workspace mkdir -p $CLASSPATH_WORKSPACE_DIR + # file containing the classpath string; used to avoid passing long classpaths directly to the jar command PATHING_MANIFEST_FILE=$CLASSPATH_WORKSPACE_DIR/manifest.txt + # jar file to include on the classpath for running the main class PATHING_JAR_FILE=$CLASSPATH_WORKSPACE_DIR/pathing.jar @@ -126,12 +136,18 @@ else fi if [ -z "$SAMZA_LOG_DIR" ]; then - SAMZA_LOG_DIR="$base_dir" + # When on samza-yarn, SAMZA_LOG_DIR will point to the symlink located at: + # //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/logs + # + # When the symlink is resolved, this path will point to: + # //userlogs/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027 + SAMZA_LOG_DIR="$home_dir" fi -# add usercache directory -mkdir -p $base_dir/tmp -JAVA_TEMP_DIR=$base_dir/tmp +# When on samza-yarn, JAVA_TEMP_DIR will point to a path similar to: +# //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/tmp +mkdir -p $home_dir/tmp +JAVA_TEMP_DIR=$home_dir/tmp # Check whether the JVM supports GC Log rotation, and enable it if so. function check_and_enable_gc_log_rotation { From f34535c223f148fef0518cbd1bc8fe2257808ce7 Mon Sep 17 00:00:00 2001 From: Jon Bringhurst Date: Fri, 2 May 2025 15:17:20 -0700 Subject: [PATCH 3/3] Simplify comments and print manifest file locations --- samza-shell/src/main/bash/run-class.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/samza-shell/src/main/bash/run-class.sh b/samza-shell/src/main/bash/run-class.sh index 044d866232..2334f1d976 100755 --- a/samza-shell/src/main/bash/run-class.sh +++ b/samza-shell/src/main/bash/run-class.sh @@ -30,11 +30,11 @@ cd $home_dir echo "Current time: $(date '+%Y-%m-%d %H:%M:%S')" -# Note: When using samza-yarn, home_dir looks like: +# For example, home_dir looks like: # //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027 echo home_dir=$home_dir -# Note: When using samza-yarn, base_dir looks like: +# For example, base_dir looks like: # //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/__package echo "framework base (location of this script). base_dir=$base_dir" @@ -84,16 +84,18 @@ fi # this is helpful is when using container images which might have predefined permissions for certain # directories. -# Note: When on samza-yarn, CLASSPATH_WORKSPACE_DIR looks like: +# For example, CLASSPATH_WORKSPACE_DIR looks like: # //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/classpath_workspace CLASSPATH_WORKSPACE_DIR=$home_dir/classpath_workspace mkdir -p $CLASSPATH_WORKSPACE_DIR # file containing the classpath string; used to avoid passing long classpaths directly to the jar command PATHING_MANIFEST_FILE=$CLASSPATH_WORKSPACE_DIR/manifest.txt +echo "Pathing manifest txt located at $PATHING_MANIFEST_FILE" # jar file to include on the classpath for running the main class PATHING_JAR_FILE=$CLASSPATH_WORKSPACE_DIR/pathing.jar +echo "Pathing manifest jar located at $PATHING_JAR_FILE" # Newlines and spaces are intended to ensure proper parsing of manifest in pathing jar printf "Class-Path: \n $CLASSPATH \n" > $PATHING_MANIFEST_FILE @@ -108,7 +110,7 @@ else fi if [ -z "$SAMZA_LOG_DIR" ]; then - # When on samza-yarn, SAMZA_LOG_DIR will point to the symlink located at: + # SAMZA_LOG_DIR will point to the symlink located at: # //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/logs # # When the symlink is resolved, this path will point to: @@ -116,7 +118,7 @@ if [ -z "$SAMZA_LOG_DIR" ]; then SAMZA_LOG_DIR="$home_dir/logs" fi -# When on samza-yarn, JAVA_TEMP_DIR will point to a path similar to: +# JAVA_TEMP_DIR will point to a path similar to: # //usercache//appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/tmp mkdir -p $home_dir/tmp JAVA_TEMP_DIR=$home_dir/tmp