From 64b685bac6f4aa4988d6204a42a011b613d6fd32 Mon Sep 17 00:00:00 2001
From: Joey Lin
Date: Tue, 28 Oct 2025 16:38:32 -0700
Subject: [PATCH 1/4] add readme

---
 workflow/salloc/README.md | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 workflow/salloc/README.md

diff --git a/workflow/salloc/README.md b/workflow/salloc/README.md
new file mode 100644
index 0000000..91acec5
--- /dev/null
+++ b/workflow/salloc/README.md
@@ -0,0 +1,35 @@
+## salloc main workflow
+
+This workflow uses the salloc subworkflow to allocate node(s), run independent jobs on the allocated node(s), and then release the allocation when any job fails, the workflow is cancelled, or all jobs complete.
+
+---
+
+### Jobs
+
+You can run any independent jobs using the allocated nodes.
+This example workflow:
+
+1. Runs "hello world" twice on the allocated nodes.
+2. Runs an MPI hello world job across the allocated nodes.
+
+Note: The MPI hello world job assumes that OpenMPI is already installed on the cluster (using [this workflow](https://github.com/parallelworks/workflow-utils/blob/main/workflow/build_install_openmpi.yaml)). The job will source OpenMPI according to that workflow.
+
+### Inputs
+
+The workflow accepts the following inputs:
+
+- `resource`: The compute resource to run the workflow on.
+- `partition`: The Slurm partition to use.
+- `nodes`: Number of nodes to allocate.
+- `walltime`: Walltime for the Slurm allocation.
+
+---
+
+### Purpose
+
+This workflow serves as a basic template for:
+
+- Allocating nodes using Slurm.
+- Demonstrating the use of a subworkflow.
+- Running independent jobs on previously allocated nodes.
+- Ensuring clean release of allocated nodes, even if jobs fail or the workflow is cancelled.

From 078a71d8e8b2098b60f32ecbcec7b253dd54a91b Mon Sep 17 00:00:00 2001
From: Joey Lin
Date: Tue, 28 Oct 2025 16:45:39 -0700
Subject: [PATCH 2/4] wait for release allocation file, don't use /tmp

---
 workflow/salloc/salloc_subworkflow.yaml | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/workflow/salloc/salloc_subworkflow.yaml b/workflow/salloc/salloc_subworkflow.yaml
index 2161385..0095ec7 100644
--- a/workflow/salloc/salloc_subworkflow.yaml
+++ b/workflow/salloc/salloc_subworkflow.yaml
@@ -1,20 +1,33 @@
 jobs:
-  allocation:
+  release_allocation:
     ssh:
       remoteHost: ${{inputs.resource.ip}}
+    steps:
+      - name: Release Slurm Allocation
+        run: |
+          while [ ! -f ../../../RELEASE_ALLOCATION ]; do
+            echo "$(date) Waiting for RELEASE_ALLOCATION file to release the allocation"
+            sleep 5
+          done
+    cleanup: |
+      JOBID=$(grep '^SLURM_JOB_ID=' $(pwd)/slurm_allocation_info.txt | cut -d= -f2)
+      scancel $JOBID
+  allocation:
     steps:
       - name: Create Slurm Allocation Info File
+        ssh:
+          remoteHost: ${{ inputs.resource.ip }}
         run: |
           set -e
 
-          INFO_FILE="/tmp/slurm_allocation_info.txt"
+          INFO_FILE="slurm_allocation_info.txt"
 
           echo "Cleaning up existing files"
           rm -f "$INFO_FILE"
           echo "=== Existing files cleaned up ==="
 
           echo "Creating persistent allocation"
-          nohup salloc --nodes=${{inputs.nodes}} --time=${{inputs.walltime}} --no-shell > salloc.log 2>&1 &
+          nohup salloc --nodes=${{ inputs.nodes }} --time=${{ inputs.walltime }} --no-shell > salloc.log 2>&1 &
 
           sleep 5
 
@@ -40,17 +53,13 @@
           echo "Info file created"
           echo ""
 
-  copy:
-    needs:
-      - allocation
-    steps:
       - name: Copy file from cluster
         run: |
           set -e
           echo "Copying file from cluster"
-          scp $USER@${{inputs.resource.ip}}:/tmp/slurm_allocation_info.txt /$(pwd)/slurm_allocation_info.txt
+          scp $USER@${{ inputs.resource.ip }}:$(pwd)/slurm_allocation_info.txt /$(pwd)/slurm_allocation_info.txt
           echo "Copied file from cluster in $(pwd)"
-'on':
+"on":
   execute:
     inputs:
       resource:
@@ -59,7 +68,7 @@
         autoselect: true
         optional: false
       partition:
-        resource: ${{inputs.resource}}
+        resource: ${{ inputs.resource }}
         label: Partition
         type: slurm-partitions
       nodes:

From 77b405cb8aed771745746aab55462926b99de2 Mon Sep 17 00:00:00 2001
From: Joey Lin
Date: Tue, 28 Oct 2025 16:46:42 -0700
Subject: [PATCH 3/4] add mpi hello world, wait for slurm info text

---
 workflow/salloc/main_salloc_workflow.yaml | 101 ++++++++++++++++++----
 1 file changed, 84 insertions(+), 17 deletions(-)

diff --git a/workflow/salloc/main_salloc_workflow.yaml b/workflow/salloc/main_salloc_workflow.yaml
index 10de977..b7ecb14 100644
--- a/workflow/salloc/main_salloc_workflow.yaml
+++ b/workflow/salloc/main_salloc_workflow.yaml
@@ -4,40 +4,107 @@ jobs:
       - name: Allocation info file
         uses: workflow/salloc_subworkflow
         with:
-          resource: ${{inputs.resource}}
+          resource: ${{ inputs.resource }}
           partition: ${{ inputs.partition }}
           nodes: ${{ inputs.nodes }}
           walltime: ${{ inputs.walltime }}
+  copy-allocation-file:
+    ssh:
+      remoteHost: ${{ inputs.resource.ip }}
+    steps:
       - name: Copy file to parent directory
         run: |
-          cp subworkflows/*/step_*/slurm_allocation_info.txt .
+          set -e
+          echo "Looking for slurm_allocation_info.txt under subworkflows..."
+
+          SEARCH_PATH="subworkflows/allocate/step_*/slurm_allocation_info.txt"
+
+          echo "Waiting for $SEARCH_PATH to appear..."
+          while true; do
+            FOUND_FILE=$(find subworkflows/allocate/ -path "$SEARCH_PATH" 2>/dev/null | head -n 1)
+            if [ -n "$FOUND_FILE" ]; then
+              echo "Found allocation info file at: $FOUND_FILE"
+              cp "$FOUND_FILE" .
+              echo "Copied slurm_allocation_info.txt to $(pwd)"
+              break
+            fi
+            echo "Still waiting..."
+            sleep 5
+          done
   hello1:
     ssh:
-      remoteHost: ${{inputs.resource}}
-    needs:
-      - allocate
+      remoteHost: ${{ inputs.resource.ip }}
     steps:
       - run: |
-          JOBID=$(grep '^SLURM_JOB_ID=' /tmp/slurm_allocation_info.txt | cut -d= -f2)
-          srun --jobid=$JOBID echo "hello world 1 on $(hostname)"
+          while [ ! -f slurm_allocation_info.txt ]; do
+            echo "Waiting for file slurm_allocation_info.txt to be created"
+            sleep 5
+          done
+          JOBID=$(grep '^SLURM_JOB_ID=' $(pwd)/slurm_allocation_info.txt | cut -d= -f2)
+          srun --jobid="$JOBID" echo "hello world 1 on $(hostname)"
   hello2:
     ssh:
-      remoteHost: ${{inputs.resource}}
-    needs:
-      - allocate
+      remoteHost: ${{ inputs.resource.ip }}
     steps:
       - run: |
-          JOBID=$(grep '^SLURM_JOB_ID=' /tmp/slurm_allocation_info.txt | cut -d= -f2)
-          srun --jobid=$JOBID echo "hello world 2 on $(hostname)"
-  relinquish:
+          while [ ! -f slurm_allocation_info.txt ]; do
+            echo "Waiting for file slurm_allocation_info.txt to be created"
+            sleep 5
+          done
+          JOBID=$(grep '^SLURM_JOB_ID=' $(pwd)/slurm_allocation_info.txt | cut -d= -f2)
+          srun --jobid="$JOBID" echo "hello world 2 on $(hostname)"
+  mpi-hello-world:
     ssh:
-      remoteHost: ${{inputs.resource}}
+      remoteHost: ${{ inputs.resource.ip }}
+    steps:
+      - name: Run MPI Hello World
+        run: |
+          while [ ! -f slurm_allocation_info.txt ]; do
+            echo "Waiting for file slurm_allocation_info.txt to be created"
+            sleep 5
+          done
+          set -ex
+          JOBID=$(grep '^SLURM_JOB_ID=' $(pwd)/slurm_allocation_info.txt | cut -d= -f2)
+
+          echo "Running MPI Hello World using srun with jobid=$JOBID"
+          cat <<'EOF' > mpihello.c
+          #include <mpi.h>
+          #include <stdio.h>
+
+          int main(int argc, char** argv) {
+            MPI_Init(NULL, NULL);
+
+            int world_size;
+            MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+            int world_rank;
+            MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+
+            char processor_name[MPI_MAX_PROCESSOR_NAME];
+            int name_len;
+            MPI_Get_processor_name(processor_name, &name_len);
+
+            printf("Hello world from processor %s, rank %d out of %d processors\n",
+                   processor_name, world_rank, world_size);
+
+            MPI_Finalize();
+          }
+          EOF
+
+          source /home/jlin/pw/software/openmpi-4.1.6/env.sh
+          mpicc -o mpihello.out mpihello.c
+
+          srun --jobid="$JOBID" -N "${{ inputs.nodes }}" -n "${{ inputs.nodes }}" mpihello.out
+  release:
     needs:
       - hello1
       - hello2
+      - mpi-hello-world
+    ssh:
+      remoteHost: ${{ inputs.resource.ip }}
     steps:
-      - run: scancel $JOBID
-'on':
+      - run: touch RELEASE_ALLOCATION
+"on":
   execute:
     inputs:
       resource:
@@ -53,6 +120,6 @@       nodes:
         label: Number of Nodes
         type: number
       walltime:
-        default: '60:00'
+        default: "60:00"
         label: Walltime
         type: string

From 0e1bcbd55827ef5e61cd8d74fe4ff2173e125b99 Mon Sep 17 00:00:00 2001
From: Joey Lin
Date: Tue, 28 Oct 2025 16:57:32 -0700
Subject: [PATCH 4/4] add extra note about openmpi version

---
 workflow/salloc/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/workflow/salloc/README.md b/workflow/salloc/README.md
index 91acec5..cccffb7 100644
--- a/workflow/salloc/README.md
+++ b/workflow/salloc/README.md
@@ -12,7 +12,8 @@ This example workflow:
 1. Runs "hello world" twice on the allocated nodes.
 2. Runs an MPI hello world job across the allocated nodes.
 
-Note: The MPI hello world job assumes that OpenMPI is already installed on the cluster (using [this workflow](https://github.com/parallelworks/workflow-utils/blob/main/workflow/build_install_openmpi.yaml)). The job will source OpenMPI according to that workflow.
+Note: The MPI hello world job assumes that OpenMPI v4.1.6 is already installed on the cluster (using [this workflow](https://github.com/parallelworks/workflow-utils/blob/main/workflow/build_install_openmpi.yaml)), with `with_pmi=true`.
+The job will source OpenMPI according to that workflow.
 
 ### Inputs