36 changes: 36 additions & 0 deletions workflow/salloc/README.md
@@ -0,0 +1,36 @@
## salloc main workflow

This workflow uses the salloc subworkflow to allocate one or more nodes, runs independent jobs on the allocated nodes, and then releases the allocation when all jobs complete, any job fails, or the workflow is cancelled.
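
At the Slurm level, the pattern implemented by the two workflows boils down to roughly the sketch below (a minimal illustration; the node count, walltime, and job ID are placeholders, not values taken from the workflow):

```bash
# Create a persistent allocation that is not tied to an interactive shell;
# --no-shell keeps it alive until it is explicitly cancelled.
salloc --nodes=2 --time=60:00 --no-shell > salloc.log 2>&1 &

# Attach job steps to the allocation by job ID
# (the workflows read the ID from slurm_allocation_info.txt).
JOBID=12345
srun --jobid="$JOBID" hostname

# When the jobs are done (or something goes wrong), release the nodes.
scancel "$JOBID"
```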

---

### Jobs

You can run any independent jobs using the allocated nodes.
This example workflow:

1. Runs "hello world" twice on the allocated nodes.
2. Runs an MPI hello world job across the allocated nodes.

Note: The MPI hello world job assumes that OpenMPI v4.1.6 has already been installed on the cluster using [this workflow](https://github.com/parallelworks/workflow-utils/blob/main/workflow/build_install_openmpi.yaml), with `with_pmi=true`.
The job sources the OpenMPI environment set up by that workflow before compiling and launching the MPI program.
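
For reference, the compile-and-run portion of that job looks roughly like the sketch below; the `env.sh` path and the `$JOBID`/`$NODES` variables are placeholders that depend on where OpenMPI was installed and on the allocation created by the subworkflow:

```bash
# Load the OpenMPI environment produced by the build workflow
# (adjust the path to wherever OpenMPI 4.1.6 was installed).
source /path/to/openmpi-4.1.6/env.sh

# Compile the MPI hello world source and launch it inside the existing
# allocation, one task per allocated node.
mpicc -o mpihello.out mpihello.c
srun --jobid="$JOBID" -N "$NODES" -n "$NODES" ./mpihello.out
```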

### Inputs

The workflow accepts the following inputs:

- `resource`: The compute resource to run the workflow on.
- `partition`: The Slurm partition to use.
- `nodes`: Number of nodes to allocate.
- `walltime`: Walltime for the Slurm allocation (passed to `salloc --time`).

---

### Purpose

This workflow serves as a basic template for:

- Allocating nodes using Slurm.
- Demonstrating the use of a subworkflow.
- Running independent jobs on previously allocated nodes.
- Ensuring allocated nodes are cleanly released, even if a job fails or the workflow is cancelled (see the sketch below).
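
The clean-release behaviour relies on a simple handshake between the two workflows. Conceptually it works like the sketch below (paths abbreviated; the real steps live in the two YAML files in this directory):

```bash
# Main workflow, release job: after the hello and MPI jobs complete,
# signal the subworkflow to release the nodes.
touch RELEASE_ALLOCATION

# Subworkflow, release_allocation job: block until the signal file appears,
# then cancel the Slurm allocation recorded in the info file.
while [ ! -f RELEASE_ALLOCATION ]; do
  sleep 5
done
JOBID=$(grep '^SLURM_JOB_ID=' slurm_allocation_info.txt | cut -d= -f2)
scancel "$JOBID"
```

If a job fails or the workflow is cancelled before the signal file is written, the subworkflow's `cleanup` step runs the same `scancel`, so the allocation is still released.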
101 changes: 84 additions & 17 deletions workflow/salloc/main_salloc_workflow.yaml
@@ -4,40 +4,107 @@ jobs:
- name: Allocation info file
uses: workflow/salloc_subworkflow
with:
resource: ${{inputs.resource}}
resource: ${{ inputs.resource }}
partition: ${{ inputs.partition }}
nodes: ${{ inputs.nodes }}
walltime: ${{ inputs.walltime }}
copy-allocation-file:
ssh:
remoteHost: ${{ inputs.resource.ip }}
steps:
- name: Copy file to parent directory
run: |
cp subworkflows/*/step_*/slurm_allocation_info.txt .
set -e
echo "Looking for slurm_allocation_info.txt under subworkflows..."

SEARCH_PATH="subworkflows/allocate/step_*/slurm_allocation_info.txt"

echo "Waiting for $SEARCH_PATH to appear..."
while true; do
FOUND_FILE=$(find subworkflows/allocate/ -path "$SEARCH_PATH" 2>/dev/null | head -n 1)
if [ -n "$FOUND_FILE" ]; then
echo "Found allocation info file at: $FOUND_FILE"
cp "$FOUND_FILE" .
echo "Copied slurm_allocation_info.txt to $(pwd)"
break
fi
echo "Still waiting..."
sleep 5
done
hello1:
ssh:
remoteHost: ${{inputs.resource}}
needs:
- allocate
remoteHost: ${{ inputs.resource.ip }}
steps:
- run: |
JOBID=$(grep '^SLURM_JOB_ID=' /tmp/slurm_allocation_info.txt | cut -d= -f2)
srun --jobid=$JOBID echo "hello world 1 on $(hostname)"
while [ ! -f slurm_allocation_info.txt ]; do
echo "Waiting for file slurm_allocation_info.txt to be created"
sleep 5
done
JOBID=$(grep '^SLURM_JOB_ID=' $(pwd)/slurm_allocation_info.txt | cut -d= -f2)
srun --jobid="$JOBID" bash -c 'echo "hello world 1 on $(hostname)"'
hello2:
ssh:
remoteHost: ${{inputs.resource}}
needs:
- allocate
remoteHost: ${{ inputs.resource.ip }}
steps:
- run: |
JOBID=$(grep '^SLURM_JOB_ID=' /tmp/slurm_allocation_info.txt | cut -d= -f2)
srun --jobid=$JOBID echo "hello world 2 on $(hostname)"
relinquish:
while [ ! -f slurm_allocation_info.txt ]; do
echo "Waiting for file slurm_allocation_info.txt to be created"
sleep 5
done
JOBID=$(grep '^SLURM_JOB_ID=' $(pwd)/slurm_allocation_info.txt | cut -d= -f2)
srun --jobid="$JOBID" bash -c 'echo "hello world 2 on $(hostname)"'
mpi-hello-world:
ssh:
remoteHost: ${{inputs.resource}}
remoteHost: ${{ inputs.resource.ip }}
steps:
- name: Run MPI Hello World
run: |
while [ ! -f slurm_allocation_info.txt ]; do
echo "Waiting for file slurm_allocation_info.txt to be created"
sleep 5
done
set -ex
JOBID=$(grep '^SLURM_JOB_ID=' $(pwd)/slurm_allocation_info.txt | cut -d= -f2)

echo "Running MPI Hello World using srun with jobid=$JOBID"
cat <<'EOF' > mpihello.c
#include <mpi.h>
#include <stdio.h>

int main(int argc, char** argv) {
MPI_Init(NULL, NULL);

int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);

int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processor_name, &name_len);

printf("Hello world from processor %s, rank %d out of %d processors\n",
processor_name, world_rank, world_size);

MPI_Finalize();
}
EOF

source /home/jlin/pw/software/openmpi-4.1.6/env.sh
mpicc -o mpihello.out mpihello.c

srun --jobid="$JOBID" -N "${{ inputs.nodes }}" -n "${{ inputs.nodes }}" ./mpihello.out
release:
needs:
- hello1
- hello2
- mpi-hello-world
ssh:
remoteHost: ${{ inputs.resource.ip }}
steps:
- run: scancel $JOBID
'on':
- run: touch RELEASE_ALLOCATION
"on":
execute:
inputs:
resource:
@@ -53,6 +120,6 @@ jobs:
label: Number of Nodes
type: number
walltime:
default: '60:00'
default: "60:00"
label: Walltime
type: string
29 changes: 19 additions & 10 deletions workflow/salloc/salloc_subworkflow.yaml
@@ -1,20 +1,33 @@
jobs:
allocation:
release_allocation:
ssh:
remoteHost: ${{inputs.resource.ip}}
steps:
- name: Release Slurm Allocation
run: |
while [ ! -f ../../../RELEASE_ALLOCATION ]; do
echo "$(date) Waiting for RELEASE_ALLOCATION file to release the allocation"
sleep 5
done
cleanup: |
JOBID=$(grep '^SLURM_JOB_ID=' $(pwd)/slurm_allocation_info.txt | cut -d= -f2)
scancel $JOBID
allocation:
steps:
- name: Create Slurm Allocation Info File
ssh:
remoteHost: ${{ inputs.resource.ip }}
run: |
set -e

INFO_FILE="/tmp/slurm_allocation_info.txt"
INFO_FILE="slurm_allocation_info.txt"

echo "Cleaning up existing files"
rm -f "$INFO_FILE"
echo "=== Existing files cleaned up ==="

echo "Creating persistent allocation"
nohup salloc --nodes=${{inputs.nodes}} --time=${{inputs.walltime}} --no-shell > salloc.log 2>&1 &
nohup salloc --nodes=${{ inputs.nodes }} --time=${{ inputs.walltime }} --no-shell > salloc.log 2>&1 &

sleep 5

@@ -40,17 +53,13 @@ jobs:

echo "Info file created"
echo ""
copy:
needs:
- allocation
steps:
- name: Copy file from cluster
run: |
set -e
echo "Copying file from cluster"
scp $USER@${{inputs.resource.ip}}:/tmp/slurm_allocation_info.txt /$(pwd)/slurm_allocation_info.txt
scp $USER@${{ inputs.resource.ip }}:$(pwd)/slurm_allocation_info.txt $(pwd)/slurm_allocation_info.txt
echo "Copied file from cluster in $(pwd)"
'on':
"on":
execute:
inputs:
resource:
@@ -59,7 +68,7 @@
autoselect: true
optional: false
partition:
resource: ${{inputs.resource}}
resource: ${{ inputs.resource }}
label: Partition
type: slurm-partitions
nodes: