From 0d91921b21c240b070d19305c0a0c88145856950 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Tue, 9 Dec 2025 17:03:46 -0600 Subject: [PATCH 01/33] Initial commit of ungeneral nemotron model --- singularity/singularity-compose.yml | 1 + workflow.yaml | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml index 99fa6dd..8d0c39d 100644 --- a/singularity/singularity-compose.yml +++ b/singularity/singularity-compose.yml @@ -20,6 +20,7 @@ instances: volumes: - ./logs:/logs - ./cache:/root/.cache + - /p/work1/mshaxted/Llama-3_3-Nemotron-Super-49B-v1_5:/models/Llama-3_3-Nemotron-Super-49B-v1_5 - ./env.sh:/.singularity.d/env/env.sh rag: diff --git a/workflow.yaml b/workflow.yaml index 47e9712..07babde 100644 --- a/workflow.yaml +++ b/workflow.yaml @@ -313,22 +313,22 @@ jobs: label: Run Type type: dropdown options: - - value: all - label: vLLM+RAG - value: vllm label: vLLM Only + - value: all + label: vLLM+RAG build: label: Build Containers type: boolean default: false hfmodel: label: HF Model - default: meta-llama/Llama-3.1-8B-Instruct + default: /models/Llama-3_3-Nemotron-Super-49B-v1_5 type: string vllm_extra_args: label: VLLM Extra Args - default: "--dtype float16 --max-model-len 16384 --gpu-memory-utilization=0.95 --trust_remote_code" - placeholder: "--dtype float16 --max-model-len 16384 --gpu-memory-utilization=0.95 --max-num-seqs 1 --trust_remote_code" + default: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --max-model-len=2048 --gpu-memory-utilization 0.85" + placeholder: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --max-model-len=2048 --gpu-memory-utilization 0.85" type: string hftoken: label: HF Token (gated models) From c44e03b5adea38390cff1a2e044c8a893a2f3778 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Tue, 9 Dec 2025 17:04:45 -0600 Subject: [PATCH 02/33] Initial commit of ungeneral nemotron model --- yamls/hsp.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 3790404..8706bc0 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -415,10 +415,10 @@ jobs: label: Run Type type: dropdown options: - - value: all - label: vLLM+RAG - value: vllm label: vLLM Only + - value: all + label: vLLM+RAG build: label: Build Containers type: boolean From aa2d95b546fa7539137531c7dc37fb2fb74f7852 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Tue, 9 Dec 2025 17:05:50 -0600 Subject: [PATCH 03/33] Updating the HSP yaml --- yamls/hsp.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 8706bc0..a169d99 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -425,12 +425,12 @@ jobs: default: false hfmodel: label: HF Model - default: meta-llama/Llama-3.1-8B-Instruct + default: /models/Llama-3_3-Nemotron-Super-49B-v1_5 type: string vllm_extra_args: label: VLLM Extra Args - default: "--dtype float16 --max-model-len 16384 --gpu-memory-utilization=0.95 --trust_remote_code" - placeholder: "--dtype float16 --max-model-len 16384 --gpu-memory-utilization=0.95 --max-num-seqs 1 --trust_remote_code" + default: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --max-model-len=2048 --gpu-memory-utilization 0.85" + placeholder: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --max-model-len=2048 --gpu-memory-utilization 0.85" type: string hftoken: label: HF Token (gated models) From aa3bfe7a92f843b65721041fbb185c99803a2c62 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Tue, 9 Dec 2025 17:14:32 -0600 Subject: [PATCH 04/33] Updating branch --- yamls/hsp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index a169d99..0a042ed 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -560,4 +560,4 @@ jobs: repository_branch: type: string label: Repository Branch - default: main + default: nemotron From 1163cab0629cd238c5b3b3ed0d401f4bb55a04f6 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Tue, 9 Dec 2025 17:33:15 -0600 Subject: [PATCH 05/33] Removing minimum model params --- yamls/hsp.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 0a042ed..b123386 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -429,8 +429,8 @@ jobs: type: string vllm_extra_args: label: VLLM Extra Args - default: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --max-model-len=2048 --gpu-memory-utilization 0.85" - placeholder: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --max-model-len=2048 --gpu-memory-utilization 0.85" + default: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --gpu-memory-utilization 0.85" + placeholder: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --gpu-memory-utilization 0.85" type: string hftoken: label: HF Token (gated models) From a7f9e991ca7059ee6387ee7a764277bb1f4706c4 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Tue, 9 Dec 2025 17:49:40 -0600 Subject: [PATCH 06/33] Generalizing the model selection --- singularity/singularity-compose.yml | 2 +- yamls/hsp.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml index 8d0c39d..275ccad 100644 --- a/singularity/singularity-compose.yml +++ b/singularity/singularity-compose.yml @@ -20,7 +20,7 @@ instances: volumes: - ./logs:/logs - ./cache:/root/.cache - - /p/work1/mshaxted/Llama-3_3-Nemotron-Super-49B-v1_5:/models/Llama-3_3-Nemotron-Super-49B-v1_5 + - ${MODEL_NAME}:${MODEL_NAME} - ./env.sh:/.singularity.d/env/env.sh rag: diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index b123386..4d5428e 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -425,7 +425,7 @@ jobs: default: false hfmodel: label: HF Model - default: /models/Llama-3_3-Nemotron-Super-49B-v1_5 + default: /p/work1/mshaxted/Llama-3_3-Nemotron-Super-49B-v1_5 type: string vllm_extra_args: label: VLLM Extra Args From 2106e65b055f56ada9b4eb13ff07750e2eebde1a Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Tue, 9 Dec 2025 17:54:22 -0600 Subject: [PATCH 07/33] Adding fixed localport --- singularity/singularity-compose.yml | 2 +- yamls/hsp.yaml | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml index 275ccad..da4c1e9 100644 --- a/singularity/singularity-compose.yml +++ b/singularity/singularity-compose.yml @@ -20,8 +20,8 @@ instances: volumes: - ./logs:/logs - ./cache:/root/.cache - - ${MODEL_NAME}:${MODEL_NAME} - ./env.sh:/.singularity.d/env/env.sh + - ${MODEL_NAME}:${MODEL_NAME} rag: build: diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 4d5428e..13258db 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -374,6 +374,7 @@ jobs: target: '${{ inputs.resource.id }}' name: '${{ sessions.session }}' remoteHost: '${{ needs.create_session.outputs.target_hostname }}' + localPort: '${{ inputs.localport }}' 'on': @@ -455,6 +456,11 @@ jobs: textarea: true optional: true default: You are a careful assistant. Use ONLY the provided context blocks to answer. Each block is numbered [1], [2], … and includes source metadata. When you use information from a block, you MUST cite it inline with [n]. At the end of your response, include a 'References' section with one reference per line formatted as [n] file_path (chunk index). Do not invent citations or sources. If the context does not contain the answer, say so briefly. + localport: + label: User Workspace Port + default: '5555' + tooltip: Port that runs within the user workspace and used to connect to the code assist and chat interfaces. + type: string slurm: type: group label: SLURM Directives From b286838d68508e7179f4e55ab94f826e30479763 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Tue, 9 Dec 2025 17:58:58 -0600 Subject: [PATCH 08/33] Updating model selection --- singularity/singularity-compose.yml | 2 +- start_service.sh | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml index da4c1e9..a59666d 100644 --- a/singularity/singularity-compose.yml +++ b/singularity/singularity-compose.yml @@ -21,7 +21,7 @@ instances: - ./logs:/logs - ./cache:/root/.cache - ./env.sh:/.singularity.d/env/env.sh - - ${MODEL_NAME}:${MODEL_NAME} + - __MODEL_NAME__:__MODEL_NAME__ rag: build: diff --git a/start_service.sh b/start_service.sh index f0ef4d2..24c8399 100755 --- a/start_service.sh +++ b/start_service.sh @@ -224,6 +224,9 @@ elif [ "$RUNMODE" == "singularity" ]; then sed -i "s|^[#[:space:]]*\(export[[:space:]]\+\)\?MODEL_NAME=.*|export MODEL_NAME=$MODEL_NAME|" env.sh sed -i "s|^[#[:space:]]*\(export[[:space:]]\+\)\?DOCS_DIR=.*|export DOCS_DIR=$DOCS_DIR|" env.sh sed -i "s|__VLLM_EXTRA_ARGS__|${VLLM_EXTRA_ARGS}|" env.sh + + sed -i "s|__MODEL_NAME__|${MODEL_NAME}|" singularity-compose.yml + # Disable weight download # Check if cache/huggingface directory exists if [ -d "cache/huggingface" ]; then From ca49ffd7d45a99712696a8f0f181ceb17a8d8183 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Tue, 9 Dec 2025 19:44:30 -0600 Subject: [PATCH 09/33] fixing model replacement --- start_service.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/start_service.sh b/start_service.sh index 24c8399..e7c9b6f 100755 --- a/start_service.sh +++ b/start_service.sh @@ -224,8 +224,8 @@ elif [ "$RUNMODE" == "singularity" ]; then sed -i "s|^[#[:space:]]*\(export[[:space:]]\+\)\?MODEL_NAME=.*|export MODEL_NAME=$MODEL_NAME|" env.sh sed -i "s|^[#[:space:]]*\(export[[:space:]]\+\)\?DOCS_DIR=.*|export DOCS_DIR=$DOCS_DIR|" env.sh sed -i "s|__VLLM_EXTRA_ARGS__|${VLLM_EXTRA_ARGS}|" env.sh - - sed -i "s|__MODEL_NAME__|${MODEL_NAME}|" singularity-compose.yml + + sed -i "s|__MODEL_NAME__|${MODEL_NAME}|g" singularity-compose.yml # Disable weight download # Check if cache/huggingface directory exists From e93febd6efa821052e24b5c0e84a0c0c881885b9 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Wed, 10 Dec 2025 10:52:26 -0600 Subject: [PATCH 10/33] Revising exit with break so workflow finishes on walltime exit --- yamls/hsp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 13258db..7b4184c 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -154,7 +154,7 @@ jobs: job_status=$(sacct -j ${jobid} --format=state | tail -n1) echo "$(date) Job exited with status ${job_status}" touch job.ended - exit 0 + break fi done cleanup: | From 3898e4728a3b4fc9a79b35f2014fee051776fde7 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Wed, 10 Dec 2025 12:11:42 -0600 Subject: [PATCH 11/33] Cleaning up model location parameters --- singularity/singularity-compose.yml | 4 ++-- start_service.sh | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml index a59666d..4c5f60a 100644 --- a/singularity/singularity-compose.yml +++ b/singularity/singularity-compose.yml @@ -16,12 +16,12 @@ instances: run: args: - > - nohup python3 -m vllm.entrypoints.openai.api_server --model "${MODEL_NAME}" --tokenizer "${MODEL_NAME}" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 & + nohup python3 -m vllm.entrypoints.openai.api_server --model "${__MODEL_BASE__}" --tokenizer "${__MODEL_BASE__}" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 & volumes: - ./logs:/logs - ./cache:/root/.cache - ./env.sh:/.singularity.d/env/env.sh - - __MODEL_NAME__:__MODEL_NAME__ + - __MODEL_PATH__:__MODEL_BASE__ rag: build: diff --git a/start_service.sh b/start_service.sh index e7c9b6f..f8822a6 100755 --- a/start_service.sh +++ b/start_service.sh @@ -225,7 +225,12 @@ elif [ "$RUNMODE" == "singularity" ]; then sed -i "s|^[#[:space:]]*\(export[[:space:]]\+\)\?DOCS_DIR=.*|export DOCS_DIR=$DOCS_DIR|" env.sh sed -i "s|__VLLM_EXTRA_ARGS__|${VLLM_EXTRA_ARGS}|" env.sh - sed -i "s|__MODEL_NAME__|${MODEL_NAME}|g" singularity-compose.yml + # get the base model name + MODEL_PATH="${MODEL_NAME}" + MODEL_BASE=$(basename $MODEL_NAME) + + sed -i "s|__MODEL_PATH__|${MODEL_PATH}|g" singularity-compose.yml + sed -i "s|__MODEL_BASE__|${MODEL_BASE}|g" singularity-compose.yml # Disable weight download # Check if cache/huggingface directory exists From 76e584a4a78919447dc256caddbc30f6b1d9f595 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Wed, 10 Dec 2025 12:18:08 -0600 Subject: [PATCH 12/33] Cleaning up model location parameters --- singularity/singularity-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml index 4c5f60a..f8d5f81 100644 --- a/singularity/singularity-compose.yml +++ b/singularity/singularity-compose.yml @@ -16,12 +16,12 @@ instances: run: args: - > - nohup python3 -m vllm.entrypoints.openai.api_server --model "${__MODEL_BASE__}" --tokenizer "${__MODEL_BASE__}" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 & + nohup python3 -m vllm.entrypoints.openai.api_server --model "./__MODEL_BASE__" --tokenizer "./__MODEL_BASE__" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 & volumes: - ./logs:/logs - ./cache:/root/.cache - ./env.sh:/.singularity.d/env/env.sh - - __MODEL_PATH__:__MODEL_BASE__ + - __MODEL_PATH__:./__MODEL_BASE__ rag: build: From 6d3c4fe9bd548d383f668f1379fe0a56a6c67c9d Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Wed, 10 Dec 2025 12:28:43 -0600 Subject: [PATCH 13/33] Changing to absolute path --- singularity/singularity-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml index f8d5f81..1385809 100644 --- a/singularity/singularity-compose.yml +++ b/singularity/singularity-compose.yml @@ -16,12 +16,12 @@ instances: run: args: - > - nohup python3 -m vllm.entrypoints.openai.api_server --model "./__MODEL_BASE__" --tokenizer "./__MODEL_BASE__" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 & + nohup python3 -m vllm.entrypoints.openai.api_server --model "/__MODEL_BASE__" --tokenizer "/__MODEL_BASE__" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 & volumes: - ./logs:/logs - ./cache:/root/.cache - ./env.sh:/.singularity.d/env/env.sh - - __MODEL_PATH__:./__MODEL_BASE__ + - __MODEL_PATH__:/__MODEL_BASE__ rag: build: From ba5d3fa721b92426658b95a51f19f5120491e4ce Mon Sep 17 00:00:00 2001 From: Alvaro Vidal Torreira Date: Fri, 12 Dec 2025 18:27:48 +0100 Subject: [PATCH 14/33] Move gress and contraint directives to the scheduler_directives parameter (#32) --- yamls/hsp.yaml | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 7b4184c..183cef3 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -82,11 +82,9 @@ jobs: chmod +x run.sh echo "#SBATCH --account=${{ inputs.slurm.account }}" >> run.sh echo "#SBATCH --qos=${{ inputs.slurm.qos }}" >> run.sh - echo "#SBATCH --constraint=${{ inputs.slurm.constraint }}" >> run.sh if [[ "${{ inputs.slurm.partition }}" != "undefined" && "${{ inputs.slurm.partition }}" != "" ]]; then echo "#SBATCH --partition=${{ inputs.slurm.partition }}" >> run.sh fi - echo "#SBATCH --gres=gpu:${{ inputs.slurm.number_of_gpus }}" >> run.sh echo "#SBATCH --cpus-per-task=${{ inputs.slurm.cpus_per_task }}" >> run.sh echo "#SBATCH --nodes=1" >> run.sh echo "#SBATCH --time=${{ inputs.slurm.time }}" >> run.sh @@ -491,17 +489,6 @@ jobs: ignore: ${{ inputs.execmethod != 'SLURM' || 'existing' != inputs.resource.provider }} optional: ${{ .ignore }} hidden: ${{ .ignore }} - constraint: - label: Constraint - type: dropdown - ignore: ${{ inputs.execmethod != 'SLURM' || 'existing' != inputs.resource.provider }} - hidden: ${{ .ignore }} - default: viz - options: - - value: viz - label: viz - - value: mla - label: mla cpus_per_task: type: number label: CPUs per task @@ -511,21 +498,14 @@ jobs: max: 32 default: 1 tooltip: '--cpus-per-task=value SLURM directive' - number_of_gpus: - type: number - label: Number of GPUs - ignore: ${{ inputs.execmethod != 'SLURM' || 'existing' != inputs.resource.provider }} - hidden: ${{ .ignore }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' scheduler_directives: type: editor ignore: ${{ inputs.execmethod != 'SLURM' }} optional: true - tooltip: | - Type in additional scheduler directives. + tooltip: Type in additional scheduler directives. + default: | + #SBATCH --constraint=mla + #SBATCH --gres=gpu:4 time: label: Walltime type: string From b99999e2e74a4457cb7a8f8c6bfd6a3de862e195 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Fri, 12 Dec 2025 11:30:04 -0600 Subject: [PATCH 15/33] Generalizing the yaml for different useres --- yamls/hsp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 183cef3..a38a2cc 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -424,7 +424,7 @@ jobs: default: false hfmodel: label: HF Model - default: /p/work1/mshaxted/Llama-3_3-Nemotron-Super-49B-v1_5 + default: /p/work1/${USER}/Llama-3_3-Nemotron-Super-49B-v1_5 type: string vllm_extra_args: label: VLLM Extra Args From e5d12f37b2fc029ad5b1693c9d2837c3ba2e68b4 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sat, 13 Dec 2025 11:40:29 -0600 Subject: [PATCH 16/33] Updating readme with manual data requirements for model and container --- README.md | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 25140ea..c454820 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# ACTIVATE — vLLM + RAG + Proxy +# ACTIVATE — vLLM + RAG This Compose stack runs from the [github repo here](https://github.com/parallelworks/activate-rag-vllm) and executes the below services in Docker or Singularity modes: @@ -14,7 +14,24 @@ See a turnkey demonstration of the workflow running on ACTIVATE at the link belo -## Quickstart +## Workflow Instructions + +Pull down the weights of your choice into a known directory. For example we recommend using git lfs to pull down weights as this is more widely open to firewalls and is relatively fast at pulls: + +``` +cd /mymodeldir/ +git lfs install +git clone https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5 +``` + +The workflow will provide a field to also pull down a prebuilt vllm singularity container if running in this mode, but you can also pull this down manually for example using the authenticated pw cli: + +``` +cd ~/pw/activate-rag-vllm +pw buckets cp pw://mshaxted/codeassist/vllm.sif ./ +``` + +## Manual Quickstart ```bash export HF_TOKEN=hf_xyz export RUNMODE=docker # or singularity From 059cc7d3d0d64006fe88658c1bd947e83afeabc4 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sat, 13 Dec 2025 11:56:12 -0600 Subject: [PATCH 17/33] Updating yaml to provide a container pull option --- yamls/hsp.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index a38a2cc..17a9562 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -65,6 +65,19 @@ jobs: echo "$(date) ERROR: Failed to install singularity-compose" exit 1 fi + - name: Pull Singularity Containers + if: ${{ inputs.runmode == 'singularity' && inputs.pull == true }} + early-cancel: any-job-failed + run: | + set -x + cd ${{ inputs.rundir }} + echo pulling vllm.sif from ${{ inputs.container_bucket }} + pw bucket cp ${{ inputs.container_bucket }}/vllm.sif ./ + + if [[ "${{ inputs.runmode }}" == "all" ]]; then + echo pulling rag.sif from ${{ inputs.container_bucket }} + pw bucket cp ${{ inputs.container_bucket }}/rag.sif ./ + fi slurm_job: needs: @@ -422,6 +435,16 @@ jobs: label: Build Containers type: boolean default: false + pull: + label: Pull Containers + type: boolean + default: true + container_bucket: + label: Container Bucket (.sif) + hidden: ${{ inputs.runmode != 'singularity' }} + type: string + default: pw://mshaxted/codeassist + help: PW singularity container bucket that holds vllm.sif and rag.sif containers hfmodel: label: HF Model default: /p/work1/${USER}/Llama-3_3-Nemotron-Super-49B-v1_5 From a0aae85eea16ab622fc3ae7d74f9a6978191dc42 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sat, 13 Dec 2025 12:01:29 -0600 Subject: [PATCH 18/33] Updating container conditional logic --- yamls/hsp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 17a9562..bcff9d0 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -441,7 +441,7 @@ jobs: default: true container_bucket: label: Container Bucket (.sif) - hidden: ${{ inputs.runmode != 'singularity' }} + hidden: ${{ inputs.runmode != 'singularity' && inputs.pull == true }} type: string default: pw://mshaxted/codeassist help: PW singularity container bucket that holds vllm.sif and rag.sif containers From 81461ef4221857de28587f1f31c22c4b484b4410 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sat, 13 Dec 2025 12:02:19 -0600 Subject: [PATCH 19/33] Updating tooltip --- yamls/hsp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index bcff9d0..32274f0 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -444,7 +444,7 @@ jobs: hidden: ${{ inputs.runmode != 'singularity' && inputs.pull == true }} type: string default: pw://mshaxted/codeassist - help: PW singularity container bucket that holds vllm.sif and rag.sif containers + tooltip: PW singularity container bucket that holds vllm.sif and rag.sif containers hfmodel: label: HF Model default: /p/work1/${USER}/Llama-3_3-Nemotron-Super-49B-v1_5 From 43277936516e4bb8e221452ff730cfc7895108a7 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sat, 13 Dec 2025 12:04:18 -0600 Subject: [PATCH 20/33] Updating tooltip --- yamls/hsp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 32274f0..0e51940 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -441,7 +441,7 @@ jobs: default: true container_bucket: label: Container Bucket (.sif) - hidden: ${{ inputs.runmode != 'singularity' && inputs.pull == true }} + hidden: ${{ inputs.runmode != 'singularity' || inputs.pull != true }} type: string default: pw://mshaxted/codeassist tooltip: PW singularity container bucket that holds vllm.sif and rag.sif containers From df8b1a6c60106139e02e20e81ee83d1dae1712dc Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sat, 13 Dec 2025 12:05:09 -0600 Subject: [PATCH 21/33] Updating tooltip --- yamls/hsp.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 0e51940..60c76b1 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -437,6 +437,7 @@ jobs: default: false pull: label: Pull Containers + hidden: ${{ inputs.runmode == 'docker' }} type: boolean default: true container_bucket: From 7658ff40805e3d84e49b570c508ed0992bea11af Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sat, 13 Dec 2025 12:14:55 -0600 Subject: [PATCH 22/33] Adding comment for constraints --- yamls/hsp.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 60c76b1..eb92b62 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -528,8 +528,10 @@ jobs: optional: true tooltip: Type in additional scheduler directives. default: | - #SBATCH --constraint=mla #SBATCH --gres=gpu:4 + + ##SBATCH --constraint=mla # uncomment for Navy and AFRL DSRC systems + time: label: Walltime type: string From 7a30d72928e55473af79f244fbfeed7f6e88a506 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sat, 13 Dec 2025 12:22:45 -0600 Subject: [PATCH 23/33] skipping pull if containers already exist --- yamls/hsp.yaml | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index eb92b62..5d5767f 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -71,12 +71,23 @@ jobs: run: | set -x cd ${{ inputs.rundir }} - echo pulling vllm.sif from ${{ inputs.container_bucket }} - pw bucket cp ${{ inputs.container_bucket }}/vllm.sif ./ + + # vllm container + if [[ ! -f "vllm.sif" ]]; then + echo "vllm.sif not found, pulling from ${{ inputs.container_bucket }}" + pw bucket cp "${{ inputs.container_bucket }}/vllm.sif" ./ + else + echo "vllm.sif already exists, skipping pull" + fi + # rag container (only for runmode=all) if [[ "${{ inputs.runmode }}" == "all" ]]; then - echo pulling rag.sif from ${{ inputs.container_bucket }} - pw bucket cp ${{ inputs.container_bucket }}/rag.sif ./ + if [[ ! -f "rag.sif" ]]; then + echo "rag.sif not found, pulling from ${{ inputs.container_bucket }}" + pw bucket cp "${{ inputs.container_bucket }}/rag.sif" ./ + else + echo "rag.sif already exists, skipping pull" + fi fi slurm_job: From 3b74aca7e6efeb00743f0a588947f8ea6ae40e57 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sat, 13 Dec 2025 12:23:05 -0600 Subject: [PATCH 24/33] skipping pull if containers already exist --- yamls/hsp.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 5d5767f..e919736 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -90,6 +90,8 @@ jobs: fi fi + echo Singularity container pull step complete. + slurm_job: needs: - prepare_job_directory From 7752919bbe7a9e6b3893b0a149fcc8934be01e94 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sat, 13 Dec 2025 15:24:43 -0600 Subject: [PATCH 25/33] Adding ticktoken encodings for gptoss --- yamls/hsp.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index e919736..1c0c267 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -35,6 +35,7 @@ jobs: echo "export DOCS_DIR=${{ inputs.docsdir }}" >> .run.env echo "export VLLM_EXTRA_ARGS=\"${{ inputs.vllm_extra_args }}\"" >> .run.env echo "export TRANSFORMERS_OFFLINE=1" >> .run.env + echo "export TIKTOKEN_ENCODINGS_BASE=/root/.cache/tiktoken_encodings" >> .run.env - name: Install Singularity Compose if: ${{ inputs.runmode == 'singularity' }} @@ -91,6 +92,15 @@ jobs: fi echo Singularity container pull step complete. + - name: Pull Tiktoken Encodings + if: ${{ inputs.advanced_settings.tiktoken_encodings == true }} + early-cancel: any-job-failed + run: | + set -x + cd ${{ inputs.rundir }} + mkdir -p cache/tiktoken_encodings + wget -O cache/tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" + wget -O cache/tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" slurm_job: needs: @@ -586,3 +596,8 @@ jobs: type: string label: Repository Branch default: nemotron + tiktoken_encodings: + label: Pull Tiktoken Encodings + tooltip: For GPT-OSS pull the tiktoken encodings. + type: boolean + default: false \ No newline at end of file From f6b32d63423ff3c5435fd73a428a50d3f546909b Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sat, 13 Dec 2025 15:46:00 -0600 Subject: [PATCH 26/33] Fixing singularity loading if already in path --- yamls/hsp.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index 1c0c267..de80195 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -136,7 +136,17 @@ jobs: echo "touch job.started" >> run.sh echo "hostname >> HOSTNAME" >> run.sh - echo "module load singularity" >> run.sh + # load singularity via module if not already in path + cat << 'EOF' >> run.sh + # Ensure Singularity is available + if ! command -v singularity >/dev/null 2>&1; then + if command -v module >/dev/null 2>&1; then + module load singularity || module load apptainer + else + echo "ERROR: singularity/apptainer not found" >&2 + fi + fi + EOF cat start_service.sh >> run.sh - name: Submit SLURM Script From 686bfb9effbb171b15c62184d3cad544f83a662b Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sun, 14 Dec 2025 08:21:56 -0600 Subject: [PATCH 27/33] Updating the ATTN settings for latest vllm update --- singularity/env.sh.example | 2 +- yamls/hsp.yaml | 32 +++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/singularity/env.sh.example b/singularity/env.sh.example index a234cf3..3a03eb3 100644 --- a/singularity/env.sh.example +++ b/singularity/env.sh.example @@ -23,7 +23,7 @@ export HF_HOME="/root/.cache/huggingface" # Recommended on T4/V100 and for mistral tokenizer export DOCS_DIR=./docs -export VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 +export VLLM_ATTENTION_BACKEND=TRITON_ATTN export VLLM_EXTRA_ARGS="__VLLM_EXTRA_ARGS__" export TRITON_CC=gcc export CC=/usr/bin/gcc diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml index c24611d..aa083a9 100644 --- a/yamls/hsp.yaml +++ b/yamls/hsp.yaml @@ -36,6 +36,7 @@ jobs: echo "export VLLM_EXTRA_ARGS=\"${{ inputs.vllm_extra_args }}\"" >> .run.env echo "export TRANSFORMERS_OFFLINE=1" >> .run.env echo "export TIKTOKEN_ENCODINGS_BASE=/root/.cache/tiktoken_encodings" >> .run.env + echo "export VLLM_ATTENTION_BACKEND=${{ inputs.advanced_settings.vllm_attention_backend }}" >> .run.env - name: Install Singularity Compose if: ${{ inputs.runmode == 'singularity' }} @@ -607,7 +608,36 @@ jobs: label: Repository Branch default: nemotron tiktoken_encodings: - label: Pull Tiktoken Encodings + label: Pull Encodings tooltip: For GPT-OSS pull the tiktoken encodings. type: boolean default: false + vllm_attention_backend: + type: dropdown + label: VLLM Attention Backend + default: TRITON_ATTN + tooltip: Select the attention backend implementation used by vLLM + options: + - value: FLASH_ATTN + - value: TRITON_ATTN + - value: ROCM_ATTN + - value: ROCM_AITER_MLA + - value: ROCM_AITER_TRITON_MLA + - value: ROCM_AITER_FA + - value: ROCM_AITER_MLA_SPARSE + - value: TORCH_SDPA + - value: FLASHINFER + - value: FLASHINFER_MLA + - value: TRITON_MLA + - value: CUTLASS_MLA + - value: FLASHMLA + - value: FLASHMLA_SPARSE + - value: FLASH_ATTN_MLA + - value: PALLAS + - value: IPEX + - value: NO_ATTENTION + - value: FLEX_ATTENTION + - value: TREE_ATTN + - value: ROCM_AITER_UNIFIED_ATTN + - value: CPU_ATTN + - value: CUSTOM \ No newline at end of file From 24e750ba69621eb36f27713d1c424b9645d5a0c5 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sun, 14 Dec 2025 09:00:39 -0600 Subject: [PATCH 28/33] Adding sagemaker fix for updated vllm --- singularity/singularity-compose.yml | 1 + start_service.sh | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml index 1385809..1cb2430 100644 --- a/singularity/singularity-compose.yml +++ b/singularity/singularity-compose.yml @@ -20,6 +20,7 @@ instances: volumes: - ./logs:/logs - ./cache:/root/.cache + - ./cache/sagemaker_sessions:/dev/shm/sagemaker_sessions - ./env.sh:/.singularity.d/env/env.sh - __MODEL_PATH__:/__MODEL_BASE__ diff --git a/start_service.sh b/start_service.sh index f8822a6..37173e8 100755 --- a/start_service.sh +++ b/start_service.sh @@ -245,6 +245,10 @@ elif [ "$RUNMODE" == "singularity" ]; then mkdir -p logs cache cache/chroma $DOCS_DIR + # fixing updated vllm sagemarker sessions issue + mkdir -p cache/sagemaker_sessions + chmod 700 cache/sagemaker_sessions + # singularity-compose does not support env variables in the yml config file if [ "$DOCS_DIR" != "./docs" ];then ln -s $DOCS_DIR ./docs From ac616f1de2da57c378cca3f88970139226d21698 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sun, 14 Dec 2025 09:29:48 -0600 Subject: [PATCH 29/33] Updating singularity build file --- singularity/Singularity.vllm | 53 ++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/singularity/Singularity.vllm b/singularity/Singularity.vllm index 9aa97b8..95edf24 100644 --- a/singularity/Singularity.vllm +++ b/singularity/Singularity.vllm @@ -1,11 +1,52 @@ Bootstrap: docker From: vllm/vllm-openai:latest +%post + set -eux + + apt-get update + apt-get install -y --no-install-recommends \ + clang lld llvm \ + build-essential \ + git ca-certificates curl \ + pkg-config cmake ninja-build + rm -rf /var/lib/apt/lists/* + + # Pick a Python interpreter that actually exists in the base image + if command -v python >/dev/null 2>&1; then + PY=python + elif command -v python3 >/dev/null 2>&1; then + PY=python3 + elif [ -x /opt/conda/bin/python ]; then + PY=/opt/conda/bin/python + else + echo "No Python interpreter found (python/python3/conda)."; exit 1 + fi + + # Prefer clang for any builds happening inside the container + echo "export CC=clang" >> /etc/profile.d/clang.sh + echo "export CXX=clang++" >> /etc/profile.d/clang.sh + + # Upgrade packaging tooling + $PY -m pip install -U pip setuptools wheel + + # Ensure Transformers has the ministral3 config mapping + $PY -m pip uninstall -y transformers || true + $PY -m pip install -U git+https://github.com/huggingface/transformers + + $PY - << 'PY' +import transformers +from transformers.models.auto import CONFIG_MAPPING +print("Transformers:", transformers.__version__) +print("ministral3 in CONFIG_MAPPING:", "ministral3" in CONFIG_MAPPING) +PY + %runscript -mkdir -p /app -cd /app -exec /bin/bash -lc "$@" + mkdir -p /app + cd /app + exec /bin/bash -lc "$@" + %startscript -mkdir -p /app -cd /app -exec /bin/bash -lc "$@" \ No newline at end of file + mkdir -p /app + cd /app + exec /bin/bash -lc "$@" \ No newline at end of file From ddfb3a25a7eeaa54bd30c5858f49e09793025e3f Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sun, 14 Dec 2025 11:10:20 -0600 Subject: [PATCH 30/33] More gracefully exit log on fail when container fails --- start_service.sh | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/start_service.sh b/start_service.sh index 37173e8..5028ff8 100755 --- a/start_service.sh +++ b/start_service.sh @@ -267,7 +267,18 @@ elif [ "$RUNMODE" == "singularity" ]; then [ "$BUILD" = "true" ] && singularity-compose build "${RUNTYPE}1" singularity-compose up "${RUNTYPE}1" fi - # Follow the logs - tail -f logs/* + + # Only follow logs if up succeeded + # Make tail die when this script dies (and don't explode if logs don't exist yet) + shopt -s nullglob + logs=(logs/*) + if ((${#logs[@]} > 0)); then + tail -F "${logs[@]}" & + tail_pid=$! + trap 'kill "$tail_pid" >/dev/null 2>&1 || true; cleanup' EXIT + wait "$tail_pid" + else + echo "No logs found under logs/. Skipping tail." + fi fi From c640078cbba3e74d230464b3785abac3f90ccdac Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Sun, 14 Dec 2025 11:13:00 -0600 Subject: [PATCH 31/33] Removing bad char --- singularity/env.sh.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singularity/env.sh.example b/singularity/env.sh.example index 3a03eb3..fd93b28 100644 --- a/singularity/env.sh.example +++ b/singularity/env.sh.example @@ -32,7 +32,7 @@ export CXX=/usr/bin/g++ export TMPDIR=${PWD}/tmp export CUDA_CACHE_PATH=${TMPDIR}/cuda_cache export TORCH_EXTENSIONS_DIR=${TMPDIR}/torch_extensions -export FLASHINFER_JIT_DIR=${TMPDIR}/flashinfer_jitß +export FLASHINFER_JIT_DIR=${TMPDIR}/flashinfer_jit # Other VLLM tuning settings export VLLM_LOGGING_LEVEL=INFO From e5382b81d95cbfd6e697087a1f57e455226b5de9 Mon Sep 17 00:00:00 2001 From: Matthew Shaxted Date: Mon, 15 Dec 2025 15:46:15 -0600 Subject: [PATCH 32/33] Fixing the shm dir issue --- start_service.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/start_service.sh b/start_service.sh index 5028ff8..97dc180 100755 --- a/start_service.sh +++ b/start_service.sh @@ -203,7 +203,7 @@ elif [ "$RUNMODE" == "singularity" ]; then cp singularity/* ./ -Rf cp env.sh.example env.sh - + VLLM_SERVER_PORT=$(findAvailablePort) RAG_PORT=$(findAvailablePort) PROXY_PORT=$(findAvailablePort) @@ -249,6 +249,9 @@ elif [ "$RUNMODE" == "singularity" ]; then mkdir -p cache/sagemaker_sessions chmod 700 cache/sagemaker_sessions + mkdir -p /dev/shm/sagemaker_sessions + chmod 700 /dev/shm/sagemaker_sessions + # singularity-compose does not support env variables in the yml config file if [ "$DOCS_DIR" != "./docs" ];then ln -s $DOCS_DIR ./docs From 8f332332080a58a702405227256bf8805aa70ed5 Mon Sep 17 00:00:00 2001 From: Alvaro Vidal Torreira Date: Fri, 16 Jan 2026 17:40:43 +0100 Subject: [PATCH 33/33] Add support for AECM (#40) * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Updating for support on AECM * Revert change * transition code --- controller.sh | 51 ++ singularity/env.sh.example | 1 + singularity/singularity-compose.yml | 2 +- start_service.sh | 35 +- yamls/emed.yaml | 741 +++++++++++++++------------- 5 files changed, 459 insertions(+), 371 deletions(-) create mode 100644 controller.sh diff --git a/controller.sh b/controller.sh new file mode 100644 index 0000000..100eec8 --- /dev/null +++ b/controller.sh @@ -0,0 +1,51 @@ + + +if [[ "${service_runmode}" == "singularity" ]];then + # Check if singularity-compose is installed globally + if ! command -v singularity-compose &> /dev/null; then + # Check if virtual environment exists and activate it + if [ -d ~/pw/software/singularity-compose ]; then + source ~/pw/software/singularity-compose/bin/activate + fi + # Check again if singularity-compose is available after activation + if ! command -v singularity-compose &> /dev/null; then + echo "$(date) singularity-compose not found, installing..." + # Create directory for Python environment + mkdir -p ~/pw/software + + # Create virtual environment named singularity-compose and install singularity-compose + python3 -m venv ~/pw/software/singularity-compose + source ~/pw/software/singularity-compose/bin/activate + pip install --upgrade pip + pip install singularity-compose + fi + fi + if ! command -v singularity-compose >/dev/null 2>&1; then + echo "$(date) Error: Failed to install singularity-compose" + exit 1 + fi +fi + +if ! [ -z "${service_container_bucket}" ]; then + # vllm container + if [[ ! -f "vllm.sif" ]]; then + echo "$(date) vllm.sif not found, pulling from ${service_container_bucket}" + pw bucket cp "${service_container_bucket}/vllm.sif" ./ + else + echo "$(date) vllm.sif already exists, skipping pull" + fi + # rag container (only for runmode=all) + if [[ "${service_runmode}" == "all" ]]; then + if [[ ! -f "rag.sif" ]]; then + echo "$(date) rag.sif not found, pulling from ${service_container_bucket}" + pw bucket cp "${service_container_bucket}/rag.sif" ./ + else + echo "$(date) rag.sif already exists, skipping pull" + fi + fi + echo "$(date) Singularity container pull step complete." +fi + +mkdir -p cache/tiktoken_encodings +wget --no-check-certificate -O cache/tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" || true +wget --no-check-certificate -O cache/tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" || true \ No newline at end of file diff --git a/singularity/env.sh.example b/singularity/env.sh.example index fd93b28..0b883c0 100644 --- a/singularity/env.sh.example +++ b/singularity/env.sh.example @@ -29,6 +29,7 @@ export TRITON_CC=gcc export CC=/usr/bin/gcc export CXX=/usr/bin/g++ + export TMPDIR=${PWD}/tmp export CUDA_CACHE_PATH=${TMPDIR}/cuda_cache export TORCH_EXTENSIONS_DIR=${TMPDIR}/torch_extensions diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml index 1cb2430..8b526a7 100644 --- a/singularity/singularity-compose.yml +++ b/singularity/singularity-compose.yml @@ -16,7 +16,7 @@ instances: run: args: - > - nohup python3 -m vllm.entrypoints.openai.api_server --model "/__MODEL_BASE__" --tokenizer "/__MODEL_BASE__" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 & + nohup python3 -m vllm.entrypoints.openai.api_server --model "/__MODEL_BASE__" --tokenizer "/__MODEL_BASE__" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 & volumes: - ./logs:/logs - ./cache:/root/.cache diff --git a/start_service.sh b/start_service.sh index 97dc180..40f215a 100755 --- a/start_service.sh +++ b/start_service.sh @@ -30,15 +30,6 @@ install_docker_compose(){ chmod +x docker-compose } -findAvailablePort() { - availablePort=$(pw agent open-port) - echo ${availablePort} - if [ -z "${availablePort}" ]; then - echo "$(date) ERROR: No port found. Exiting job" - exit 1 - fi -} - start_rootless_docker() { local MAX_RETRIES=20 local RETRY_INTERVAL=2 @@ -116,13 +107,15 @@ if [ "$RUNMODE" == "docker" ];then cp docker/* ./ -Rf cp env.example .env - VLLM_SERVER_PORT=$(findAvailablePort) - PROXY_PORT=$(findAvailablePort) - - if [ "$RUNTYPE" == "vllm" ];then + if [ "$RUNTYPE" == "all" ];then + VLLM_SERVER_PORT=$(pw agent open-port) + PROXY_PORT=${service_port} + # TRANSITION CODE echo "SESSION_PORT=${VLLM_SERVER_PORT}" > SESSION_PORT else - echo "SESSION_PORT=${PROXY_PORT}" > SESSION_PORT + PROXY_PORT=$(pw agent open-port) + VLLM_SERVER_PORT=${service_port} + echo "SESSION_PORT=${VLLM_SERVER_PORT}" > SESSION_PORT fi sed -i "s/^VLLM_SERVER_PORT=.*/VLLM_SERVER_PORT=${VLLM_SERVER_PORT}/" .env @@ -204,15 +197,17 @@ elif [ "$RUNMODE" == "singularity" ]; then cp singularity/* ./ -Rf cp env.sh.example env.sh - VLLM_SERVER_PORT=$(findAvailablePort) - RAG_PORT=$(findAvailablePort) - PROXY_PORT=$(findAvailablePort) - CHROMA_PORT=$(findAvailablePort) + RAG_PORT=$(pw agent open-port) + CHROMA_PORT=$(pw agent open-port) if [ "$RUNTYPE" == "all" ];then - echo "SESSION_PORT=${PROXY_PORT}" > SESSION_PORT + VLLM_SERVER_PORT=$(pw agent open-port) + PROXY_PORT=${service_port} + echo "SESSION_PORT=${VLLM_SERVER_PORT}" > SESSION_PORT else - echo "SESSION_PORT=${VLLM_SERVER_PORT}" > SESSION_PORT + PROXY_PORT=$(pw agent open-port) + VLLM_SERVER_PORT=${service_port} + echo "SESSION_PORT=${VLLM_SERVER_PORT}" > SESSION_PORT fi sed -i "s/^export VLLM_SERVER_PORT=.*/export VLLM_SERVER_PORT=${VLLM_SERVER_PORT}/" env.sh diff --git a/yamls/emed.yaml b/yamls/emed.yaml index 1fd976e..e46eb92 100644 --- a/yamls/emed.yaml +++ b/yamls/emed.yaml @@ -4,273 +4,200 @@ sessions: session: redirect: false openAI: true - jobs: - prepare_job_directory: + preprocessing: + working-directory: ${{ inputs.service.rundir }} ssh: - remoteHost: ${{ inputs.resource.ip }} + remoteHost: ${{ inputs.cluster.resource.ip }} steps: - - name: Preparing Run Directory + - name: Checkout + uses: parallelworks/checkout + with: + repo: ${{ inputs.service.advanced_settings.repository }} + branch: ${{ inputs.service.advanced_settings.repository_branch }} + - name: Create Environment File + early-cancel: any-job-failed run: | - set -x - mkdir -p $(dirname ${{ inputs.rundir }}) - git clone -b ${{ inputs.advanced_settings.repository_branch }} ${{ inputs.advanced_settings.repository }} ${{ inputs.rundir }} - cd ${{ inputs.rundir }} - git checkout ${{ inputs.advanced_settings.repository_branch }} - git branch --set-upstream-to=origin/${{ inputs.advanced_settings.repository_branch }} - git pull - rm -f jobid SESSION_PORT job.started job.ended run.out HOSTNAME + # FIXME: remove when issue 11915 is fixed + cp -rfT ${PW_PARENT_JOB_DIR}/ ./ + rm -f SESSION_PORT job.started job.ended run.out HOSTNAME rm -rf logs - - name: Install Singularity Compose - if: ${{ inputs.runmode == 'singularity' }} - early-cancel: any-job-failed + cat > .run.env << 'EOF' + export RUNMODE="${{ inputs.service.runmode }}" + export BUILD="${{ inputs.service.build }}" + export RUNTYPE="${{ inputs.service.runtype }}" + export SYSTEM_PROMPT="${{ inputs.service.systemprompt }}" + export HF_TOKEN="${{ inputs.service.hftoken }}" + export MODEL_NAME="${{ inputs.service.hfmodel }}" + export API_KEY="${{ inputs.service.apikey }}" + export DOCS_DIR="${{ inputs.service.docsdir }}" + export VLLM_EXTRA_ARGS="${{ inputs.service.vllm_extra_args }}" + export TRANSFORMERS_OFFLINE=1 + export TIKTOKEN_ENCODINGS_BASE="/root/.cache/tiktoken_encodings" + export VLLM_ATTENTION_BACKEND="${{ inputs.service.advanced_settings.vllm_attention_backend }}" + EOF + - name: Controller Preprocessing run: | - # Check if singularity-compose is installed globally - if ! command -v singularity-compose &> /dev/null; then - # Check if virtual environment exists and activate it - if [ -d ~/pw/software/singularity-compose ]; then - source ~/pw/software/singularity-compose/bin/activate - fi - # Check again if singularity-compose is available after activation - if ! command -v singularity-compose &> /dev/null; then - echo "$(date) singularity-compose not found, installing..." - - # Create directory for Python environment - mkdir -p ~/pw/software - - # Create virtual environment named singularity-compose and install singularity-compose - python3 -m venv ~/pw/software/singularity-compose - source ~/pw/software/singularity-compose/bin/activate - pip install --upgrade pip - pip install singularity-compose - fi - fi - if ! command -v singularity-compose >/dev/null 2>&1; then - echo "$(date) Error: Failed to install singularity-compose" - exit 1 - fi - - - name: Create Environment File - early-cancel: any-job-failed + set -x + export service_runmode=${{ inputs.service.runmode }} + bash controller.sh + - name: Create Service Script run: | set -x - cd ${{ inputs.rundir }} - echo "export RUNMODE=${{ inputs.runmode }}" > .run.env - echo "export BUILD=${{ inputs.build }}" >> .run.env - echo "export RUNTYPE=${{ inputs.runtype }}" >> .run.env - echo "export SYSTEM_PROMPT=\"${{ inputs.systemprompt }}\"" >> .run.env - echo "export HF_TOKEN=${{ inputs.hftoken }}" >> .run.env - echo "export API_KEY=${{ inputs.apikey }}" >> .run.env - echo "export MODEL_NAME=${{ inputs.hfmodel }}" >> .run.env - echo "export DOCS_DIR=${{ inputs.docsdir }}" >> .run.env - echo "export VLLM_EXTRA_ARGS=\"${{ inputs.vllm_extra_args }}\"" >> .run.env - echo "export TRANSFORMERS_OFFLINE=1" >> .run.env + # Write code common to all services + cat > start_service_mod.sh << 'EOF' - slurm_job: - needs: - - prepare_job_directory - if: ${{ inputs.execmethod == 'SLURM' }} - ssh: - remoteHost: ${{ inputs.resource.ip }} - steps: - - name: Create SLURM Script - early-cancel: any-job-failed - run: | - cd ${{ inputs.rundir }} - echo '#!/bin/bash' > run.sh - chmod +x run.sh - if [[ "${{ inputs.slurm.partition }}" != "undefined" ]]; then - echo "#SBATCH --partition=${{ inputs.slurm.partition }}" >> run.sh - fi - echo "#SBATCH --gres=gpu:${{ inputs.slurm.number_of_gpus }}" >> run.sh - echo "#SBATCH --cpus-per-task=${{ inputs.slurm.cpus_per_task }}" >> run.sh - echo "#SBATCH --mem=${{ inputs.slurm.memory }}" >> run.sh - echo "#SBATCH --chdir=${PWD}" >> run.sh - echo "#SBATCH -o ${PWD}/run.out" >> run.sh - echo "#SBATCH -e ${PWD}/run.out" >> run.sh - if [[ "${{ inputs.slurm.scheduler_directives }}" != "undefined" ]]; then - echo "${{ inputs.slurm.scheduler_directives }}" >> run.sh + if [ -z "${service_port}" ]; then + service_port=$(pw agent open-port) fi - - # Indicates job started running - echo "touch job.started" >> run.sh - echo "hostname >> HOSTNAME" >> run.sh - - echo "module load singularity" >> run.sh - - cat start_service.sh >> run.sh - - name: Submit SLURM Script - run: | - cd ${{ inputs.rundir }} - echo "$(date) Submitting SLURM Job" - jobid=$(sbatch run.sh | tail -1 | awk -F ' ' '{print $4}') - if [ -z "${jobid}" ]; then - echo "$(date) Job submission failed" - exit 1 + if [ -z "${service_port}" ]; then + echo "$(date) ERROR: No service port found" + exit 1 fi - echo "jobid=${jobid}" | tee -a $OUTPUTS | tee -a jobid - cleanup: | - set -x - cd ${{ inputs.rundir }} - #jobid=${{ needs.slurm_job.outputs.jobid }} - source jobid - target_hostname=$(squeue -j "${jobid}" --noheader --format="%N") - ssh ${target_hostname} bash cancel.sh - scancel ${jobid} - rm -f jobid SESSION_PORT job.started HOSTNAME - - name: Monitor SLURM Job - run: | - cd ${{ inputs.rundir }} - - #jobid=${{ needs.slurm_job.outputs.jobid }} - max_retries=10 - count=0 - - while [ $count -lt $max_retries ]; do - ls - source jobid - if [ -z "${jobid}" ]; then - echo "$(date) Job ID is empty. Retry $((count+1))/$max_retries" - count=$((count+1)) - sleep 5 - else - break - fi - done - echo "$(date) Monitoring SLURM job ${jobid}" + echo ${service_port} > SESSION_PORT + hostname > HOSTNAME - cd ${{ inputs.rundir }} - touch run.out - tail -f run.out & - echo &! > tail.pid - - get_slurm_job_status() { - # Get the header line to determine the column index corresponding to the job status - if [ -z "${SQUEUE_HEADER}" ]; then - export SQUEUE_HEADER="$(eval squeue | awk 'NR==1')" - fi - status_column=$(echo "${SQUEUE_HEADER}" | awk '{ for (i=1; i<=NF; i++) if ($i ~ /^S/) { print i; exit } }') - status_response=$(eval squeue | awk -v jobid="${jobid}" '$1 == jobid') - echo "${SQUEUE_HEADER}" - echo "${status_response}" - export job_status=$(echo ${status_response} | awk -v id="${jobid}" -v col="$status_column" '{print $col}') + cleanup() { + echo "$(date) Cleaning up..." + kill -- -$$ } - while true; do - sleep 15 - get_slurm_job_status - if [ -z "${job_status}" ]; then - job_status=$(sacct -j ${jobid} --format=state | tail -n1) - echo "$(date) Job exited with status ${job_status}" - touch job.ended - exit 0 - fi - done - cleanup: | - set -x - cd ${{ inputs.rundir }} - kill $(cat tail.pid) + trap cleanup EXIT INT TERM - ssh_job: + echo + echo + echo "$(date) STARTING SERVICE" + echo + touch job.started + EOF + pwd + cat start_service_mod.sh + cat start_service.sh >> start_service_mod.sh + session_runner: + working-directory: ${{ inputs.service.rundir }} needs: - - prepare_job_directory - if: ${{ inputs.execmethod == 'SSH' }} + - preprocessing ssh: - remoteHost: ${{ inputs.resource.ip }} + remoteHost: ${{ inputs.cluster.resource.ip }} steps: - - name: Create SSH Script + - uses: marketplace/script_submitter/v3.5 + early-cancel: any-job-failed + with: + resource: ${{ inputs.cluster.resource }} + shebang: '#!/bin/bash' + rundir: ${{ inputs.service.rundir }} + use_existing_script: true + script_path: ${{ inputs.service.rundir }}/start_service_mod.sh + scheduler: ${{ inputs.cluster.scheduler }} + slurm: + is_disabled: ${{ inputs.cluster.slurm.is_disabled }} + slurm_options: ${{ inputs.cluster.slurm.slurm_options }} + partition_default: ${{ inputs.cluster.slurm.partition_default }} + partition_hpc4: ${{ inputs.cluster.slurm.partition_hpc4 + cpus_per_task: ${{ inputs.cluster.slurm.cpus_per_task }} + mem: ${{ inputs.cluster.slurm.mem }} + gres_gpu_default: ${{ inputs.cluster.slurm.gres_gpu_default }} + gres_gpu_hpc4: ${{ inputs.cluster.slurm.gres_gpu_hpc4 }} + time: ${{ inputs.cluster.slurm.time }} + scheduler_directives: ${{ inputs.cluster.slurm.scheduler_directives }} + pbs: + is_disabled: ${{ inputs.cluster.pbs.is_disabled }} + scheduler_directives: ${{ inputs.cluster.pbs.scheduler_directives }} + - name: Notify job ended early-cancel: any-job-failed run: | - cd ${{ inputs.rundir }} - echo '#!/bin/bash' > run.sh - chmod +x run.sh - - # Indicates job started running - echo "touch job.started" >> run.sh - echo "hostname >> HOSTNAME" >> run.sh - - cat start_service.sh >> run.sh - - name: Submit SSH Script - run: | - cd ${{ inputs.rundir }} - bash ./run.sh - touch job.ended - rm -f jobid SESSION_PORT job.started HOSTNAME - cleanup: | set -x - cd ${{ inputs.rundir }} - bash cancel.sh - - create_session: + pwd + touch job.ended + ls -lat job.ended + wait_for_job_start: + working-directory: ${{ inputs.service.rundir }} needs: - - prepare_job_directory + - preprocessing ssh: - remoteHost: ${{ inputs.resource.ip }} + remoteHost: ${{ inputs.cluster.resource.ip }} steps: - name: Wait for job to start early-cancel: any-job-failed run: | set -x - while [ ! -f ${{ inputs.rundir }}/job.started ]; do - echo "Waiting for job to start..." + while [ ! -f job.started ]; do + if [ -f job.ended ]; then + echo "$(date) ERROR: Job ended before it started. Exiting." + exit 1 + fi + echo "$(date) Waiting for job to start..." sleep 5 done - name: Get Hostname early-cancel: any-job-failed run: | set -x - cd ${{ inputs.rundir }} - if [[ ${{ inputs.execmethod }} == "SLURM" ]]; then - source jobid - target_hostname=$(squeue -j "${jobid}" --noheader --format="%N") - # Add a retry for AECM - if [ -z "${target_hostname}" ]; then - sleep 30 - target_hostname=$(squeue -j "${jobid}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - elif [[ ${{ inputs.execmethod }} == "SSH" ]]; then - target_hostname=$(hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - fi - if [ -z "${target_hostname}" ]; then + HOSTNAME=$(cat HOSTNAME | cut -d'.' -f1) + echo "HOSTNAME=${HOSTNAME}" | tee -a $OUTPUTS + + if [ -z "${HOSTNAME}" ]; then echo "$(date) Failed to get target hostname" exit 1 fi + sleep 5 + cleanup: + working-directory: ${{ inputs.service.rundir }} + if: ${{ always }} + needs: + - session_runner + - wait_for_job_start + ssh: + remoteHost: ${{ inputs.cluster.resource.ip }} + steps: + - name: Controller cleanup + if: ${{ inputs.cluster.slurm.is_disabled && inputs.cluster.pbs.is_disabled }} + run: echo "$(date) Cleaning up..." + cleanup: | + set -x + if [ -f cancel.sh ]; then + bash cancel.sh + fi + - name: Compute cleanup + if: ${{ (inputs.cluster.slurm.is_disabled == false || inputs.cluster.pbs.is_disabled == false) }} + run: echo "Cleaning up..." + cleanup: | + set -x + remote_host="${{ needs.wait_for_job_start.outputs.HOSTNAME }}" + if [ -z "${remote_host}" ]; then + echo "$(date) WARNING: Compute node's hostname is missing. Exiting step." + exit 0 + fi + sshcmd="ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${remote_host}" + if [ -f cancel.sh ]; then + ${sshcmd} 'bash -s' < ${PWD}/cancel.sh + fi + create_session: + working-directory: ${{ inputs.service.rundir }} + needs: + - wait_for_job_start + ssh: + remoteHost: ${{ inputs.cluster.resource.ip }} + steps: - name: Get Session Port early-cancel: any-job-failed run: | - set -euo pipefail set -x + SESSION_PORT=$(cat SESSION_PORT) + echo "SESSION_PORT=${SESSION_PORT}" | tee -a $OUTPUTS - TIMEOUT=5 - RETRY_INTERVAL=3 - cd ${{ inputs.rundir }} - - attempt=1 - while true; do - echo "$(date) Attempt $attempt: Checking for SESSION_PORT file..." - - if [ -f SESSION_PORT ]; then - echo "$(date) Success: SESSION_PORT file found!" - cat SESSION_PORT | tee -a "$OUTPUTS" - exit 0 - elif [ -f job.ended ]; then - echo "$(date) Job was completed but SESSION_PORT was never created. Exiting..." - exit 1 - else - echo "$(date) SESSION_PORT not found. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done + if [ -z "${SESSION_PORT}" ]; then + echo "$(date) Failed to get target session's port" + exit 1 + fi + sleep 5 - name: Wait for Server To Start early-cancel: any-job-failed run: | TIMEOUT=5 RETRY_INTERVAL=3 - remote_host="${{ needs.create_session.outputs.target_hostname }}" + remote_host="${{ needs.wait_for_job_start.outputs.HOSTNAME }}" remote_port="${{ needs.create_session.outputs.SESSION_PORT }}" # Function to check if server is listening @@ -279,8 +206,6 @@ jobs: return $? } - cd ${{ inputs.rundir }} - # Main loop attempt=1 while true; do @@ -288,6 +213,7 @@ jobs: if check_server; then echo "$(date) Success: Server is listening on ${remote_host}:${remote_port}!" + sleep 40 exit 0 elif [ -f job.ended ]; then echo "$(date) Job was completed. Exiting... " @@ -298,145 +224,260 @@ jobs: ((attempt++)) fi done + sleep 5 - name: Update Session uses: parallelworks/update-session with: - remotePort: '${{ needs.create_session.outputs.SESSION_PORT }}' - target: '${{ inputs.resource.id }}' - name: '${{ sessions.session }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - - + target: ${{ inputs.cluster.resource.id }} + name: ${{ sessions.session }} + remoteHost: ${{ needs.wait_for_job_start.outputs.HOSTNAME }} + remotePort: ${{ needs.create_session.outputs.SESSION_PORT }} + localPort: ${{ inputs.service.localport }} 'on': execute: inputs: - resource: - type: compute-clusters - label: Compute Cluster - autoselect: true - include-workspace: false - tooltip: Resource to run the service - execmethod: - type: dropdown - label: Execution Method - default: SLURM - tooltip: Choose whether to run the job directly via SSH or submit it to a SLURM queue - hidden: true - options: - - value: SSH - label: SSH - - value: SLURM - label: SLURM - runmode: - label: Execution Mode - type: dropdown - default: singularity - hidden: true - options: - - value: docker - label: Docker - - value: singularity - label: Singularity - rundir: - label: Run Directory - default: ~/pw/activate-rag-vllm - type: string - runtype: - label: Run Type - type: dropdown - options: - - value: all - label: vLLM+RAG - - value: vllm - label: vLLM Only - build: - label: Build Containers - type: boolean - default: false - hfmodel: - label: HF Model - default: meta-llama/Llama-3.1-8B-Instruct - type: string - vllm_extra_args: - label: VLLM Extra Args - default: "--dtype float16 --max-model-len 4096 --gpu-memory-utilization=0.95 --max-num-seqs 1 --trust_remote_code" - placeholder: "--dtype float16 --max-model-len 4096 --gpu-memory-utilization=0.95 --max-num-seqs 1 --trust_remote_code" - type: string - hftoken: - label: HF Token (gated models) - optional: true - default: ${{ org.HF_TOKEN }} - type: password - apikey: - label: vLLM API Key - optional: true - tooltip: Required for integration with Cline and other code assist tools. - type: password - docsdir: - label: RAG Directory - hidden: ${{ inputs.runtype != 'all' }} - optional: true - default: ./docs - type: string - systemprompt: - type: string - label: System Prompt - hidden: ${{ inputs.runtype != 'all' }} - textarea: true - optional: true - default: You are a careful assistant. Use ONLY the provided context blocks to answer. Each block is numbered [1], [2], … and includes source metadata. When you use information from a block, you MUST cite it inline with [n]. At the end of your response, include a 'References' section with one reference per line formatted as [n] file_path (chunk index). Do not invent citations or sources. If the context does not contain the answer, say so briefly. - slurm: + cluster: type: group - label: SLURM Directives - hidden: ${{ inputs.execmethod != 'SLURM' }} + label: Compute Cluster Settings items: - partition: - type: slurm-partitions - label: SLURM partition - ignore: ${{ inputs.execmethod != 'SLURM' }} - optional: true - resource: ${{ inputs.resource }} + resource: + type: compute-clusters + label: Service host + include-workspace: false + tooltip: Resource to host the service + autoselect: true + scheduler: + type: boolean + default: true + label: Schedule Job? + hidden: true tooltip: | - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - cpus_per_task: - type: number - label: CPUs per task - ignore: ${{ inputs.execmethod != 'SLURM' }} - min: 1 - max: 32 - default: 1 - tooltip: '--cpus-per-task=value SLURM directive' - memory: - type: string - label: Minimum Total Memory Required - ignore: ${{ inputs.execmethod != 'SLURM' }} - default: 8GB - tooltip: '--mem=value SLURM directive' - number_of_gpus: - type: number - label: Number of GPUs - ignore: ${{ inputs.execmethod != 'SLURM' }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - scheduler_directives: - type: editor - ignore: ${{ inputs.execmethod != 'SLURM' }} - optional: true - tooltip: | - Type in additional scheduler directives. - advanced_settings: + Yes → Job is submitted to the scheduler using sbatch, qsub, etc + No → Job is executed in the controller or login node instead + slurm: + type: group + label: SLURM Directives + hidden: ${{ inputs.cluster.resource.provider == 'existing' && inputs.cluster.resource.schedulerType != 'slurm' || inputs.scheduler == false }} + ignore: ${{ inputs.cluster.resource.provider == 'existing' && inputs.cluster.resource.schedulerType != 'slurm' || inputs.scheduler == false }} + items: + slurm_options: + type: dropdown + label: Select Cluster + optional: true + default: '' + options: + - value: '' + label: Default + - value: '-M hpc4' + label: HPC4 + is_disabled: + type: boolean + hidden: true + default: ${{ inputs.cluster.resource.provider == 'existing' && inputs.cluster.resource.schedulerType != 'slurm' || inputs.scheduler == false }} + label: Is SLURM disabled? + partition_default: + type: slurm-partitions + label: SLURM partition + ignore: ${{ '-M hpc4' == inputs.cluster.slurm.slurm_options }} + hidden: ${{ .ignore }} + optional: true + resource: ${{ inputs.cluster.resource }} + tooltip: Select a partition from the drop down menu. Leave empty to let SLURM pick a partition. + partition_hpc4: + type: dropdown + label: SLURM partition + optional: true + tooltip: Select a partition from the drop down menu. Leave empty to let SLURM pick a partition. + ignore: ${{ '-M hpc4' != inputs.cluster.slurm.slurm_options }} + hidden: ${{ .ignore }} + default: normal + options: + - normal + - gpu + - gpu-h200 + - gpu-quick + - ht + - large-mem + - quick + - test + - unlimited + cpus_per_task: + type: number + label: CPUs per task + min: 1 + max: 32 + default: 1 + tooltip: '--cpus-per-task=value slurm directive' + ignore: ${{ 'existing' != inputs.cluster.resource.provider }} + hidden: ${{ .ignore }} + mem: + type: string + label: Minimum total memory required + default: 32GB + tooltip: '--mem=value slurm directive' + hidden: ${{ 'existing' != inputs.cluster.resource.provider }} + ignore: ${{ .hidden }} + optional: true + gres_gpu_default: + type: number + label: Number of GPUs + ignore: ${{ ( inputs.cluster.slurm.partition_default != 'gpu' && inputs.cluster.slurm.partition_default != 'gpu-quick' ) || 'existing' != inputs.cluster.resource.provider }} + hidden: ${{ .ignore }} + min: 1 + max: 4 + default: 4 + tooltip: '--gres=gpu:X slurm directive' + gres_gpu_hpc4: + type: number + label: Number of GPUs + hidden: ${{ ( inputs.cluster.slurm.partition_hpc4 != 'gpu' && inputs.cluster.slurm.partition_hpc4 != 'gpu-quick' && inputs.cluster.slurm.partition_hpc4 != 'gpu-h200' ) || 'existing' != inputs.cluster.resource.provider }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + min: 1 + max: 4 + default: 4 + tooltip: '--gres=gpu:X slurm directive' + time: + label: Walltime + type: string + default: '01:00:00' + tooltip: '--time= SLURM directive to set the maximum wall-clock time limit for the job' + scheduler_directives: + type: editor + optional: true + tooltip: | + Type in additional scheduler directives. + pbs: + type: group + label: PBS Directives + hidden: ${{ inputs.cluster.resource.schedulerType != 'pbs' || inputs.scheduler == false }} + ignore: ${{ inputs.cluster.resource.schedulerType != 'pbs' || inputs.scheduler == false }} + items: + is_disabled: + type: boolean + hidden: true + default: ${{ inputs.cluster.resource.schedulerType != 'pbs' || inputs.scheduler == false }} + label: Is PBS disabled? + scheduler_directives: + label: Scheduler Directives + type: editor + tooltip: Type the PBS scheduler directives + service: type: group - label: Advanced Settings - collapsed: true + label: Service items: - repository: + runmode: + label: Execution Mode + type: dropdown + default: singularity + hidden: ${{ 'existing' == inputs.resource.provider }} + options: + - value: docker + label: Docker + - value: singularity + label: Singularity + rundir: + label: Run Directory + default: ${HOME}/pw/activate-rag-vllm2 type: string - label: Repository - default: https://github.com/parallelworks/activate-rag-vllm.git - repository_branch: + runtype: + label: Run Type + type: dropdown + options: + - value: vllm + label: vLLM Only + - value: all + label: vLLM+RAG + build: + label: Build Containers + type: boolean + default: false + pull: + label: Pull Containers + hidden: ${{ inputs.service.runmode == 'docker' }} + type: boolean + default: true + hfmodel: + label: HF Model + default: /gs/gsfs0/home/avidaltorr/pw/software/Llama-3_3-Nemotron-Super-49B-v1_5 + type: string + vllm_extra_args: + label: VLLM Extra Args + default: '--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --gpu-memory-utilization 0.85' + placeholder: '--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --gpu-memory-utilization 0.85' + type: string + hftoken: + label: HF Token (gated models) + optional: true + default: ${{ org.HF_TOKEN }} + type: password + apikey: + label: vLLM API Key + optional: true + tooltip: Required for integration with Cline and other code assist tools. + type: password + docsdir: + label: RAG Directory + hidden: ${{ inputs.service.runtype != 'all' }} + optional: true + default: ./docs + type: string + systemprompt: + type: string + label: System Prompt + hidden: ${{ inputs.service.runtype != 'all' }} + textarea: true + optional: true + default: You are a careful assistant. Use ONLY the provided context blocks to answer. Each block is numbered [1], [2], … and includes source metadata. When you use information from a block, you MUST cite it inline with [n]. At the end of your response, include a 'References' section with one reference per line formatted as [n] file_path (chunk index). Do not invent citations or sources. If the context does not contain the answer, say so briefly. + localport: + label: User Workspace Port + default: '5555' + tooltip: Port that runs within the user workspace and used to connect to the code assist and chat interfaces. type: string - label: Repository Branch - default: main + advanced_settings: + type: group + label: Advanced Settings + collapsed: true + items: + repository: + type: string + label: Repository + default: https://github.com/parallelworks/activate-rag-vllm.git + repository_branch: + type: string + label: Repository Branch + default: nemotron-aecm + tiktoken_encodings: + label: Pull Encodings + tooltip: For GPT-OSS pull the tiktoken encodings. + type: boolean + default: false + vllm_attention_backend: + type: dropdown + label: VLLM Attention Backend + default: FLASH_ATTN + tooltip: Select the attention backend implementation used by vLLM + options: + - value: FLASH_ATTN + - value: TRITON_ATTN + - value: ROCM_ATTN + - value: ROCM_AITER_MLA + - value: ROCM_AITER_TRITON_MLA + - value: ROCM_AITER_FA + - value: ROCM_AITER_MLA_SPARSE + - value: TORCH_SDPA + - value: FLASHINFER + - value: FLASHINFER_MLA + - value: TRITON_MLA + - value: CUTLASS_MLA + - value: FLASHMLA + - value: FLASHMLA_SPARSE + - value: FLASH_ATTN_MLA + - value: PALLAS + - value: IPEX + - value: NO_ATTENTION + - value: FLEX_ATTENTION + - value: TREE_ATTN + - value: ROCM_AITER_UNIFIED_ATTN + - value: CPU_ATTN + - value: CUSTOM