From 0d91921b21c240b070d19305c0a0c88145856950 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Tue, 9 Dec 2025 17:03:46 -0600
Subject: [PATCH 01/33] Initial commit of non-generalized nemotron model

---
 singularity/singularity-compose.yml |  1 +
 workflow.yaml                       | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml
index 99fa6dd..8d0c39d 100644
--- a/singularity/singularity-compose.yml
+++ b/singularity/singularity-compose.yml
@@ -20,6 +20,7 @@ instances:
     volumes:
       - ./logs:/logs
       - ./cache:/root/.cache
+      - /p/work1/mshaxted/Llama-3_3-Nemotron-Super-49B-v1_5:/models/Llama-3_3-Nemotron-Super-49B-v1_5
       - ./env.sh:/.singularity.d/env/env.sh
 
   rag:
diff --git a/workflow.yaml b/workflow.yaml
index 47e9712..07babde 100644
--- a/workflow.yaml
+++ b/workflow.yaml
@@ -313,22 +313,22 @@ jobs:
         label: Run Type
         type: dropdown
         options:
-          - value: all
-            label: vLLM+RAG
           - value: vllm
             label: vLLM Only
+          - value: all
+            label: vLLM+RAG
       build:
         label: Build Containers
         type: boolean
         default: false
       hfmodel:
         label: HF Model
-        default: meta-llama/Llama-3.1-8B-Instruct
+        default: /models/Llama-3_3-Nemotron-Super-49B-v1_5
         type: string
       vllm_extra_args:
         label: VLLM Extra Args
-        default: "--dtype float16 --max-model-len 16384 --gpu-memory-utilization=0.95 --trust_remote_code"
-        placeholder: "--dtype float16 --max-model-len 16384 --gpu-memory-utilization=0.95 --max-num-seqs 1 --trust_remote_code"
+        default: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --max-model-len=2048 --gpu-memory-utilization 0.85"
+        placeholder: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --max-model-len=2048 --gpu-memory-utilization 0.85"
         type: string
       hftoken:
         label: HF Token (gated models)

From c44e03b5adea38390cff1a2e044c8a893a2f3778 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Tue, 9 Dec 2025 17:04:45 -0600
Subject: [PATCH 02/33] Initial commit of non-generalized nemotron model

---
 yamls/hsp.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 3790404..8706bc0 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -415,10 +415,10 @@ jobs:
         label: Run Type
         type: dropdown
         options:
-          - value: all
-            label: vLLM+RAG
           - value: vllm
             label: vLLM Only
+          - value: all
+            label: vLLM+RAG
       build:
         label: Build Containers
         type: boolean

From aa2d95b546fa7539137531c7dc37fb2fb74f7852 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Tue, 9 Dec 2025 17:05:50 -0600
Subject: [PATCH 03/33] Updating the HSP yaml

---
 yamls/hsp.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 8706bc0..a169d99 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -425,12 +425,12 @@ jobs:
         default: false
       hfmodel:
         label: HF Model
-        default: meta-llama/Llama-3.1-8B-Instruct
+        default: /models/Llama-3_3-Nemotron-Super-49B-v1_5
         type: string
       vllm_extra_args:
         label: VLLM Extra Args
-        default: "--dtype float16 --max-model-len 16384 --gpu-memory-utilization=0.95 --trust_remote_code"
-        placeholder: "--dtype float16 --max-model-len 16384 --gpu-memory-utilization=0.95 --max-num-seqs 1 --trust_remote_code"
+        default: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --max-model-len=2048 --gpu-memory-utilization 0.85"
+        placeholder: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --max-model-len=2048 --gpu-memory-utilization 0.85"
         type: string
       hftoken:
         label: HF Token (gated models)

From aa3bfe7a92f843b65721041fbb185c99803a2c62 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Tue, 9 Dec 2025 17:14:32 -0600
Subject: [PATCH 04/33] Updating branch

---
 yamls/hsp.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index a169d99..0a042ed 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -560,4 +560,4 @@ jobs:
           repository_branch:
             type: string
             label: Repository Branch
-            default: main
+            default: nemotron

From 1163cab0629cd238c5b3b3ed0d401f4bb55a04f6 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Tue, 9 Dec 2025 17:33:15 -0600
Subject: [PATCH 05/33] Removing minimum model params

---
 yamls/hsp.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 0a042ed..b123386 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -429,8 +429,8 @@ jobs:
         type: string
       vllm_extra_args:
         label: VLLM Extra Args
-        default: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --max-model-len=2048 --gpu-memory-utilization 0.85"
-        placeholder: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --max-model-len=2048 --gpu-memory-utilization 0.85"
+        default: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --gpu-memory-utilization 0.85"
+        placeholder: "--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --gpu-memory-utilization 0.85"
         type: string
       hftoken:
         label: HF Token (gated models)

From a7f9e991ca7059ee6387ee7a764277bb1f4706c4 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Tue, 9 Dec 2025 17:49:40 -0600
Subject: [PATCH 06/33] Generalizing the model selection

---
 singularity/singularity-compose.yml | 2 +-
 yamls/hsp.yaml                      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml
index 8d0c39d..275ccad 100644
--- a/singularity/singularity-compose.yml
+++ b/singularity/singularity-compose.yml
@@ -20,7 +20,7 @@ instances:
     volumes:
       - ./logs:/logs
       - ./cache:/root/.cache
-      - /p/work1/mshaxted/Llama-3_3-Nemotron-Super-49B-v1_5:/models/Llama-3_3-Nemotron-Super-49B-v1_5
+      - ${MODEL_NAME}:${MODEL_NAME}
       - ./env.sh:/.singularity.d/env/env.sh
 
   rag:
diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index b123386..4d5428e 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -425,7 +425,7 @@ jobs:
         default: false
       hfmodel:
         label: HF Model
-        default: /models/Llama-3_3-Nemotron-Super-49B-v1_5
+        default: /p/work1/mshaxted/Llama-3_3-Nemotron-Super-49B-v1_5
         type: string
       vllm_extra_args:
         label: VLLM Extra Args

From 2106e65b055f56ada9b4eb13ff07750e2eebde1a Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Tue, 9 Dec 2025 17:54:22 -0600
Subject: [PATCH 07/33] Adding fixed localport

---
 singularity/singularity-compose.yml | 2 +-
 yamls/hsp.yaml                      | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml
index 275ccad..da4c1e9 100644
--- a/singularity/singularity-compose.yml
+++ b/singularity/singularity-compose.yml
@@ -20,8 +20,8 @@ instances:
     volumes:
       - ./logs:/logs
       - ./cache:/root/.cache
-      - ${MODEL_NAME}:${MODEL_NAME}
       - ./env.sh:/.singularity.d/env/env.sh
+      - ${MODEL_NAME}:${MODEL_NAME}
 
   rag:
     build:
diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 4d5428e..13258db 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -374,6 +374,7 @@ jobs:
           target: '${{ inputs.resource.id }}'
           name: '${{ sessions.session }}'
           remoteHost: '${{ needs.create_session.outputs.target_hostname }}'
+          localPort: '${{ inputs.localport }}'
 
 
 'on':
@@ -455,6 +456,11 @@ jobs:
         textarea: true
         optional: true
         default: You are a careful assistant. Use ONLY the provided context blocks to answer. Each block is numbered [1], [2], … and includes source metadata. When you use information from a block, you MUST cite it inline with [n]. At the end of your response, include a 'References' section with one reference per line formatted as [n] file_path (chunk index). Do not invent citations or sources. If the context does not contain the answer, say so briefly.
+      localport:
+        label: User Workspace Port
+        default: '5555'
+        tooltip: Port that runs within the user workspace and used to connect to the code assist and chat interfaces.
+        type: string
       slurm:
         type: group
         label: SLURM Directives

From b286838d68508e7179f4e55ab94f826e30479763 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Tue, 9 Dec 2025 17:58:58 -0600
Subject: [PATCH 08/33] Updating model selection

---
 singularity/singularity-compose.yml | 2 +-
 start_service.sh                    | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml
index da4c1e9..a59666d 100644
--- a/singularity/singularity-compose.yml
+++ b/singularity/singularity-compose.yml
@@ -21,7 +21,7 @@ instances:
       - ./logs:/logs
       - ./cache:/root/.cache
       - ./env.sh:/.singularity.d/env/env.sh
-      - ${MODEL_NAME}:${MODEL_NAME}
+      - __MODEL_NAME__:__MODEL_NAME__
 
   rag:
     build:
diff --git a/start_service.sh b/start_service.sh
index f0ef4d2..24c8399 100755
--- a/start_service.sh
+++ b/start_service.sh
@@ -224,6 +224,9 @@ elif [ "$RUNMODE" == "singularity" ]; then
     sed -i "s|^[#[:space:]]*\(export[[:space:]]\+\)\?MODEL_NAME=.*|export MODEL_NAME=$MODEL_NAME|" env.sh
     sed -i "s|^[#[:space:]]*\(export[[:space:]]\+\)\?DOCS_DIR=.*|export DOCS_DIR=$DOCS_DIR|" env.sh
     sed -i "s|__VLLM_EXTRA_ARGS__|${VLLM_EXTRA_ARGS}|" env.sh
+    
+    sed -i "s|__MODEL_NAME__|${MODEL_NAME}|" singularity-compose.yml
+
     # Disable weight download
     # Check if cache/huggingface directory exists
     if [ -d "cache/huggingface" ]; then

From ca49ffd7d45a99712696a8f0f181ceb17a8d8183 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Tue, 9 Dec 2025 19:44:30 -0600
Subject: [PATCH 09/33] fixing model replacement

---
 start_service.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/start_service.sh b/start_service.sh
index 24c8399..e7c9b6f 100755
--- a/start_service.sh
+++ b/start_service.sh
@@ -224,8 +224,8 @@ elif [ "$RUNMODE" == "singularity" ]; then
     sed -i "s|^[#[:space:]]*\(export[[:space:]]\+\)\?MODEL_NAME=.*|export MODEL_NAME=$MODEL_NAME|" env.sh
     sed -i "s|^[#[:space:]]*\(export[[:space:]]\+\)\?DOCS_DIR=.*|export DOCS_DIR=$DOCS_DIR|" env.sh
     sed -i "s|__VLLM_EXTRA_ARGS__|${VLLM_EXTRA_ARGS}|" env.sh
-    
-    sed -i "s|__MODEL_NAME__|${MODEL_NAME}|" singularity-compose.yml
+
+    sed -i "s|__MODEL_NAME__|${MODEL_NAME}|g" singularity-compose.yml
 
     # Disable weight download
     # Check if cache/huggingface directory exists

From e93febd6efa821052e24b5c0e84a0c0c881885b9 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Wed, 10 Dec 2025 10:52:26 -0600
Subject: [PATCH 10/33] Revising exit with break so workflow finishes on
 walltime exit

---
 yamls/hsp.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 13258db..7b4184c 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -154,7 +154,7 @@ jobs:
               job_status=$(sacct -j ${jobid}  --format=state | tail -n1)
               echo "$(date) Job exited with status ${job_status}"
               touch job.ended
-              exit 0
+              break
             fi
           done
         cleanup: |

From 3898e4728a3b4fc9a79b35f2014fee051776fde7 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Wed, 10 Dec 2025 12:11:42 -0600
Subject: [PATCH 11/33] Cleaning up model location parameters

---
 singularity/singularity-compose.yml | 4 ++--
 start_service.sh                    | 7 ++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml
index a59666d..4c5f60a 100644
--- a/singularity/singularity-compose.yml
+++ b/singularity/singularity-compose.yml
@@ -16,12 +16,12 @@ instances:
     run:
       args:
         - >
-          nohup python3 -m vllm.entrypoints.openai.api_server --model "${MODEL_NAME}" --tokenizer "${MODEL_NAME}" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 &
+          nohup python3 -m vllm.entrypoints.openai.api_server --model "${__MODEL_BASE__}" --tokenizer "${__MODEL_BASE__}" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 &
     volumes:
       - ./logs:/logs
       - ./cache:/root/.cache
       - ./env.sh:/.singularity.d/env/env.sh
-      - __MODEL_NAME__:__MODEL_NAME__
+      - __MODEL_PATH__:__MODEL_BASE__
 
   rag:
     build:
diff --git a/start_service.sh b/start_service.sh
index e7c9b6f..f8822a6 100755
--- a/start_service.sh
+++ b/start_service.sh
@@ -225,7 +225,12 @@ elif [ "$RUNMODE" == "singularity" ]; then
     sed -i "s|^[#[:space:]]*\(export[[:space:]]\+\)\?DOCS_DIR=.*|export DOCS_DIR=$DOCS_DIR|" env.sh
     sed -i "s|__VLLM_EXTRA_ARGS__|${VLLM_EXTRA_ARGS}|" env.sh
 
-    sed -i "s|__MODEL_NAME__|${MODEL_NAME}|g" singularity-compose.yml
+    # get the base model name
+    MODEL_PATH="${MODEL_NAME}"
+    MODEL_BASE=$(basename $MODEL_NAME)
+    
+    sed -i "s|__MODEL_PATH__|${MODEL_PATH}|g" singularity-compose.yml
+    sed -i "s|__MODEL_BASE__|${MODEL_BASE}|g" singularity-compose.yml
 
     # Disable weight download
     # Check if cache/huggingface directory exists

From 76e584a4a78919447dc256caddbc30f6b1d9f595 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Wed, 10 Dec 2025 12:18:08 -0600
Subject: [PATCH 12/33] Cleaning up model location parameters

---
 singularity/singularity-compose.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml
index 4c5f60a..f8d5f81 100644
--- a/singularity/singularity-compose.yml
+++ b/singularity/singularity-compose.yml
@@ -16,12 +16,12 @@ instances:
     run:
       args:
         - >
-          nohup python3 -m vllm.entrypoints.openai.api_server --model "${__MODEL_BASE__}" --tokenizer "${__MODEL_BASE__}" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 &
+          nohup python3 -m vllm.entrypoints.openai.api_server --model "./__MODEL_BASE__" --tokenizer "./__MODEL_BASE__" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 &
     volumes:
       - ./logs:/logs
       - ./cache:/root/.cache
       - ./env.sh:/.singularity.d/env/env.sh
-      - __MODEL_PATH__:__MODEL_BASE__
+      - __MODEL_PATH__:./__MODEL_BASE__
 
   rag:
     build:

From 6d3c4fe9bd548d383f668f1379fe0a56a6c67c9d Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Wed, 10 Dec 2025 12:28:43 -0600
Subject: [PATCH 13/33] Changing to absolute path

---
 singularity/singularity-compose.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml
index f8d5f81..1385809 100644
--- a/singularity/singularity-compose.yml
+++ b/singularity/singularity-compose.yml
@@ -16,12 +16,12 @@ instances:
     run:
       args:
         - >
-          nohup python3 -m vllm.entrypoints.openai.api_server --model "./__MODEL_BASE__" --tokenizer "./__MODEL_BASE__" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 &
+          nohup python3 -m vllm.entrypoints.openai.api_server --model "/__MODEL_BASE__" --tokenizer "/__MODEL_BASE__" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 &
     volumes:
       - ./logs:/logs
       - ./cache:/root/.cache
       - ./env.sh:/.singularity.d/env/env.sh
-      - __MODEL_PATH__:./__MODEL_BASE__
+      - __MODEL_PATH__:/__MODEL_BASE__
 
   rag:
     build:

From ba5d3fa721b92426658b95a51f19f5120491e4ce Mon Sep 17 00:00:00 2001
From: Alvaro Vidal Torreira <alvaro@parallelworks.com>
Date: Fri, 12 Dec 2025 18:27:48 +0100
Subject: [PATCH 14/33] Move gres and constraint directives to the
 scheduler_directives parameter (#32)

---
 yamls/hsp.yaml | 28 ++++------------------------
 1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 7b4184c..183cef3 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -82,11 +82,9 @@ jobs:
           chmod +x run.sh
           echo "#SBATCH --account=${{ inputs.slurm.account }}" >> run.sh
           echo "#SBATCH --qos=${{ inputs.slurm.qos }}" >> run.sh
-          echo "#SBATCH --constraint=${{ inputs.slurm.constraint }}" >> run.sh
           if [[ "${{ inputs.slurm.partition }}" != "undefined" && "${{ inputs.slurm.partition }}" != "" ]]; then
             echo "#SBATCH --partition=${{ inputs.slurm.partition }}" >> run.sh
           fi
-          echo "#SBATCH --gres=gpu:${{ inputs.slurm.number_of_gpus }}" >> run.sh
           echo "#SBATCH --cpus-per-task=${{ inputs.slurm.cpus_per_task }}" >> run.sh
           echo "#SBATCH --nodes=1" >> run.sh
           echo "#SBATCH --time=${{ inputs.slurm.time }}" >> run.sh
@@ -491,17 +489,6 @@ jobs:
             ignore: ${{ inputs.execmethod != 'SLURM' || 'existing' != inputs.resource.provider }}
             optional: ${{ .ignore }}
             hidden: ${{ .ignore }}
-          constraint:
-            label: Constraint
-            type: dropdown
-            ignore: ${{ inputs.execmethod != 'SLURM' || 'existing' != inputs.resource.provider }}
-            hidden: ${{ .ignore }}
-            default: viz
-            options:
-              - value: viz
-                label: viz
-              - value: mla
-                label: mla
           cpus_per_task:
             type: number
             label: CPUs per task
@@ -511,21 +498,14 @@ jobs:
             max: 32
             default: 1
             tooltip: '--cpus-per-task=value SLURM directive'
-          number_of_gpus:
-            type: number
-            label: Number of GPUs
-            ignore: ${{ inputs.execmethod != 'SLURM' || 'existing' != inputs.resource.provider }}
-            hidden: ${{ .ignore }}
-            min: 1
-            max: 4
-            default: 1
-            tooltip: '--gres=gpu:X slurm directive'
           scheduler_directives:
             type: editor
             ignore: ${{ inputs.execmethod != 'SLURM' }}
             optional: true
-            tooltip: |
-              Type in additional scheduler directives. 
+            tooltip: Type in additional scheduler directives. 
+            default: |
+              #SBATCH --constraint=mla
+              #SBATCH --gres=gpu:4 
           time:
             label: Walltime
             type: string

From b99999e2e74a4457cb7a8f8c6bfd6a3de862e195 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Fri, 12 Dec 2025 11:30:04 -0600
Subject: [PATCH 15/33] Generalizing the yaml for different users

---
 yamls/hsp.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 183cef3..a38a2cc 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -424,7 +424,7 @@ jobs:
         default: false
       hfmodel:
         label: HF Model
-        default: /p/work1/mshaxted/Llama-3_3-Nemotron-Super-49B-v1_5
+        default: /p/work1/${USER}/Llama-3_3-Nemotron-Super-49B-v1_5
         type: string
       vllm_extra_args:
         label: VLLM Extra Args

From e5d12f37b2fc029ad5b1693c9d2837c3ba2e68b4 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sat, 13 Dec 2025 11:40:29 -0600
Subject: [PATCH 16/33] Updating readme with manual data requirements for model
 and container

---
 README.md | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 25140ea..c454820 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# ACTIVATE — vLLM + RAG + Proxy
+# ACTIVATE — vLLM + RAG
 
 This Compose stack runs from the [github repo here](https://github.com/parallelworks/activate-rag-vllm) and executes the below services in Docker or Singularity modes:
 
@@ -14,7 +14,24 @@ See a turnkey demonstration of the workflow running on ACTIVATE at the link belo
 <img target="_blank" src="https://www.dropbox.com/scl/fi/xyjf75inw6pa5uk2kyv1p/vllmragthumb.png?rlkey=498wwpesf90nfdon3xj5vyhwy&raw=1" width="350">
 </a>
 
-## Quickstart
+## Workflow Instructions
+
+Pull down the weights of your choice into a known directory. For example we recommend using git lfs to pull down weights as this is more widely open to firewalls and is relatively fast at pulls:
+
+```
+cd /mymodeldir/
+git lfs install
+git clone https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5
+```
+
+The workflow will provide a field to also pull down a prebuilt vllm singularity container if running in this mode, but you can also pull this down manually for example using the authenticated pw cli:
+
+```
+cd ~/pw/activate-rag-vllm
+pw buckets cp pw://mshaxted/codeassist/vllm.sif ./
+```
+
+## Manual Quickstart
 ```bash
 export HF_TOKEN=hf_xyz
 export RUNMODE=docker # or singularity

From 059cc7d3d0d64006fe88658c1bd947e83afeabc4 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sat, 13 Dec 2025 11:56:12 -0600
Subject: [PATCH 17/33] Updating yaml to provide a container pull option

---
 yamls/hsp.yaml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index a38a2cc..17a9562 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -65,6 +65,19 @@ jobs:
             echo "$(date) ERROR: Failed to install singularity-compose"
             exit 1
           fi
+      - name: Pull Singularity Containers
+        if: ${{ inputs.runmode == 'singularity' && inputs.pull == true }}
+        early-cancel: any-job-failed
+        run: |
+          set -x
+          cd ${{ inputs.rundir }}
+          echo pulling vllm.sif from ${{ inputs.container_bucket }}
+          pw bucket cp ${{ inputs.container_bucket }}/vllm.sif ./
+
+          if [[ "${{ inputs.runmode }}" == "all" ]]; then
+            echo pulling rag.sif from ${{ inputs.container_bucket }}
+            pw bucket cp ${{ inputs.container_bucket }}/rag.sif ./
+          fi
 
   slurm_job:
     needs:
@@ -422,6 +435,16 @@ jobs:
         label: Build Containers
         type: boolean
         default: false
+      pull:
+        label: Pull Containers
+        type: boolean
+        default: true
+      container_bucket:
+        label: Container Bucket (.sif)
+        hidden: ${{ inputs.runmode != 'singularity' }}
+        type: string
+        default: pw://mshaxted/codeassist
+        help: PW singularity container bucket that holds vllm.sif and rag.sif containers
       hfmodel:
         label: HF Model
         default: /p/work1/${USER}/Llama-3_3-Nemotron-Super-49B-v1_5

From a0aae85eea16ab622fc3ae7d74f9a6978191dc42 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sat, 13 Dec 2025 12:01:29 -0600
Subject: [PATCH 18/33] Updating container conditional logic

---
 yamls/hsp.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 17a9562..bcff9d0 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -441,7 +441,7 @@ jobs:
         default: true
       container_bucket:
         label: Container Bucket (.sif)
-        hidden: ${{ inputs.runmode != 'singularity' }}
+        hidden: ${{ inputs.runmode != 'singularity' && inputs.pull == true }}
         type: string
         default: pw://mshaxted/codeassist
         help: PW singularity container bucket that holds vllm.sif and rag.sif containers

From 81461ef4221857de28587f1f31c22c4b484b4410 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sat, 13 Dec 2025 12:02:19 -0600
Subject: [PATCH 19/33] Updating tooltip

---
 yamls/hsp.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index bcff9d0..32274f0 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -444,7 +444,7 @@ jobs:
         hidden: ${{ inputs.runmode != 'singularity' && inputs.pull == true }}
         type: string
         default: pw://mshaxted/codeassist
-        help: PW singularity container bucket that holds vllm.sif and rag.sif containers
+        tooltip: PW singularity container bucket that holds vllm.sif and rag.sif containers
       hfmodel:
         label: HF Model
         default: /p/work1/${USER}/Llama-3_3-Nemotron-Super-49B-v1_5

From 43277936516e4bb8e221452ff730cfc7895108a7 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sat, 13 Dec 2025 12:04:18 -0600
Subject: [PATCH 20/33] Fixing container bucket hidden condition

---
 yamls/hsp.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 32274f0..0e51940 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -441,7 +441,7 @@ jobs:
         default: true
       container_bucket:
         label: Container Bucket (.sif)
-        hidden: ${{ inputs.runmode != 'singularity' && inputs.pull == true }}
+        hidden: ${{ inputs.runmode != 'singularity' || inputs.pull != true }}
         type: string
         default: pw://mshaxted/codeassist
         tooltip: PW singularity container bucket that holds vllm.sif and rag.sif containers

From df8b1a6c60106139e02e20e81ee83d1dae1712dc Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sat, 13 Dec 2025 12:05:09 -0600
Subject: [PATCH 21/33] Hiding pull containers option in docker runmode

---
 yamls/hsp.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 0e51940..60c76b1 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -437,6 +437,7 @@ jobs:
         default: false
       pull:
         label: Pull Containers
+        hidden: ${{ inputs.runmode == 'docker' }}
         type: boolean
         default: true
       container_bucket:

From 7658ff40805e3d84e49b570c508ed0992bea11af Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sat, 13 Dec 2025 12:14:55 -0600
Subject: [PATCH 22/33] Adding comment for constraints

---
 yamls/hsp.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 60c76b1..eb92b62 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -528,8 +528,10 @@ jobs:
             optional: true
             tooltip: Type in additional scheduler directives. 
             default: |
-              #SBATCH --constraint=mla
               #SBATCH --gres=gpu:4 
+
+              ##SBATCH --constraint=mla # uncomment for Navy and AFRL DSRC systems
+
           time:
             label: Walltime
             type: string

From 7a30d72928e55473af79f244fbfeed7f6e88a506 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sat, 13 Dec 2025 12:22:45 -0600
Subject: [PATCH 23/33] skipping pull if containers already exist

---
 yamls/hsp.yaml | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index eb92b62..5d5767f 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -71,12 +71,23 @@ jobs:
         run: |
           set -x
           cd ${{ inputs.rundir }}
-          echo pulling vllm.sif from ${{ inputs.container_bucket }}
-          pw bucket cp ${{ inputs.container_bucket }}/vllm.sif ./
+          
+          # vllm container
+          if [[ ! -f "vllm.sif" ]]; then
+            echo "vllm.sif not found, pulling from ${{ inputs.container_bucket }}"
+            pw bucket cp "${{ inputs.container_bucket }}/vllm.sif" ./
+          else
+            echo "vllm.sif already exists, skipping pull"
+          fi
 
+          # rag container (only for runmode=all)
           if [[ "${{ inputs.runmode }}" == "all" ]]; then
-            echo pulling rag.sif from ${{ inputs.container_bucket }}
-            pw bucket cp ${{ inputs.container_bucket }}/rag.sif ./
+            if [[ ! -f "rag.sif" ]]; then
+              echo "rag.sif not found, pulling from ${{ inputs.container_bucket }}"
+              pw bucket cp "${{ inputs.container_bucket }}/rag.sif" ./
+            else
+              echo "rag.sif already exists, skipping pull"
+            fi
           fi
 
   slurm_job:

From 3b74aca7e6efeb00743f0a588947f8ea6ae40e57 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sat, 13 Dec 2025 12:23:05 -0600
Subject: [PATCH 24/33] skipping pull if containers already exist

---
 yamls/hsp.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 5d5767f..e919736 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -90,6 +90,8 @@ jobs:
             fi
           fi
 
+          echo Singularity container pull step complete.
+
   slurm_job:
     needs:
       - prepare_job_directory

From 7752919bbe7a9e6b3893b0a149fcc8934be01e94 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sat, 13 Dec 2025 15:24:43 -0600
Subject: [PATCH 25/33] Adding tiktoken encodings for gptoss

---
 yamls/hsp.yaml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index e919736..1c0c267 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -35,6 +35,7 @@ jobs:
           echo "export DOCS_DIR=${{ inputs.docsdir }}" >> .run.env
           echo "export VLLM_EXTRA_ARGS=\"${{ inputs.vllm_extra_args }}\"" >> .run.env
           echo "export TRANSFORMERS_OFFLINE=1" >> .run.env
+          echo "export TIKTOKEN_ENCODINGS_BASE=/root/.cache/tiktoken_encodings" >> .run.env
 
       - name: Install Singularity Compose
         if: ${{ inputs.runmode == 'singularity' }}
@@ -91,6 +92,15 @@ jobs:
           fi
 
           echo Singularity container pull step complete.
+      - name: Pull Tiktoken Encodings
+        if: ${{ inputs.advanced_settings.tiktoken_encodings == true }}
+        early-cancel: any-job-failed
+        run: |
+          set -x
+          cd ${{ inputs.rundir }}
+          mkdir -p cache/tiktoken_encodings
+          wget -O cache/tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"
+          wget -O cache/tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
 
   slurm_job:
     needs:
@@ -586,3 +596,8 @@ jobs:
             type: string
             label: Repository Branch
             default: nemotron
+          tiktoken_encodings:
+            label: Pull Tiktoken Encodings
+            tooltip: For GPT-OSS pull the tiktoken encodings.
+            type: boolean
+            default: false
\ No newline at end of file

From f6b32d63423ff3c5435fd73a428a50d3f546909b Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sat, 13 Dec 2025 15:46:00 -0600
Subject: [PATCH 26/33] Fixing singularity loading if already in path

---
 yamls/hsp.yaml | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index 1c0c267..de80195 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -136,7 +136,17 @@ jobs:
           echo "touch job.started" >> run.sh
           echo "hostname >> HOSTNAME" >> run.sh
 
-          echo "module load singularity" >> run.sh
+          # load singularity via module if not already in path
+          cat << 'EOF' >> run.sh
+          # Ensure Singularity is available
+          if ! command -v singularity >/dev/null 2>&1; then
+            if command -v module >/dev/null 2>&1; then
+              module load singularity || module load apptainer
+            else
+              echo "ERROR: singularity/apptainer not found" >&2
+            fi
+          fi
+          EOF
                     
           cat start_service.sh >> run.sh
       - name: Submit SLURM Script

From 686bfb9effbb171b15c62184d3cad544f83a662b Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sun, 14 Dec 2025 08:21:56 -0600
Subject: [PATCH 27/33] Updating the ATTN settings for latest vllm update

---
 singularity/env.sh.example |  2 +-
 yamls/hsp.yaml             | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/singularity/env.sh.example b/singularity/env.sh.example
index a234cf3..3a03eb3 100644
--- a/singularity/env.sh.example
+++ b/singularity/env.sh.example
@@ -23,7 +23,7 @@ export HF_HOME="/root/.cache/huggingface"
 
 # Recommended on T4/V100 and for mistral tokenizer
 export DOCS_DIR=./docs
-export VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1
+export VLLM_ATTENTION_BACKEND=TRITON_ATTN
 export VLLM_EXTRA_ARGS="__VLLM_EXTRA_ARGS__"
 export TRITON_CC=gcc
 export CC=/usr/bin/gcc
diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index c24611d..aa083a9 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -36,6 +36,7 @@ jobs:
           echo "export VLLM_EXTRA_ARGS=\"${{ inputs.vllm_extra_args }}\"" >> .run.env
           echo "export TRANSFORMERS_OFFLINE=1" >> .run.env
           echo "export TIKTOKEN_ENCODINGS_BASE=/root/.cache/tiktoken_encodings" >> .run.env
+          echo "export VLLM_ATTENTION_BACKEND=${{ inputs.advanced_settings.vllm_attention_backend }}" >> .run.env
 
       - name: Install Singularity Compose
         if: ${{ inputs.runmode == 'singularity' }}
@@ -607,7 +608,36 @@ jobs:
             label: Repository Branch
             default: nemotron
           tiktoken_encodings:
-            label: Pull Tiktoken Encodings
+            label: Pull Encodings
             tooltip: For GPT-OSS pull the tiktoken encodings.
             type: boolean
             default: false
+          vllm_attention_backend:
+            type: dropdown
+            label: VLLM Attention Backend
+            default: TRITON_ATTN
+            tooltip: Select the attention backend implementation used by vLLM
+            options:
+              - value: FLASH_ATTN
+              - value: TRITON_ATTN
+              - value: ROCM_ATTN
+              - value: ROCM_AITER_MLA
+              - value: ROCM_AITER_TRITON_MLA
+              - value: ROCM_AITER_FA
+              - value: ROCM_AITER_MLA_SPARSE
+              - value: TORCH_SDPA
+              - value: FLASHINFER
+              - value: FLASHINFER_MLA
+              - value: TRITON_MLA
+              - value: CUTLASS_MLA
+              - value: FLASHMLA
+              - value: FLASHMLA_SPARSE
+              - value: FLASH_ATTN_MLA
+              - value: PALLAS
+              - value: IPEX
+              - value: NO_ATTENTION
+              - value: FLEX_ATTENTION
+              - value: TREE_ATTN
+              - value: ROCM_AITER_UNIFIED_ATTN
+              - value: CPU_ATTN
+              - value: CUSTOM
\ No newline at end of file

From 24e750ba69621eb36f27713d1c424b9645d5a0c5 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sun, 14 Dec 2025 09:00:39 -0600
Subject: [PATCH 28/33] Adding sagemaker fix for updated vllm

---
 singularity/singularity-compose.yml | 1 +
 start_service.sh                    | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml
index 1385809..1cb2430 100644
--- a/singularity/singularity-compose.yml
+++ b/singularity/singularity-compose.yml
@@ -20,6 +20,7 @@ instances:
     volumes:
       - ./logs:/logs
       - ./cache:/root/.cache
+      - ./cache/sagemaker_sessions:/dev/shm/sagemaker_sessions
       - ./env.sh:/.singularity.d/env/env.sh
       - __MODEL_PATH__:/__MODEL_BASE__
 
diff --git a/start_service.sh b/start_service.sh
index f8822a6..37173e8 100755
--- a/start_service.sh
+++ b/start_service.sh
@@ -245,6 +245,10 @@ elif [ "$RUNMODE" == "singularity" ]; then
 
     mkdir -p logs cache cache/chroma $DOCS_DIR
 
+    # fixing updated vllm sagemaker sessions issue
+    mkdir -p cache/sagemaker_sessions
+    chmod 700 cache/sagemaker_sessions
+
     # singularity-compose does not support env variables in the yml config file
     if [ "$DOCS_DIR" != "./docs" ];then
         ln -s $DOCS_DIR ./docs

From ac616f1de2da57c378cca3f88970139226d21698 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sun, 14 Dec 2025 09:29:48 -0600
Subject: [PATCH 29/33] Updating singularity build file

---
 singularity/Singularity.vllm | 53 ++++++++++++++++++++++++++++++++----
 1 file changed, 47 insertions(+), 6 deletions(-)

diff --git a/singularity/Singularity.vllm b/singularity/Singularity.vllm
index 9aa97b8..95edf24 100644
--- a/singularity/Singularity.vllm
+++ b/singularity/Singularity.vllm
@@ -1,11 +1,52 @@
 Bootstrap: docker
 From: vllm/vllm-openai:latest
 
+%post
+    set -eux
+
+    apt-get update
+    apt-get install -y --no-install-recommends \
+        clang lld llvm \
+        build-essential \
+        git ca-certificates curl \
+        pkg-config cmake ninja-build
+    rm -rf /var/lib/apt/lists/*
+
+    # Pick a Python interpreter that actually exists in the base image
+    if command -v python >/dev/null 2>&1; then
+        PY=python
+    elif command -v python3 >/dev/null 2>&1; then
+        PY=python3
+    elif [ -x /opt/conda/bin/python ]; then
+        PY=/opt/conda/bin/python
+    else
+        echo "No Python interpreter found (python/python3/conda)."; exit 1
+    fi
+
+    # Prefer clang for any builds happening inside the container
+    echo "export CC=clang"  >> /etc/profile.d/clang.sh
+    echo "export CXX=clang++" >> /etc/profile.d/clang.sh
+
+    # Upgrade packaging tooling
+    $PY -m pip install -U pip setuptools wheel
+
+    # Ensure Transformers has the ministral3 config mapping
+    $PY -m pip uninstall -y transformers || true
+    $PY -m pip install -U git+https://github.com/huggingface/transformers
+
+    $PY - << 'PY'
+import transformers
+from transformers.models.auto import CONFIG_MAPPING
+print("Transformers:", transformers.__version__)
+print("ministral3 in CONFIG_MAPPING:", "ministral3" in CONFIG_MAPPING)
+PY
+
 %runscript
-mkdir -p /app
-cd /app
-exec /bin/bash -lc "$@"
+    mkdir -p /app
+    cd /app
+    exec /bin/bash -lc "$@"
+
 %startscript
-mkdir -p /app
-cd /app
-exec /bin/bash -lc "$@"
\ No newline at end of file
+    mkdir -p /app
+    cd /app
+    exec /bin/bash -lc "$@"
\ No newline at end of file

From ddfb3a25a7eeaa54bd30c5858f49e09793025e3f Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sun, 14 Dec 2025 11:10:20 -0600
Subject: [PATCH 30/33] More gracefully exit log on fail when container fails

---
 start_service.sh | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/start_service.sh b/start_service.sh
index 37173e8..5028ff8 100755
--- a/start_service.sh
+++ b/start_service.sh
@@ -267,7 +267,18 @@ elif [ "$RUNMODE" == "singularity" ]; then
         [ "$BUILD" = "true" ] && singularity-compose build "${RUNTYPE}1"
         singularity-compose up "${RUNTYPE}1"
     fi
-    # Follow the logs
-    tail -f logs/*
+
+    # Only follow logs if up succeeded
+    # Make tail die when this script dies (and don't explode if logs don't exist yet)
+    shopt -s nullglob
+    logs=(logs/*)
+    if ((${#logs[@]} > 0)); then
+        tail -F "${logs[@]}" &
+        tail_pid=$!
+        trap 'kill "$tail_pid" >/dev/null 2>&1 || true; cleanup' EXIT
+        wait "$tail_pid"
+    else
+        echo "No logs found under logs/. Skipping tail."
+    fi
 
 fi

From c640078cbba3e74d230464b3785abac3f90ccdac Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Sun, 14 Dec 2025 11:13:00 -0600
Subject: [PATCH 31/33] Removing bad char

---
 singularity/env.sh.example | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/singularity/env.sh.example b/singularity/env.sh.example
index 3a03eb3..fd93b28 100644
--- a/singularity/env.sh.example
+++ b/singularity/env.sh.example
@@ -32,7 +32,7 @@ export CXX=/usr/bin/g++
 export TMPDIR=${PWD}/tmp
 export CUDA_CACHE_PATH=${TMPDIR}/cuda_cache
 export TORCH_EXTENSIONS_DIR=${TMPDIR}/torch_extensions
-export FLASHINFER_JIT_DIR=${TMPDIR}/flashinfer_jitß
+export FLASHINFER_JIT_DIR=${TMPDIR}/flashinfer_jit
 
 # Other VLLM tuning settings
 export VLLM_LOGGING_LEVEL=INFO

From e5382b81d95cbfd6e697087a1f57e455226b5de9 Mon Sep 17 00:00:00 2001
From: Matthew Shaxted <mattshax@gmail.com>
Date: Mon, 15 Dec 2025 15:46:15 -0600
Subject: [PATCH 32/33] Fixing the shm dir issue

---
 start_service.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/start_service.sh b/start_service.sh
index 5028ff8..97dc180 100755
--- a/start_service.sh
+++ b/start_service.sh
@@ -203,7 +203,7 @@ elif [ "$RUNMODE" == "singularity" ]; then
 
     cp singularity/* ./ -Rf
     cp env.sh.example env.sh
-    
+
     VLLM_SERVER_PORT=$(findAvailablePort)
     RAG_PORT=$(findAvailablePort)
     PROXY_PORT=$(findAvailablePort)
@@ -249,6 +249,9 @@ elif [ "$RUNMODE" == "singularity" ]; then
     mkdir -p cache/sagemaker_sessions
     chmod 700 cache/sagemaker_sessions
 
+    mkdir -p /dev/shm/sagemaker_sessions
+    chmod 700 /dev/shm/sagemaker_sessions
+
     # singularity-compose does not support env variables in the yml config file
     if [ "$DOCS_DIR" != "./docs" ];then
         ln -s $DOCS_DIR ./docs

From 8f332332080a58a702405227256bf8805aa70ed5 Mon Sep 17 00:00:00 2001
From: Alvaro Vidal Torreira <alvaro@parallelworks.com>
Date: Fri, 16 Jan 2026 17:40:43 +0100
Subject: [PATCH 33/33] Add support for AECM (#40)

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Updating for support on AECM

* Revert change

* transition code
---
 controller.sh                       |  51 ++
 singularity/env.sh.example          |   1 +
 singularity/singularity-compose.yml |   2 +-
 start_service.sh                    |  35 +-
 yamls/emed.yaml                     | 741 +++++++++++++++-------------
 5 files changed, 459 insertions(+), 371 deletions(-)
 create mode 100644 controller.sh

diff --git a/controller.sh b/controller.sh
new file mode 100644
index 0000000..100eec8
--- /dev/null
+++ b/controller.sh
@@ -0,0 +1,51 @@
+
+
+if [[ "${service_runmode}" == "singularity" ]];then
+    # Check if singularity-compose is installed globally
+    if ! command -v singularity-compose &> /dev/null; then
+        # Check if virtual environment exists and activate it
+        if [ -d ~/pw/software/singularity-compose ]; then
+            source ~/pw/software/singularity-compose/bin/activate
+        fi
+        # Check again if singularity-compose is available after activation
+        if ! command -v singularity-compose &> /dev/null; then
+            echo "$(date) singularity-compose not found, installing..."      
+            # Create directory for Python environment
+            mkdir -p ~/pw/software
+                  
+            # Create virtual environment named singularity-compose and install singularity-compose
+            python3 -m venv ~/pw/software/singularity-compose
+            source ~/pw/software/singularity-compose/bin/activate
+            pip install --upgrade pip
+            pip install singularity-compose
+        fi
+    fi
+    if ! command -v singularity-compose >/dev/null 2>&1; then
+        echo "$(date) Error: Failed to install singularity-compose"
+        exit 1
+    fi
+fi
+
+if ! [ -z "${service_container_bucket}" ]; then
+    # vllm container
+    if [[ ! -f "vllm.sif" ]]; then
+        echo "$(date) vllm.sif not found, pulling from ${service_container_bucket}"
+        pw bucket cp "${service_container_bucket}/vllm.sif" ./
+    else
+        echo "$(date) vllm.sif already exists, skipping pull"
+    fi
+    # rag container (only for runmode=all)
+    if [[ "${service_runmode}" == "all" ]]; then
+        if [[ ! -f "rag.sif" ]]; then
+            echo "$(date) rag.sif not found, pulling from ${service_container_bucket}"
+            pw bucket cp "${service_container_bucket}/rag.sif" ./
+        else
+            echo "$(date) rag.sif already exists, skipping pull"
+        fi
+    fi
+    echo "$(date) Singularity container pull step complete."
+fi
+
+mkdir -p cache/tiktoken_encodings
+wget --no-check-certificate -O cache/tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" || true
+wget --no-check-certificate -O cache/tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" || true
\ No newline at end of file
diff --git a/singularity/env.sh.example b/singularity/env.sh.example
index fd93b28..0b883c0 100644
--- a/singularity/env.sh.example
+++ b/singularity/env.sh.example
@@ -29,6 +29,7 @@ export TRITON_CC=gcc
 export CC=/usr/bin/gcc
 export CXX=/usr/bin/g++
 
+
 export TMPDIR=${PWD}/tmp
 export CUDA_CACHE_PATH=${TMPDIR}/cuda_cache
 export TORCH_EXTENSIONS_DIR=${TMPDIR}/torch_extensions
diff --git a/singularity/singularity-compose.yml b/singularity/singularity-compose.yml
index 1cb2430..8b526a7 100644
--- a/singularity/singularity-compose.yml
+++ b/singularity/singularity-compose.yml
@@ -16,7 +16,7 @@ instances:
     run:
       args:
         - >
-          nohup python3 -m vllm.entrypoints.openai.api_server --model "/__MODEL_BASE__" --tokenizer "/__MODEL_BASE__" --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 &
+          nohup python3 -m vllm.entrypoints.openai.api_server --model "/__MODEL_BASE__" --tokenizer "/__MODEL_BASE__"  --host 0.0.0.0 --port ${VLLM_SERVER_PORT} ${VLLM_EXTRA_ARGS} > /logs/vllm.out 2>&1 &
     volumes:
       - ./logs:/logs
       - ./cache:/root/.cache
diff --git a/start_service.sh b/start_service.sh
index 97dc180..40f215a 100755
--- a/start_service.sh
+++ b/start_service.sh
@@ -30,15 +30,6 @@ install_docker_compose(){
     chmod +x docker-compose
 }
 
-findAvailablePort() {
-    availablePort=$(pw agent open-port)
-    echo ${availablePort}
-    if [ -z "${availablePort}" ]; then
-        echo "$(date) ERROR: No port found. Exiting job"
-        exit 1
-    fi
-}
-
 start_rootless_docker() {
     local MAX_RETRIES=20
     local RETRY_INTERVAL=2
@@ -116,13 +107,15 @@ if [ "$RUNMODE" == "docker" ];then
     cp docker/* ./ -Rf
     cp env.example .env
 
-    VLLM_SERVER_PORT=$(findAvailablePort)
-    PROXY_PORT=$(findAvailablePort)
-
-    if [ "$RUNTYPE" == "vllm" ];then
+    if [ "$RUNTYPE" == "all" ];then
+        VLLM_SERVER_PORT=$(pw agent open-port)
+        PROXY_PORT=${service_port}
+        # TRANSITION CODE
         echo "SESSION_PORT=${VLLM_SERVER_PORT}" > SESSION_PORT
     else
-        echo "SESSION_PORT=${PROXY_PORT}" > SESSION_PORT
+        PROXY_PORT=$(pw agent open-port)
+        VLLM_SERVER_PORT=${service_port}
+        echo "SESSION_PORT=${VLLM_SERVER_PORT}" > SESSION_PORT
     fi
     
     sed -i "s/^VLLM_SERVER_PORT=.*/VLLM_SERVER_PORT=${VLLM_SERVER_PORT}/" .env
@@ -204,15 +197,17 @@ elif [ "$RUNMODE" == "singularity" ]; then
     cp singularity/* ./ -Rf
     cp env.sh.example env.sh
 
-    VLLM_SERVER_PORT=$(findAvailablePort)
-    RAG_PORT=$(findAvailablePort)
-    PROXY_PORT=$(findAvailablePort)
-    CHROMA_PORT=$(findAvailablePort)
+    RAG_PORT=$(pw agent open-port)
+    CHROMA_PORT=$(pw agent open-port)
 
     if [ "$RUNTYPE" == "all" ];then
-        echo "SESSION_PORT=${PROXY_PORT}" > SESSION_PORT
+        VLLM_SERVER_PORT=$(pw agent open-port)
+        PROXY_PORT=${service_port}
+        echo "SESSION_PORT=${VLLM_SERVER_PORT}" > SESSION_PORT 
     else
-        echo "SESSION_PORT=${VLLM_SERVER_PORT}" > SESSION_PORT
+        PROXY_PORT=$(pw agent open-port)
+        VLLM_SERVER_PORT=${service_port}
+        echo "SESSION_PORT=${VLLM_SERVER_PORT}" > SESSION_PORT 
     fi
 
     sed -i "s/^export VLLM_SERVER_PORT=.*/export VLLM_SERVER_PORT=${VLLM_SERVER_PORT}/" env.sh
diff --git a/yamls/emed.yaml b/yamls/emed.yaml
index 1fd976e..e46eb92 100644
--- a/yamls/emed.yaml
+++ b/yamls/emed.yaml
@@ -4,273 +4,200 @@ sessions:
   session:
     redirect: false
     openAI: true
-
 jobs:
-  prepare_job_directory:
+  preprocessing:
+    working-directory: ${{ inputs.service.rundir }}
     ssh:
-      remoteHost: ${{ inputs.resource.ip }}
+      remoteHost: ${{ inputs.cluster.resource.ip }}
     steps:
-      - name: Preparing Run Directory
+      - name: Checkout
+        uses: parallelworks/checkout
+        with:
+          repo: ${{ inputs.service.advanced_settings.repository }}
+          branch: ${{ inputs.service.advanced_settings.repository_branch }}
+      - name: Create Environment File
+        early-cancel: any-job-failed
         run: |
-          set -x
-          mkdir -p $(dirname ${{ inputs.rundir }})
-          git clone -b ${{ inputs.advanced_settings.repository_branch }} ${{ inputs.advanced_settings.repository }} ${{ inputs.rundir }}
-          cd ${{ inputs.rundir }}
-          git checkout ${{ inputs.advanced_settings.repository_branch }}
-          git branch --set-upstream-to=origin/${{ inputs.advanced_settings.repository_branch }}
-          git pull
-          rm -f jobid SESSION_PORT job.started job.ended run.out HOSTNAME
+          # FIXME: remove when issue 11915 is fixed
+          cp -rfT ${PW_PARENT_JOB_DIR}/ ./
+          rm -f SESSION_PORT job.started job.ended run.out HOSTNAME
           rm -rf logs
-      - name: Install Singularity Compose
-        if: ${{ inputs.runmode == 'singularity' }}
-        early-cancel: any-job-failed
+          cat > .run.env << 'EOF'
+          export RUNMODE="${{ inputs.service.runmode }}"
+          export BUILD="${{ inputs.service.build }}"
+          export RUNTYPE="${{ inputs.service.runtype }}"
+          export SYSTEM_PROMPT="${{ inputs.service.systemprompt }}"
+          export HF_TOKEN="${{ inputs.service.hftoken }}"
+          export MODEL_NAME="${{ inputs.service.hfmodel }}"
+          export API_KEY="${{ inputs.service.apikey }}"
+          export DOCS_DIR="${{ inputs.service.docsdir }}"
+          export VLLM_EXTRA_ARGS="${{ inputs.service.vllm_extra_args }}"
+          export TRANSFORMERS_OFFLINE=1
+          export TIKTOKEN_ENCODINGS_BASE="/root/.cache/tiktoken_encodings"
+          export VLLM_ATTENTION_BACKEND="${{ inputs.service.advanced_settings.vllm_attention_backend }}"
+          EOF
+      - name: Controller Preprocessing
         run: |
-          # Check if singularity-compose is installed globally
-          if ! command -v singularity-compose &> /dev/null; then
-              # Check if virtual environment exists and activate it
-              if [ -d ~/pw/software/singularity-compose ]; then
-                  source ~/pw/software/singularity-compose/bin/activate
-              fi
-              # Check again if singularity-compose is available after activation
-              if ! command -v singularity-compose &> /dev/null; then
-                  echo "$(date) singularity-compose not found, installing..."
-                  
-                  # Create directory for Python environment
-                  mkdir -p ~/pw/software
-                  
-                  # Create virtual environment named singularity-compose and install singularity-compose
-                  python3 -m venv ~/pw/software/singularity-compose
-                  source ~/pw/software/singularity-compose/bin/activate
-                  pip install --upgrade pip
-                  pip install singularity-compose
-              fi
-          fi
-          if ! command -v singularity-compose >/dev/null 2>&1; then
-            echo "$(date) Error: Failed to install singularity-compose"
-            exit 1
-          fi
-
-      - name: Create Environment File
-        early-cancel: any-job-failed
+          set -x
+          export service_runmode=${{ inputs.service.runmode }}
+          bash controller.sh
+      - name: Create Service Script
         run: |
           set -x
-          cd ${{ inputs.rundir }}
-          echo "export RUNMODE=${{ inputs.runmode  }}" > .run.env
-          echo "export BUILD=${{ inputs.build  }}" >> .run.env
-          echo "export RUNTYPE=${{ inputs.runtype  }}" >> .run.env
-          echo "export SYSTEM_PROMPT=\"${{ inputs.systemprompt }}\"" >> .run.env
-          echo "export HF_TOKEN=${{ inputs.hftoken  }}" >> .run.env
-          echo "export API_KEY=${{ inputs.apikey  }}" >> .run.env
-          echo "export MODEL_NAME=${{ inputs.hfmodel  }}" >> .run.env
-          echo "export DOCS_DIR=${{ inputs.docsdir }}" >> .run.env
-          echo "export VLLM_EXTRA_ARGS=\"${{ inputs.vllm_extra_args }}\"" >> .run.env
-          echo "export TRANSFORMERS_OFFLINE=1" >> .run.env
+          # Write code common to all services
+          cat > start_service_mod.sh << 'EOF'
 
-  slurm_job:
-    needs:
-      - prepare_job_directory
-    if: ${{ inputs.execmethod == 'SLURM' }}
-    ssh:
-      remoteHost: ${{ inputs.resource.ip }}
-    steps:
-      - name: Create SLURM Script
-        early-cancel: any-job-failed
-        run: |
-          cd ${{ inputs.rundir }}
-          echo '#!/bin/bash' > run.sh
-          chmod +x run.sh
-          if [[ "${{ inputs.slurm.partition }}" != "undefined" ]]; then
-            echo "#SBATCH --partition=${{ inputs.slurm.partition }}" >> run.sh
-          fi
-          echo "#SBATCH --gres=gpu:${{ inputs.slurm.number_of_gpus }}" >> run.sh
-          echo "#SBATCH --cpus-per-task=${{ inputs.slurm.cpus_per_task }}" >> run.sh
-          echo "#SBATCH --mem=${{ inputs.slurm.memory }}" >> run.sh
-          echo "#SBATCH --chdir=${PWD}" >> run.sh
-          echo "#SBATCH -o ${PWD}/run.out" >> run.sh
-          echo "#SBATCH -e ${PWD}/run.out" >> run.sh
-          if [[ "${{ inputs.slurm.scheduler_directives }}" != "undefined" ]]; then
-            echo "${{ inputs.slurm.scheduler_directives }}" >> run.sh
+          if [ -z "${service_port}" ]; then
+            service_port=$(pw agent open-port)
           fi
-          
 
-          # Indicates job started running
-          echo "touch job.started" >> run.sh
-          echo "hostname >> HOSTNAME" >> run.sh
-
-          echo "module load singularity" >> run.sh
-          
-          cat start_service.sh >> run.sh
-      - name: Submit SLURM Script
-        run: |
-          cd ${{ inputs.rundir }}
-          echo "$(date) Submitting SLURM Job"
-          jobid=$(sbatch run.sh | tail -1 | awk -F ' ' '{print $4}')
-          if [ -z "${jobid}" ]; then
-            echo "$(date) Job submission failed"
-            exit 1
+          if [ -z "${service_port}" ]; then
+            echo "$(date) ERROR: No service port found"
+            exit 1            
           fi
-          echo "jobid=${jobid}"  | tee -a $OUTPUTS | tee -a jobid
-        cleanup: |
-          set -x
-          cd ${{ inputs.rundir }}
-          #jobid=${{ needs.slurm_job.outputs.jobid }}
-          source jobid
-          target_hostname=$(squeue -j "${jobid}" --noheader --format="%N")
-          ssh ${target_hostname} bash cancel.sh
-          scancel ${jobid}
-          rm -f jobid SESSION_PORT job.started HOSTNAME
-      - name: Monitor SLURM Job
-        run: |
-          cd ${{ inputs.rundir }}
-          
-          #jobid=${{ needs.slurm_job.outputs.jobid }}
-          max_retries=10
-          count=0
-
-          while [ $count -lt $max_retries ]; do
-            ls
-            source jobid
-            if [ -z "${jobid}" ]; then
-              echo "$(date) Job ID is empty. Retry $((count+1))/$max_retries"
-              count=$((count+1))
-              sleep 5
-            else
-              break
-            fi
-          done
-          echo "$(date) Monitoring SLURM job ${jobid}"
+          echo ${service_port} > SESSION_PORT
+          hostname > HOSTNAME
 
-          cd ${{ inputs.rundir }}
-          touch run.out
-          tail -f run.out &
-          echo &! > tail.pid
-
-          get_slurm_job_status() {
-              # Get the header line to determine the column index corresponding to the job status
-              if [ -z "${SQUEUE_HEADER}" ]; then
-                  export SQUEUE_HEADER="$(eval squeue | awk 'NR==1')"
-              fi
-              status_column=$(echo "${SQUEUE_HEADER}" | awk '{ for (i=1; i<=NF; i++) if ($i ~ /^S/) { print i; exit } }')
-              status_response=$(eval squeue | awk -v jobid="${jobid}" '$1 == jobid')
-              echo "${SQUEUE_HEADER}"
-              echo "${status_response}"
-              export job_status=$(echo ${status_response} | awk -v id="${jobid}" -v col="$status_column" '{print $col}')
+          cleanup() {
+              echo "$(date) Cleaning up..."
+              kill -- -$$
           }
 
-          while true; do
-            sleep 15
-            get_slurm_job_status
-            if [ -z "${job_status}" ]; then
-              job_status=$(sacct -j ${jobid}  --format=state | tail -n1)
-              echo "$(date) Job exited with status ${job_status}"
-              touch job.ended
-              exit 0
-            fi
-          done
-        cleanup: |
-          set -x
-          cd ${{ inputs.rundir }}
-          kill $(cat tail.pid)
+          trap cleanup EXIT INT TERM
 
-  ssh_job:
+          echo
+          echo
+          echo "$(date) STARTING SERVICE"
+          echo
+          touch job.started
+          EOF
+          pwd
+          cat start_service_mod.sh
+          cat start_service.sh >> start_service_mod.sh
+  session_runner:
+    working-directory: ${{ inputs.service.rundir }}
     needs:
-      - prepare_job_directory
-    if: ${{ inputs.execmethod == 'SSH' }}
+      - preprocessing
     ssh:
-      remoteHost: ${{ inputs.resource.ip }}
+      remoteHost: ${{ inputs.cluster.resource.ip }}
     steps:
-      - name: Create SSH Script
+      - uses: marketplace/script_submitter/v3.5
+        early-cancel: any-job-failed
+        with:
+          resource: ${{ inputs.cluster.resource }}
+          shebang: '#!/bin/bash'
+          rundir: ${{ inputs.service.rundir }}
+          use_existing_script: true
+          script_path: ${{ inputs.service.rundir }}/start_service_mod.sh
+          scheduler: ${{ inputs.cluster.scheduler }}
+          slurm:
+            is_disabled: ${{ inputs.cluster.slurm.is_disabled }}
+            slurm_options: ${{ inputs.cluster.slurm.slurm_options }}
+            partition_default: ${{ inputs.cluster.slurm.partition_default }}
+            partition_hpc4: ${{ inputs.cluster.slurm.partition_hpc4 }}
+            cpus_per_task: ${{ inputs.cluster.slurm.cpus_per_task }}
+            mem: ${{ inputs.cluster.slurm.mem }}
+            gres_gpu_default: ${{ inputs.cluster.slurm.gres_gpu_default }}
+            gres_gpu_hpc4: ${{ inputs.cluster.slurm.gres_gpu_hpc4 }}
+            time: ${{ inputs.cluster.slurm.time }}
+            scheduler_directives: ${{ inputs.cluster.slurm.scheduler_directives }}
+          pbs:
+            is_disabled: ${{ inputs.cluster.pbs.is_disabled }}
+            scheduler_directives: ${{ inputs.cluster.pbs.scheduler_directives }}
+      - name: Notify job ended
         early-cancel: any-job-failed
         run: |
-          cd ${{ inputs.rundir }}
-          echo '#!/bin/bash' > run.sh
-          chmod +x run.sh
-
-          # Indicates job started running
-          echo "touch job.started" >> run.sh
-          echo "hostname >> HOSTNAME" >> run.sh
-
-          cat start_service.sh >> run.sh
-      - name: Submit SSH Script
-        run: |
-          cd ${{ inputs.rundir }}
-          bash ./run.sh
-          touch job.ended
-          rm -f jobid SESSION_PORT job.started HOSTNAME
-        cleanup: |
           set -x
-          cd ${{ inputs.rundir }}
-          bash cancel.sh
-
-  create_session:
+          pwd
+          touch job.ended
+          ls -lat job.ended
+  wait_for_job_start:
+    working-directory: ${{ inputs.service.rundir }}
     needs:
-      - prepare_job_directory
+      - preprocessing
     ssh:
-      remoteHost: ${{ inputs.resource.ip }}
+      remoteHost: ${{ inputs.cluster.resource.ip }}
     steps:
       - name: Wait for job to start
         early-cancel: any-job-failed
         run: |
           set -x
-          while [ ! -f ${{ inputs.rundir }}/job.started ]; do
-            echo "Waiting for job to start..."
+          while [ ! -f job.started ]; do
+            if [ -f job.ended ]; then
+              echo "$(date) ERROR: Job ended before it started. Exiting."
+              exit 1
+            fi
+            echo "$(date) Waiting for job to start..."
             sleep 5
           done
       - name: Get Hostname
         early-cancel: any-job-failed
         run: |
           set -x
-          cd ${{ inputs.rundir }}
-          if [[ ${{ inputs.execmethod }} == "SLURM" ]]; then
-            source jobid
-            target_hostname=$(squeue -j "${jobid}" --noheader --format="%N")
-            # Add a retry for AECM
-            if [ -z "${target_hostname}" ]; then
-              sleep 30
-              target_hostname=$(squeue -j "${jobid}" --noheader --format="%N")
-            fi
-            echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS
-          elif [[ ${{ inputs.execmethod }} == "SSH" ]]; then
-            target_hostname=$(hostname)
-            echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS
-          fi
-          if [ -z "${target_hostname}" ]; then
+          HOSTNAME=$(cat HOSTNAME | cut -d'.' -f1)
+          echo "HOSTNAME=${HOSTNAME}" | tee -a $OUTPUTS
+
+          if [ -z "${HOSTNAME}" ]; then
             echo "$(date) Failed to get target hostname"
             exit 1
           fi
+          sleep 5
+  cleanup:
+    working-directory: ${{ inputs.service.rundir }}
+    if: ${{ always }}
+    needs:
+      - session_runner
+      - wait_for_job_start
+    ssh:
+      remoteHost: ${{ inputs.cluster.resource.ip }}
+    steps:
+      - name: Controller cleanup
+        if: ${{ inputs.cluster.slurm.is_disabled && inputs.cluster.pbs.is_disabled }}
+        run: echo "$(date) Cleaning up..."
+        cleanup: |
+          set -x
+          if [ -f cancel.sh ]; then
+            bash cancel.sh
+          fi
+      - name: Compute cleanup
+        if: ${{ (inputs.cluster.slurm.is_disabled == false || inputs.cluster.pbs.is_disabled == false) }}
+        run: echo "Cleaning up..."
+        cleanup: |
+          set -x
+          remote_host="${{ needs.wait_for_job_start.outputs.HOSTNAME }}"
+          if [ -z "${remote_host}" ]; then
+            echo "$(date) WARNING: Compute node's hostname is missing. Exiting step."
+            exit 0
+          fi
+          sshcmd="ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${remote_host}"
+          if [ -f cancel.sh ]; then
+            ${sshcmd} 'bash -s' < ${PWD}/cancel.sh
+          fi
+  create_session:
+    working-directory: ${{ inputs.service.rundir }}
+    needs:
+      - wait_for_job_start
+    ssh:
+      remoteHost: ${{ inputs.cluster.resource.ip }}
+    steps:
       - name: Get Session Port
         early-cancel: any-job-failed
         run: |
-          set -euo pipefail
           set -x
+          SESSION_PORT=$(cat SESSION_PORT)
+          echo "SESSION_PORT=${SESSION_PORT}" | tee -a $OUTPUTS
 
-          TIMEOUT=5
-          RETRY_INTERVAL=3
-          cd ${{ inputs.rundir }}
-
-          attempt=1
-          while true; do
-              echo "$(date) Attempt $attempt: Checking for SESSION_PORT file..."
-
-              if [ -f SESSION_PORT ]; then
-                  echo "$(date) Success: SESSION_PORT file found!"
-                  cat SESSION_PORT | tee -a "$OUTPUTS"
-                  exit 0
-              elif [ -f job.ended ]; then
-                  echo "$(date) Job was completed but SESSION_PORT was never created. Exiting..."
-                  exit 1
-              else
-                  echo "$(date) SESSION_PORT not found. Retrying in ${RETRY_INTERVAL} seconds..."
-                  sleep "$RETRY_INTERVAL"
-                  ((attempt++))
-              fi
-          done
+          if [ -z "${SESSION_PORT}" ]; then
+            echo "$(date) Failed to get target session's port"
+            exit 1
+          fi
+          sleep 5
       - name: Wait for Server To Start
         early-cancel: any-job-failed
         run: |
           TIMEOUT=5
           RETRY_INTERVAL=3
-          remote_host="${{ needs.create_session.outputs.target_hostname }}"
+          remote_host="${{ needs.wait_for_job_start.outputs.HOSTNAME }}"
           remote_port="${{ needs.create_session.outputs.SESSION_PORT }}"
 
           # Function to check if server is listening
@@ -279,8 +206,6 @@ jobs:
               return $?
           }
 
-          cd ${{ inputs.rundir }}
-
           # Main loop
           attempt=1
           while true; do
@@ -288,6 +213,7 @@ jobs:
               
               if check_server; then
                   echo "$(date) Success: Server is listening on ${remote_host}:${remote_port}!"
+                  sleep 40
                   exit 0
               elif [ -f job.ended ]; then
                   echo "$(date) Job was completed. Exiting... "
@@ -298,145 +224,260 @@ jobs:
                   ((attempt++))
               fi
           done
+          sleep 5
       - name: Update Session
         uses: parallelworks/update-session
         with:
-          remotePort: '${{ needs.create_session.outputs.SESSION_PORT }}'
-          target: '${{ inputs.resource.id }}'
-          name: '${{ sessions.session }}'
-          remoteHost: '${{ needs.create_session.outputs.target_hostname }}'
-
-
+          target: ${{ inputs.cluster.resource.id }}
+          name: ${{ sessions.session }}
+          remoteHost: ${{ needs.wait_for_job_start.outputs.HOSTNAME }}
+          remotePort: ${{ needs.create_session.outputs.SESSION_PORT }}
+          localPort: ${{ inputs.service.localport }}
 'on':
   execute:
     inputs:
-      resource:
-        type: compute-clusters
-        label: Compute Cluster
-        autoselect: true
-        include-workspace: false
-        tooltip: Resource to run the service
-      execmethod:
-        type: dropdown
-        label: Execution Method
-        default: SLURM
-        tooltip: Choose whether to run the job directly via SSH or submit it to a SLURM queue
-        hidden: true
-        options:
-          - value: SSH
-            label: SSH
-          - value: SLURM
-            label: SLURM
-      runmode:
-        label: Execution Mode
-        type: dropdown
-        default: singularity
-        hidden: true
-        options:
-          - value: docker
-            label: Docker
-          - value: singularity
-            label: Singularity
-      rundir:
-        label: Run Directory
-        default: ~/pw/activate-rag-vllm
-        type: string
-      runtype:
-        label: Run Type
-        type: dropdown
-        options:
-          - value: all
-            label: vLLM+RAG
-          - value: vllm
-            label: vLLM Only
-      build:
-        label: Build Containers
-        type: boolean
-        default: false
-      hfmodel:
-        label: HF Model
-        default: meta-llama/Llama-3.1-8B-Instruct
-        type: string
-      vllm_extra_args:
-        label: VLLM Extra Args
-        default: "--dtype float16 --max-model-len 4096 --gpu-memory-utilization=0.95 --max-num-seqs 1 --trust_remote_code"
-        placeholder: "--dtype float16 --max-model-len 4096 --gpu-memory-utilization=0.95 --max-num-seqs 1 --trust_remote_code"
-        type: string
-      hftoken:
-        label: HF Token (gated models)
-        optional: true
-        default: ${{ org.HF_TOKEN }}
-        type: password
-      apikey:
-        label: vLLM API Key
-        optional: true
-        tooltip: Required for integration with Cline and other code assist tools.
-        type: password
-      docsdir:
-        label: RAG Directory
-        hidden: ${{ inputs.runtype != 'all' }}
-        optional: true
-        default: ./docs
-        type: string
-      systemprompt:
-        type: string
-        label: System Prompt
-        hidden: ${{ inputs.runtype != 'all' }}
-        textarea: true
-        optional: true
-        default: You are a careful assistant. Use ONLY the provided context blocks to answer. Each block is numbered [1], [2], … and includes source metadata. When you use information from a block, you MUST cite it inline with [n]. At the end of your response, include a 'References' section with one reference per line formatted as [n] file_path (chunk index). Do not invent citations or sources. If the context does not contain the answer, say so briefly.
-      slurm:
+      cluster:
         type: group
-        label: SLURM Directives
-        hidden: ${{ inputs.execmethod != 'SLURM' }}
+        label: Compute Cluster Settings
         items:
-          partition:
-            type: slurm-partitions
-            label: SLURM partition
-            ignore: ${{ inputs.execmethod != 'SLURM' }}
-            optional: true
-            resource: ${{ inputs.resource }}
+          resource:
+            type: compute-clusters
+            label: Service host
+            include-workspace: false
+            tooltip: Resource to host the service
+            autoselect: true
+          scheduler:
+            type: boolean
+            default: true
+            label: Schedule Job?
+            hidden: true
             tooltip: |
-              Partition to submit the interactive job. Leave empty to let SLURM pick
-              the optimal option.
-          cpus_per_task:
-            type: number
-            label: CPUs per task
-            ignore: ${{ inputs.execmethod != 'SLURM' }}
-            min: 1
-            max: 32
-            default: 1
-            tooltip: '--cpus-per-task=value SLURM directive'
-          memory:
-            type: string
-            label: Minimum Total Memory Required
-            ignore: ${{ inputs.execmethod != 'SLURM' }}
-            default: 8GB
-            tooltip: '--mem=value SLURM directive'
-          number_of_gpus:
-            type: number
-            label: Number of GPUs
-            ignore: ${{ inputs.execmethod != 'SLURM' }}
-            min: 1
-            max: 4
-            default: 1
-            tooltip: '--gres=gpu:X slurm directive'
-          scheduler_directives:
-            type: editor
-            ignore: ${{ inputs.execmethod != 'SLURM' }}
-            optional: true
-            tooltip: |
-              Type in additional scheduler directives. 
-      advanced_settings:
+              Yes → Job is submitted to the scheduler using sbatch, qsub, etc
+              No  → Job is executed in the controller or login node instead
+          slurm:  # SLURM-only settings; hidden and ignored for a non-SLURM existing cluster or when cluster.scheduler is false
+            type: group
+            label: SLURM Directives
+            hidden: ${{ inputs.cluster.resource.provider == 'existing' && inputs.cluster.resource.schedulerType != 'slurm' || inputs.cluster.scheduler == false }}
+            ignore: ${{ inputs.cluster.resource.provider == 'existing' && inputs.cluster.resource.schedulerType != 'slurm' || inputs.cluster.scheduler == false }}
+            items:
+              slurm_options:  # the selected value drives which partition_* input below is active
+                type: dropdown
+                label: Select Cluster
+                optional: true
+                default: ''
+                options:
+                  - value: ''
+                    label: Default
+                  - value: '-M hpc4'
+                    label: HPC4
+              is_disabled:  # hidden flag whose default mirrors the group's hidden/ignore condition
+                type: boolean
+                hidden: true
+                default: ${{ inputs.cluster.resource.provider == 'existing' && inputs.cluster.resource.schedulerType != 'slurm' || inputs.cluster.scheduler == false }}
+                label: Is SLURM disabled?
+              partition_default:  # ignored and hidden when slurm_options is '-M hpc4'
+                type: slurm-partitions
+                label: SLURM partition
+                ignore: ${{ '-M hpc4' == inputs.cluster.slurm.slurm_options }}
+                hidden: ${{ .ignore }}
+                optional: true
+                resource: ${{ inputs.cluster.resource }}
+                tooltip: Select a partition from the drop down menu. Leave empty to let SLURM pick a partition.
+              partition_hpc4:  # static partition list; active only when slurm_options is '-M hpc4'
+                type: dropdown
+                label: SLURM partition
+                optional: true
+                tooltip: Select a partition from the drop down menu. Leave empty to let SLURM pick a partition.
+                ignore: ${{ '-M hpc4' != inputs.cluster.slurm.slurm_options }}
+                hidden: ${{ .ignore }}
+                default: normal
+                options:
+                  - normal
+                  - gpu
+                  - gpu-h200
+                  - gpu-quick
+                  - ht
+                  - large-mem
+                  - quick
+                  - test
+                  - unlimited
+              cpus_per_task:  # ignored and hidden unless the resource provider is 'existing'
+                type: number
+                label: CPUs per task
+                min: 1
+                max: 32
+                default: 1
+                tooltip: '--cpus-per-task=value slurm directive'
+                ignore: ${{ 'existing' != inputs.cluster.resource.provider }}
+                hidden: ${{ .ignore }}
+              mem:  # ignored and hidden unless the resource provider is 'existing'
+                type: string
+                label: Minimum total memory required
+                default: 32GB
+                tooltip: '--mem=value slurm directive'
+                hidden: ${{ 'existing' != inputs.cluster.resource.provider }}
+                ignore: ${{ .hidden }}
+                optional: true
+              gres_gpu_default:  # active only for partition_default 'gpu'/'gpu-quick' on an 'existing' provider
+                type: number
+                label: Number of GPUs
+                ignore: ${{ ( inputs.cluster.slurm.partition_default != 'gpu' && inputs.cluster.slurm.partition_default != 'gpu-quick' ) || 'existing' != inputs.cluster.resource.provider  }}
+                hidden: ${{ .ignore }}
+                min: 1
+                max: 4
+                default: 4
+                tooltip: '--gres=gpu:X slurm directive'
+              gres_gpu_hpc4:  # active only for partition_hpc4 'gpu'/'gpu-quick'/'gpu-h200' on an 'existing' provider
+                type: number
+                label: Number of GPUs
+                hidden: ${{ ( inputs.cluster.slurm.partition_hpc4 != 'gpu' && inputs.cluster.slurm.partition_hpc4 != 'gpu-quick' && inputs.cluster.slurm.partition_hpc4 != 'gpu-h200' ) || 'existing' != inputs.cluster.resource.provider  }}
+                ignore: ${{ .hidden }}
+                optional: ${{ .hidden }}
+                min: 1
+                max: 4
+                default: 4
+                tooltip: '--gres=gpu:X slurm directive'
+              time:
+                label: Walltime
+                type: string
+                default: '01:00:00'
+                tooltip: '--time= SLURM directive to set the maximum wall-clock time limit for the job'
+              scheduler_directives:
+                type: editor
+                optional: true
+                tooltip: |
+                  Type in additional scheduler directives. 
+          pbs:  # PBS-only settings; hidden and ignored unless schedulerType is 'pbs' and cluster.scheduler is not false
+            type: group
+            label: PBS Directives
+            hidden: ${{ inputs.cluster.resource.schedulerType != 'pbs' || inputs.cluster.scheduler == false }}
+            ignore: ${{ inputs.cluster.resource.schedulerType != 'pbs' || inputs.cluster.scheduler == false }}
+            items:
+              is_disabled:  # hidden flag whose default mirrors the group's hidden/ignore condition
+                type: boolean
+                hidden: true
+                default: ${{ inputs.cluster.resource.schedulerType != 'pbs' || inputs.cluster.scheduler == false }}
+                label: Is PBS disabled?
+              scheduler_directives:
+                label: Scheduler Directives
+                type: editor
+                tooltip: Type the PBS scheduler directives
+      service:
         type: group
-        label: Advanced Settings
-        collapsed: true
+        label: Service
         items:
-          repository:
+          runmode:  # container runtime selector; hidden when the selected cluster's provider is 'existing'
+            label: Execution Mode
+            type: dropdown
+            default: singularity
+            hidden: ${{ 'existing' == inputs.cluster.resource.provider }}
+            options:
+              - value: docker
+                label: Docker
+              - value: singularity
+                label: Singularity
+          rundir:
+            label: Run Directory
+            default: ${HOME}/pw/activate-rag-vllm2
             type: string
-            label: Repository
-            default: https://github.com/parallelworks/activate-rag-vllm.git
-          repository_branch:
+          runtype:
+            label: Run Type
+            type: dropdown
+            options:
+              - value: vllm
+                label: vLLM Only
+              - value: all
+                label: vLLM+RAG
+          build:
+            label: Build Containers
+            type: boolean
+            default: false
+          pull:
+            label: Pull Containers
+            hidden: ${{ inputs.service.runmode == 'docker' }}
+            type: boolean
+            default: true
+          hfmodel:
+            label: HF Model
+            default: /gs/gsfs0/home/avidaltorr/pw/software/Llama-3_3-Nemotron-Super-49B-v1_5
+            type: string
+          vllm_extra_args:
+            label: VLLM Extra Args
+            default: '--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --gpu-memory-utilization 0.85'
+            placeholder: '--dtype bfloat16 --trust_remote_code --tensor-parallel-size 4 --async-scheduling --gpu-memory-utilization 0.85'
+            type: string
+          hftoken:
+            label: HF Token (gated models)
+            optional: true
+            default: ${{ org.HF_TOKEN }}
+            type: password
+          apikey:
+            label: vLLM API Key
+            optional: true
+            tooltip: Required for integration with Cline and other code assist tools.
+            type: password
+          docsdir:
+            label: RAG Directory
+            hidden: ${{ inputs.service.runtype != 'all' }}
+            optional: true
+            default: ./docs
+            type: string
+          systemprompt:
+            type: string
+            label: System Prompt
+            hidden: ${{ inputs.service.runtype != 'all' }}
+            textarea: true
+            optional: true
+            default: You are a careful assistant. Use ONLY the provided context blocks to answer. Each block is numbered [1], [2], … and includes source metadata. When you use information from a block, you MUST cite it inline with [n]. At the end of your response, include a 'References' section with one reference per line formatted as [n] file_path (chunk index). Do not invent citations or sources. If the context does not contain the answer, say so briefly.
+          localport:
+            label: User Workspace Port
+            default: '5555'
+            tooltip: Port that runs within the user workspace and used to connect to the code assist and chat interfaces.
             type: string
-            label: Repository Branch
-            default: main
+          advanced_settings:
+            type: group
+            label: Advanced Settings
+            collapsed: true
+            items:
+              repository:
+                type: string
+                label: Repository
+                default: https://github.com/parallelworks/activate-rag-vllm.git
+              repository_branch:
+                type: string
+                label: Repository Branch
+                default: nemotron-aecm
+              tiktoken_encodings:
+                label: Pull Encodings
+                tooltip: For GPT-OSS pull the tiktoken encodings.
+                type: boolean
+                default: false
+              vllm_attention_backend:
+                type: dropdown
+                label: VLLM Attention Backend
+                default: FLASH_ATTN
+                tooltip: Select the attention backend implementation used by vLLM
+                options:
+                  - value: FLASH_ATTN
+                  - value: TRITON_ATTN
+                  - value: ROCM_ATTN
+                  - value: ROCM_AITER_MLA
+                  - value: ROCM_AITER_TRITON_MLA
+                  - value: ROCM_AITER_FA
+                  - value: ROCM_AITER_MLA_SPARSE
+                  - value: TORCH_SDPA
+                  - value: FLASHINFER
+                  - value: FLASHINFER_MLA
+                  - value: TRITON_MLA
+                  - value: CUTLASS_MLA
+                  - value: FLASHMLA
+                  - value: FLASHMLA_SPARSE
+                  - value: FLASH_ATTN_MLA
+                  - value: PALLAS
+                  - value: IPEX
+                  - value: NO_ATTENTION
+                  - value: FLEX_ATTENTION
+                  - value: TREE_ATTN
+                  - value: ROCM_AITER_UNIFIED_ATTN
+                  - value: CPU_ATTN
+                  - value: CUSTOM