From 4de2d1c84f4572c7f093aab9f732d02fca7351d8 Mon Sep 17 00:00:00 2001
From: nhuytan1
Date: Mon, 28 Jul 2025 11:22:53 -0500
Subject: [PATCH 1/5] add sam2-model code from local machine

---
 workflow/yamls/k8s/sam2-model/general.md   |  48 +++
 workflow/yamls/k8s/sam2-model/general.yaml | 332 +++++++++++++++++++++
 2 files changed, 380 insertions(+)
 create mode 100644 workflow/yamls/k8s/sam2-model/general.md
 create mode 100644 workflow/yamls/k8s/sam2-model/general.yaml

diff --git a/workflow/yamls/k8s/sam2-model/general.md b/workflow/yamls/k8s/sam2-model/general.md
new file mode 100644
index 00000000..4421fae3
--- /dev/null
+++ b/workflow/yamls/k8s/sam2-model/general.md
@@ -0,0 +1,48 @@
+# Video Object Tracking using SAM2 model on Kubernetes
+
+This workflow launches a GPU-powered video object tracking interface on a Kubernetes cluster. Users can upload a video, select an object in the first frame, and run the tracking process. Once complete, both the tracked and stacked output videos are available for download.
+
+## Quick Start
+
+- **Select a Kubernetes Cluster:** Choose your target K8s cluster.
+- **Set Namespace:** Specify the namespace to deploy in (e.g., `default`, `summer2025interns`).
+- **Choose Number of GPUs:** Define how many GPUs (or MIG instances) to allocate for the workload.
+- **Run the Workflow:** Launch the interface and wait for the deployment to be ready.
+
+---
+
+## Using the Web Interface
+
+Once the UI is available, follow these steps:
+
+- **Upload a Video:**
+  - Accepted formats: `.mp4`, `.mov`
+  - Recommended: less than 15 seconds and under 1080p resolution for best performance
+
+- **Select an Object:**
+  - Use the interactive canvas to click on the target object in the first frame
+  - This initializes the tracking point for segmentation
+
+- **Run Tracking:**
+  - Start the segmentation and tracking pipeline
+  - The system processes the video on the GPU (or falls back to CPU if needed)
+
+---
+
+## GPU Acceleration & MIG
+
+For best performance, the workflow runs on GPU-enabled nodes.
+MIG (Multi-Instance GPU) support allows multiple jobs to run concurrently with isolated memory and compute slices.
+This ensures efficient resource usage when running multiple video tracking sessions in parallel.
+
+---
+
+## Output
+
+Once processing completes:
+
+- **Tracked Video:** Shows the object followed across frames with a visual overlay
+- **Stacked Video:** Displays input/output side-by-side for comparison
+- Both files will be available for download directly from the interface
+
+---
diff --git a/workflow/yamls/k8s/sam2-model/general.yaml b/workflow/yamls/k8s/sam2-model/general.yaml
new file mode 100644
index 00000000..06e03521
--- /dev/null
+++ b/workflow/yamls/k8s/sam2-model/general.yaml
@@ -0,0 +1,332 @@
+permissions:
+  - '*'
+sessions:
+  session:
+    useTLS: false
+    redirect: true
+    useCustomDomain: true
+app:
+  target: inputs.k8s.cluster
+jobs:
+  auth_k8s:
+    steps:
+      - name: Authenticate kubectl
+        run: pw kube auth ${{ inputs.k8s.cluster }}
+  prepare_k8s_pvc:
+    needs:
+      - auth_k8s
+    steps:
+      - name: Creating New PVC YAML
+        if: ${{ inputs.k8s.volumes.pvc === 'New' }}
+        run: |
+          pvc_name="${{ inputs.k8s.volumes.pvc_name }}"
+          pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }}
+          if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then
+            default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)' | awk '{print $1}')
+            if [ $? -ne 0 ]; then
+              echo "WARNING: Could not obtain default storageClass with command:"
+              echo "         kubectl get storageclass -n ${{ inputs.k8s.namespace }}"
+              echo "         Using empty storageClassName"
+              storageClassName=""
+            elif [ -z "${default_class}" ]; then
+              echo "ERROR: No default storage class found. You must specify one explicitly."
+              exit 1
+            else
+              storageClassName="storageClassName: ${default_class}"
+            fi
+          else
+            storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}"
+          fi
+          echo "${pvc_name}" > pvc_name
+          cat <<EOF > test-pvc.yaml
+          apiVersion: v1
+          kind: PersistentVolumeClaim
+          metadata:
+            name: ${pvc_name}
+            namespace: ${{ inputs.k8s.namespace }}
+          spec:
+            accessModes:
+              - ReadWriteOnce
+            resources:
+              requests:
+                storage: ${{ inputs.k8s.volumes.pvc_storage_size }}
+            ${storageClassName}
+          EOF
+          cat test-pvc.yaml
+      - name: Dry Run PVC
+        if: ${{ inputs.k8s.volumes.pvc === 'New' }}
+        run: |
+          echo "Performing dry run..."
+          kubectl apply -f test-pvc.yaml --dry-run=client
+      - name: Apply PVC
+        if: ${{ inputs.k8s.volumes.pvc === 'New' }}
+        run: kubectl apply -f test-pvc.yaml
+        cleanup: |
+          if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then
+            kubectl delete -f test-pvc.yaml
+            touch pvc.deleted
+          fi
+  deploy_sam2:
+    needs:
+      - prepare_k8s_pvc
+    steps:
+      - name: Generate Deployment YAML
+        run: |
+          if [[ "${{ inputs.resources.gpu_type }}" == "Custom" ]]; then
+            gpu_limits="${{ inputs.resources.gpu_resource_key }}: ${{ inputs.resources.gpu_count }}"
+          elif [[ "${{ inputs.resources.gpu_type }}" != "None" ]]; then
+            gpu_limits="${{ inputs.resources.gpu_type }}: ${{ inputs.resources.gpu_count }}"
+          fi
+          if kubectl get runtimeclass nvidia &>/dev/null; then
+            echo "nvidia RuntimeClass is available"
+            runtimeClassName="runtimeClassName: nvidia"
+          fi
+          if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then
+            pvc_name=${{ inputs.k8s.volumes.pvc_existing }}
+          else
+            pvc_name=$(cat pvc_name)
+          fi
+          cat <<EOF > sam2-deployment.yaml
+          apiVersion: apps/v1
+          kind: Deployment
+          metadata:
+            name: ${{ inputs.app.name }}
+            namespace: ${{ inputs.k8s.namespace }}
+          spec:
+            replicas: 1
+            selector:
+              matchLabels:
+                app: ${{ inputs.app.name }}
+            template:
+              metadata:
+                labels:
+                  app: ${{ inputs.app.name }}
+              spec:
+                ${runtimeClassName}
+                initContainers:
+                  - name: set-permissions
+                    image: busybox
+                    command: ["sh", "-c", "chmod -R 777 /models"]
+                    securityContext:
+                      runAsUser: 0
+                    volumeMounts:
+                      - name: model-storage
+                        mountPath: /models
+                containers:
+                  - name: ${{ inputs.app.name }}
+                    image: nhuytan/sam2-video-tracker:test
+                    imagePullPolicy: Always
+                    ports:
+                      - containerPort: 3000
+                    command: ["sh", "-c"]
+                    args: ["pnpm install && echo 'Starting Next.js server...'
&& pnpm start && tail -f /dev/null"] + resources: + limits: + ${gpu_limits} + requests: + ${gpu_limits} + env: + - name: TORCH_DEVICE + value: "cuda" + - name: SAFE_MODE + value: "true" + - name: PYTORCH_CUDA_ALLOC_CONF + value: "max_split_size_mb:128,garbage_collection_threshold:0.8" + - name: NODE_ENV + value: "production" + - name: NEXT_TELEMETRY_DISABLED + value: "1" + - name: PYTHONPATH + value: "/app" + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "compute,utility" + - name: PYTHONUNBUFFERED + value: "1" + volumeMounts: + - name: model-storage + mountPath: /models + volumes: + - name: model-storage + persistentVolumeClaim: + claimName: ${pvc_name} + --- + apiVersion: v1 + kind: Service + metadata: + name: ${{ inputs.app.name }} + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ inputs.app.name }} + ports: + - protocol: TCP + port: 3000 + targetPort: 3000 + type: ClusterIP + EOF + - name: Apply Deployment + run: kubectl apply -f sam2-deployment.yaml + - name: Wait for Pod to be Ready + run: | + echo "Waiting for pod to be ready..." + kubectl wait --for=condition=Ready pod -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --timeout=300s + create_session: + needs: + - deploy_sam2 + steps: + - name: Get SLUG + run: | + echo "slug=" >> $OUTPUTS + - name: Debug Service + Pod + run: | + echo "Checking pod + service for session connection..." + kubectl get svc ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} + kubectl get pods -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} + kubectl describe svc ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} + - name: Wait for Port 3000 to be Ready + run: | + echo "Polling HTTP response from localhost:3000 inside container..." + pod=$(kubectl get pods -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} -o jsonpath="{.items[0].metadata.name}") + for i in {1..30}; do + echo "Checking if port 3000 is responding (attempt $i)..." + if kubectl exec -n ${{ inputs.k8s.namespace }} "$pod" -- sh -c "curl -s http://localhost:3000 >/dev/null"; then + echo " Port 3000 is now responding!" + break + fi + sleep 2 + done + - name: Expose Session + uses: parallelworks/update-session + with: + remotePort: '3000' + name: ${{ sessions.session }} + slug: ${{ needs.create_session.outputs.slug }} + targetInfo: + name: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + resourceType: services + resourceName: ${{ inputs.app.name }} + keep_alive: + needs: + - create_session + steps: + - name: Keep Session Running + run: tail -f /dev/null + cleanup: | + echo "Cleaning up resources..." + kubectl delete deployment ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete service ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --ignore-not-found + if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then + kubectl delete pvc ${pvc_name} -n ${{ inputs.k8s.namespace }} --ignore-not-found + fi +'on': + execute: + inputs: + k8s: + type: group + label: Kubernetes Settings + items: + cluster: + label: Kubernetes Cluster + type: kubernetes-clusters + namespace: + label: Namespace + type: kubernetes-namespaces + clusterName: ${{ inputs.k8s.cluster }} + default: summer2025interns + volumes: + type: group + label: Storage Settings + collapsed: true + tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
+ items: + pvc: + label: Persistent Volume Claim + type: dropdown + default: New + options: + - value: Existing + label: Select Existing PVC + - value: New + label: Create New PVC + pvc_mount_path: + label: Mount Path + type: string + default: /models + pvc_existing: + label: Select PVC Name + type: kubernetes-pvc + clusterName: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + hidden: ${{ inputs.k8s.volumes.pvc !== 'Existing' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + default: sam2 + pvc_storage_size: + label: Enter PVC Size + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + default: 50Gi + pvc_storage_class: + label: Enter PVC Storage Class + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: true + tooltip: Leave blank to use the default storage class configured in the cluster. + pvc_persist: + label: Persist PVC After Completion + type: boolean + default: false + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. + pvc_name: + label: Enter PVC Name + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + default: sam2 + app: + type: group + label: App Settings + items: + name: + label: Deployment Name + type: string + default: sam2demo + resources: + type: group + label: GPU Settings + items: + gpu_type: + label: Select GPU Device + type: dropdown + default: nvidia.com/gpu + options: + - value: None + label: None + - value: nvidia.com/gpu + label: Nvidia GPU + - value: Custom + label: Custom GPU Resource Key + gpu_resource_key: + label: Custom GPU Resource Key + type: string + hidden: ${{ inputs.resources.gpu_type !== 'Custom' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + gpu_count: + label: Number of GPUs + type: number + default: 1 + min: 1 + step: 1 + hidden: ${{ inputs.resources.gpu_type === 'None' }} + ignore: ${{ .hidden }} From f77227d08a04aca2a9e2ae626fa8aef504fdf9ee Mon Sep 17 00:00:00 2001 From: nhuytan1 Date: Mon, 4 Aug 2025 11:29:16 -0500 Subject: [PATCH 2/5] Fix bug --- workflow/yamls/k8s/sam2-model/general.yaml | 159 +++- .../triton-vllm/general_removecheckgpu.yaml | 652 +++++++++++++++ .../yamls/k8s/triton-vllm/general_v0.1.yaml | 741 ++++++++++++++++++ 3 files changed, 1521 insertions(+), 31 deletions(-) create mode 100644 workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml create mode 100644 workflow/yamls/k8s/triton-vllm/general_v0.1.yaml diff --git a/workflow/yamls/k8s/sam2-model/general.yaml b/workflow/yamls/k8s/sam2-model/general.yaml index 06e03521..b2e05b1b 100644 --- a/workflow/yamls/k8s/sam2-model/general.yaml +++ b/workflow/yamls/k8s/sam2-model/general.yaml @@ -11,12 +11,14 @@ jobs: auth_k8s: steps: - name: Authenticate kubectl + early-cancel: any-job-failed run: pw kube auth ${{ inputs.k8s.cluster }} prepare_k8s_pvc: needs: - auth_k8s steps: - name: Creating New PVC YAML + early-cancel: any-job-failed if: ${{ inputs.k8s.volumes.pvc === 'New' }} run: | pvc_name="${{ inputs.k8s.volumes.pvc_name }}" @@ -38,7 +40,7 @@ jobs: storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" fi echo "${pvc_name}" > pvc_name - cat < test-pvc.yaml + cat < pvc.yaml apiVersion: v1 kind: PersistentVolumeClaim metadata: @@ -52,25 +54,23 @@ jobs: storage: ${{ 
inputs.k8s.volumes.pvc_storage_size }} ${storageClassName} EOF - cat test-pvc.yaml + cat pvc.yaml - name: Dry Run PVC + early-cancel: any-job-failed if: ${{ inputs.k8s.volumes.pvc === 'New' }} run: | echo "Performing dry run..." - kubectl apply -f test-pvc.yaml --dry-run=client - - name: Apply PVC - if: ${{ inputs.k8s.volumes.pvc === 'New' }} - run: kubectl apply -f test-pvc.yaml - cleanup: | - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - kubectl delete -f test-pvc.yaml - touch pvc.deleted - fi - deploy_sam2: + kubectl apply -f pvc.yaml --dry-run=client + - name: Dummy + early-cancel: any-job-failed + run: echo Dummy + + prepare_sam2: needs: - prepare_k8s_pvc steps: - - name: Generate Deployment YAML + - name: Create Deployment and Service YAML + early-cancel: any-job-failed run: | if [[ "${{ inputs.resources.gpu_type }}" == "Custom" ]]; then gpu_limits="${{ inputs.resources.gpu_resource_key }}: ${{ inputs.resources.gpu_count }}" @@ -86,7 +86,7 @@ jobs: else pvc_name=$(cat pvc_name) fi - cat < sam2-deployment.yaml + cat < app.yaml apiVersion: apps/v1 kind: Deployment metadata: @@ -166,16 +166,126 @@ jobs: targetPort: 3000 type: ClusterIP EOF - - name: Apply Deployment - run: kubectl apply -f sam2-deployment.yaml + apply_sam2: + needs: + - prepare_sam2 + steps: + - name: Apply PVC + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === 'New' }} + run: kubectl apply -f pvc.yaml + cleanup: | + set -x + if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then + MAX_ATTEMPTS=5 + ATTEMPT=1 + while true; do + if kubectl delete -f pvc.yaml; then + echo "PVC deleted successfully" + touch pvc.deleted + break + elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then + echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" + exit 1 + else + echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." + sleep 5 + ((ATTEMPT++)) + fi + done + fi + - name: Apply Deployment and Service + run: kubectl apply -f app.yaml + cleanup: | + set -x + MAX_ATTEMPTS=5 + ATTEMPT=1 + while true; do + if kubectl delete -f app.yaml; then + echo "Resources deleted successfully" + touch app.deleted + break + elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then + echo "Failed to delete resources after $MAX_ATTEMPTS attempts" + exit 1 + else + echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." + sleep 5 + ((ATTEMPT++)) + fi + done + + - name: Wait for Deployment to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ inputs.app.name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + + log() { + while true; do + echo + echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." + kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" + + echo; echo "[INFO] $(date) - Pods status:" + kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" + + pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [[ -n "$pod_name" ]]; then + echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." + kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" + fi + + echo "---------------------------------------------" + sleep 20 + done + } + + log & + log_pid=$! + trap "kill ${log_pid}" EXIT + set -x + kubectl wait --for=condition=available --timeout=1200s deployment/${app_name} -n ${namespace} + exit_code=$? 
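+          # Dump the final deployment state for the workflow log, then exit with the result of the kubectl wait above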
+ kubectl get deployment ${app_name} -n ${namespace} -o wide + kubectl describe deployment ${app_name} -n ${namespace} + exit ${exit_code} + - name: Wait for Pod to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ inputs.app.name }} + namespace: ${{ inputs.k8s.namespace }} run: | echo "Waiting for pod to be ready..." - kubectl wait --for=condition=Ready pod -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --timeout=300s + kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s + sam2_pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") + echo "sam2_pod=$sam2_pod" | tee -a $OUTPUTS | tee -a OUTPUTS + touch pod.running + + - name: Stream Logs + early-cancel: any-job-failed + run: | + kubectl logs -f deployment/${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} + echo Existing create_session: needs: - - deploy_sam2 + - prepare_sam2 steps: + - name: Wait until the Kubernetes deployment reaches its final stage + early-cancel: any-job-failed + run: | + while true; do + if [ -f "app.deleted" ]; then + echo "File app.deleted was detected. Exiting..." + exit 0 + elif [ -f "pod.running" ]; then + echo "Pod is ready" + break + fi + sleep 2 + done - name: Get SLUG run: | echo "slug=" >> $OUTPUTS @@ -208,19 +318,6 @@ jobs: namespace: ${{ inputs.k8s.namespace }} resourceType: services resourceName: ${{ inputs.app.name }} - keep_alive: - needs: - - create_session - steps: - - name: Keep Session Running - run: tail -f /dev/null - cleanup: | - echo "Cleaning up resources..." - kubectl delete deployment ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --ignore-not-found - kubectl delete service ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --ignore-not-found - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - kubectl delete pvc ${pvc_name} -n ${{ inputs.k8s.namespace }} --ignore-not-found - fi 'on': execute: inputs: diff --git a/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml b/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml new file mode 100644 index 00000000..1faa57ad --- /dev/null +++ b/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml @@ -0,0 +1,652 @@ +permissions: + - '*' +sessions: + session: + useTLS: false + redirect: true + useCustomDomain: true +app: + target: inputs.k8s.cluster +jobs: + auth_k8s: + steps: + - name: Authenticate kubectl + early-cancel: any-job-failed + run: pw kube auth ${{ inputs.k8s.cluster }} + prepare_k8s_pvc: + needs: + - auth_k8s + steps: + - name: Creating New PVC YAML + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === New }} + run: | + if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then + pvc_name="${{ inputs.k8s.volumes.pvc_name }}" + else + job_number=$(pwd | rev | cut -d "/" -f1 | rev) + workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) + pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc + fi + pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} + if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then + default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)' | awk '{print $1}') + if [ -z "${default_class}" ]; then + echo "ERROR: No default storage class found. 
Available storage classes:" + kubectl get storageclass -n ${{ inputs.k8s.namespace }} + exit 1 + fi + storageClassName="storageClassName: $default_class" + else + storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" + fi + echo "${pvc_name}" > pvc_name + cat < pvc.yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ${pvc_name} + namespace: ${{ inputs.k8s.namespace }} + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: ${{ inputs.k8s.volumes.pvc_storage_size }} + ${storageClassName} + EOF + cat pvc.yaml + - name: Dry Run PVC + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === New }} + run: | + echo "Performing dry run for PVC..." + kubectl apply -f pvc.yaml --dry-run=client + - name: Dummy + early-cancel: any-job-failed + run: echo Dummy + prepare_k8s_deployment: + needs: + - prepare_k8s_pvc + steps: + - name: Defining App Name + early-cancel: any-job-failed + run: | + job_number=$(pwd | rev | cut -d "/" -f1 | rev) + workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) + app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) + echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS + - name: Creating Deployment and Service YAML + early-cancel: any-job-failed + run: | + if [[ "${{ inputs.k8s.triton_resources.limits.select_gpu }}" == "Custom" ]]; then + gpu_limits="${{ inputs.k8s.triton_resources.limits.gpu_resource_key }}: ${{ inputs.k8s.triton_resources.limits.number_of_gpus }}" + else + gpu_limits="${{ inputs.k8s.triton_resources.limits.select_gpu }}: ${{ inputs.k8s.triton_resources.limits.number_of_gpus }}" + fi + gpu_check_limits="nvidia.com/gpu: 1" + + tensor_parallel_size=${{ inputs.triton_k8s.tensor_parallel_size }} + + if kubectl get runtimeclass nvidia &>/dev/null; then + runtimeClassName="runtimeClassName: nvidia" + else + runtimeClassName="" + fi + + if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then + pvc_name=${{ inputs.k8s.volumes.pvc_existing }} + else + pvc_name=$(cat pvc_name) + fi + + cat < app.yaml + --- + # Deployment for Triton Inference Server + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + template: + metadata: + labels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + spec: + runtimeClassName: nvidia + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" + + initContainers: + - name: set-permissions + image: busybox + command: ["sh", "-c", "chmod -R 777 ${{ inputs.k8s.volumes.pvc_mount_path }}"] + securityContext: + runAsUser: 0 + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + - name: init-model-repository + image: busybox + command: ["sh", "-c", "mkdir -p ${{ inputs.k8s.volumes.pvc_mount_path }}/vllm_model/1 && echo '{\"model\": \"${{ inputs.triton_k8s.model }}\", \"gpu_memory_utilization\": ${{ inputs.triton_k8s.gpu_memory_utilization }}, \"max_num_seqs\": ${{ inputs.triton_k8s.max_num_seqs }}, \"max_model_len\": ${{ inputs.triton_k8s.max_model_len }} ,\"tensor_parallel_size\": ${tensor_parallel_size}}' > ${{ inputs.k8s.volumes.pvc_mount_path }}/vllm_model/1/model.json && echo 'backend: \"vllm\"\ninstance_group [\n {\n count: 1\n kind: KIND_GPU\n }\n]' > ${{ 
inputs.k8s.volumes.pvc_mount_path }}/vllm_model/config.pbtxt"] + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + containers: + - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + image: nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3 + args: ["tritonserver", "--model-store=${{ inputs.k8s.volumes.pvc_mount_path }}", "--model-control-mode=POLL"] + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "compute,utility" + - name: HF_TOKEN + value: "${{ inputs.triton_k8s.hf_token }}" + resources: + requests: + memory: "${{ inputs.k8s.triton_resources.requests.memory }}" + cpu: "${{ inputs.k8s.triton_resources.requests.cpu }}" + limits: + memory: "${{ inputs.k8s.triton_resources.limits.memory }}" + cpu: "${{ inputs.k8s.triton_resources.limits.cpu }}" + ${gpu_limits} + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + volumes: + - name: storage + persistentVolumeClaim: + claimName: ${pvc_name} + --- + # Service for Triton + apiVersion: v1 + kind: Service + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + name: http + - protocol: TCP + port: 8001 + targetPort: 8001 + name: grpc + - protocol: TCP + port: 8002 + targetPort: 8002 + name: metrics + --- + # Deployment for Gradio UI + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + template: + metadata: + labels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + spec: + containers: + - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + image: nhuytan/gradio-ui:latest + env: + - name: UI_MAX_TOKENS + value: "${{inputs.webui_k8s.ui_max_tokens}}" + - name: UI_TEMPERATURE + value: "${{inputs.webui_k8s.ui_temperature}}" + command: + - sh + - -c + - | + python -c " + import gradio as gr + import requests + import os + + # Get default values from shell environment variables + default_max_tokens = int(os.getenv('UI_MAX_TOKENS', '150')) + default_temperature = float(os.getenv('UI_TEMPERATURE', '0.8')) + + def chat(message, history, max_tokens, temperature): + + url = 'http://${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton.${{ inputs.k8s.namespace }}.svc.cluster.local.:8000/v2/models/vllm_model/generate' + + + payload = { + 'text_input': message, + 'parameters': { + 'stream': False, + 'temperature': temperature, + 'max_tokens': max_tokens + } + } + response = requests.post(url, json=payload) + return response.json()['text_output'] + # Create Gradio components for input parameters + max_tokens_slider = gr.Slider( + minimum=1, + maximum=4096, # Or whatever is appropriate for your model + value=default_max_tokens, + step=1, + label='Max Output Tokens') + + temperature_slider = gr.Slider( + minimum=0.0, + maximum=2.0, + value=default_temperature, + step=0.1, + label='Temperature') + + # Pass the components to ChatInterface + gr.ChatInterface( + chat, + additional_inputs=[max_tokens_slider, temperature_slider], + examples=[['What is the 
capital of Vietnam?', default_max_tokens, default_temperature],[ 'Tell me a short story.',200,0.7], ['Explain AI in simple terms.', 100, 0.5]]).launch(server_port=7860)" + ports: + - containerPort: 7860 + name: ui + resources: + requests: + memory: "${{ inputs.k8s.webui_resources.requests.memory }}" + cpu: "${{ inputs.k8s.webui_resources.requests.cpu }}" + limits: + memory: "${{ inputs.k8s.webui_resources.limits.memory }}" + cpu: "${{ inputs.k8s.webui_resources.limits.cpu }}" + --- + # Service for Gradio UI + apiVersion: v1 + kind: Service + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + ports: + - protocol: TCP + port: 7860 + targetPort: 7860 + name: ui + EOF + cat app.yaml + - name: Dry Run Deployment + early-cancel: any-job-failed + run: | + echo "Performing dry run for deployment..." + kubectl apply -f app.yaml --dry-run=client + apply_k8s_deployment: + needs: + - prepare_k8s_deployment + steps: + - name: Load outputs + run: cat OUTPUTS >> $OUTPUTS + - name: Apply PVC + if: ${{ inputs.k8s.volumes.pvc === New }} + run: kubectl apply -f pvc.yaml + - name: Apply Deployment and Service + run: kubectl apply -f app.yaml + - name: Wait for Deployment to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + log() { + while true; do + echo + echo "[INFO] $(date) - Checking deployment status for ${app_name}-triton in namespace ${namespace}..." + kubectl get deployment "${app_name}-triton" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" + echo "[INFO] $(date) - Pods status:" + kubectl get pods -l app="${app_name}-triton" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" + pod_name=$(kubectl get pods -l app="${app_name}-triton" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [[ -n "$pod_name" ]]; then + echo "[INFO] $(date) - Describing pod ${pod_name}..." + kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" + echo "[INFO] $(date) - Checking initContainer logs..." + kubectl logs "${pod_name}" -n "${namespace}" -c set-permissions 2>/dev/null || echo "[WARN] No logs for set-permissions" + kubectl logs "${pod_name}" -n "${namespace}" -c init-model-repository 2>/dev/null || echo "[WARN] No logs for init-model-repository" + echo "[INFO] $(date) - Checking pod status..." + kubectl get pod "${pod_name}" -n "${namespace}" -o yaml | grep -A10 "status:" || echo "[WARN] Unable to get pod status" + fi + echo "---------------------------------------------" + sleep 10 + done + } + log & + log_pid=$! + trap "kill ${log_pid}" EXIT + set -x + kubectl wait --for=condition=available --timeout=600s deployment/${app_name}-triton -n ${namespace} + exit_code=$? + if [[ $exit_code -ne 0 ]]; then + echo "[ERROR] Deployment ${app_name}-triton failed to become available. Check pod events and initContainer logs above." + exit $exit_code + fi + kubectl get deployment ${app_name}-triton -n ${namespace} -o wide + kubectl describe deployment ${app_name}-triton -n ${namespace} + exit ${exit_code} + - name: Wait for Pod to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + echo "Waiting for pod to be ready..." 
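+          # Block (up to 10 minutes) until the Triton pod reports Ready, then record its name and signal readiness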
+ kubectl wait --for=condition=Ready pod -l app=${app_name}-triton -n ${namespace} --timeout=600s + pod=$(kubectl get pods -n ${namespace} -l app=${app_name}-triton --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") + echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS + touch pod.running + - name: Stream Triton Logs + early-cancel: any-job-failed + run: | + kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }}-triton -n ${{ inputs.k8s.namespace }} & + triton_stream_pid=$? + echo ${triton_stream_pid} > triton_stream.pid + - name: Stream WebUI Logs + early-cancel: any-job-failed + run: | + kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }}-ui -n ${{ inputs.k8s.namespace }} & + webui_stream_pid=$? + echo ${webui_stream_pid} > webui_stream.pid + create_k8s_session: + needs: + - apply_k8s_deployment + steps: + - name: Wait until the Kubernetes deployment reaches its final stage + early-cancel: any-job-failed + run: | + while true; do + if [ -f "app.deleted" ]; then + echo "File app.deleted was detected. Exiting..." + exit 0 + elif [ -f "pod.running" ]; then + echo "Pod is ready" + break + fi + sleep 2 + done + - name: Get Service Name + early-cancel: any-job-failed + run: | + source OUTPUTS + echo "service_name=${app_name}-ui" | tee -a $OUTPUTS + - name: Expose port + early-cancel: any-job-failed + uses: parallelworks/update-session + with: + remotePort: '7860' + name: ${{ sessions.session }} + targetInfo: + name: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + resourceType: services + resourceName: ${{ needs.create_k8s_session.outputs.service_name }} + keep_alive: + needs: + - create_k8s_session + steps: + - name: Keep Session Running + early-cancel: any-job-failed + run: tail -f /dev/null + cleanup: | + echo "Cleaning up resources for keep_alive job..." + source OUTPUTS + kubectl delete deployment ${app_name}-triton -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete service ${app_name}-triton -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete deployment ${app_name}-ui -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete service ${app_name}-ui -n ${{ inputs.k8s.namespace }} --ignore-not-found +'on': + execute: + inputs: + k8s: + type: group + label: Kubernetes Settings + items: + cluster: + label: Kubernetes cluster + type: kubernetes-clusters + namespace: + label: Namespace + type: kubernetes-namespaces + clusterName: ${{ inputs.k8s.cluster }} + volumes: + type: group + label: Triton Volumes + collapsed: true + tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
+ items: + pvc: + label: Persistent Volume Claim + type: dropdown + default: Existing + options: + - value: Existing + label: Select Existing PVC + - value: New + label: Create New PVC + pvc_mount_path: + label: Mount Path + type: string + default: /models + pvc_existing: + label: Select PVC Name + type: kubernetes-pvc + clusterName: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + hidden: ${{ inputs.k8s.volumes.pvc !== Existing }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + pvc_storage_size: + label: Enter PVC Size + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + default: 100Gi + pvc_storage_class: + label: Enter PVC Storage Class + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: true + tooltip: Leave blank to use the default storage class configured in the cluster. + pvc_persist: + label: Persist PVC After Completion + type: boolean + default: false + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. + pvc_name: + label: Enter PVC Name + type: string + hidden: ${{ inputs.k8s.volumes.pvc_persist === false || inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + triton_resources: + type: group + label: Triton Resources + collapsed: true + items: + requests: + type: group + label: Requests + items: + memory: + label: Memory + type: string + default: 8Gi + tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 8Gi). + cpu: + label: CPU + type: string + default: '4' + tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 4, 100m). + limits: + type: group + label: Limits + items: + memory: + label: Memory + type: string + default: 16Gi + tooltip: Set the maximum memory the pod can use (e.g., 8Gi, 16Gi). + cpu: + label: CPU + type: string + default: '8' + tooltip: Set the maximum CPU the pod can use (e.g., 4, 8, 500m). + select_gpu: + label: Select GPU Device + type: dropdown + tooltip: Choose the type of GPU device for the deployment, if needed. + default: nvidia.com/gpu + options: + - value: nvidia.com/gpu + label: Nvidia GPU + - value: amd.com/gpu + label: AMD GPU + - value: Custom + label: Custom GPU Resource Key + gpu_resource_key: + label: Custom GPU Resource Key + type: string + hidden: ${{ inputs.k8s.triton_resources.limits.select_gpu !== Custom }} + ignore: ${{ .hidden }} + tooltip: Specify a custom GPU resource key for Kubernetes. + number_of_gpus: + label: Number of GPUs + type: number + step: 1 + default: 1 + min: 1 + tooltip: Specify the number of GPUs to allocate for the deployment. + webui_resources: + type: group + label: WebUI Resources + collapsed: true + items: + requests: + type: group + label: Requests + items: + memory: + label: Memory + type: string + default: 2Gi + tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 2Gi). + cpu: + label: CPU + type: string + default: '2' + tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 2, 100m). + limits: + type: group + label: Limits + items: + memory: + label: Memory + type: string + default: 4Gi + tooltip: Set the maximum memory the pod can use (e.g., 2Gi, 4Gi). + cpu: + label: CPU + type: string + default: '4' + tooltip: Set the maximum CPU the pod can use (e.g., 2, 4, 500m). 
+ triton_k8s: + type: group + label: Triton Settings + collapsed: true + items: + model: + label: Model Name + type: string + default: meta-llama/Meta-Llama-3.1-8B-Instruct + tooltip: Specify the Hugging Face model to use with vLLM (e.g., meta-llama/Meta-Llama-3.1-8B-Instruct). + hf_token: + label: Hugging Face Token (hf_...) + type: password + optional: false + tooltip: Your Hugging Face API token for accessing private or gated models (e.g., Llama). + gpu_memory_utilization: + label: GPU Memory Utilization + type: number + default: 0.9 + min: 0.1 + max: 0.9 + tooltip: Specify the fraction of GPU memory to utilize (0.1 to 0.9). + max_num_seqs: + label: Max Number of Sequences + type: number + default: 4 + min: 1 + tooltip: Specify the maximum number of sequences in a batch.(concurrent requests processed per batch in vLLM) + max_model_len: + label: Max Model Length + type: number + default: 1024 + min: 512 + tooltip: Maximum model length for sequences.(Define max token length for input sequences, limiting context size and KV cache memory) + tensor_parallel_size: + label: Tensor Parallel Size + type: number + default: 1 + min: 1 + tooltip: Specify the number of GPUs for tensor parallelism. + webui_k8s: + type: group + label: WebUI Settings + collapsed: true + items: + image: + label: WebUI Image + type: string + default: python:3.10-slim + image_port: + label: WebUI Port + type: number + default: 7860 + ui_max_tokens: + label: Max Output Tokens (UI) + type: number + default: 150 + min: 1 + max: 4096 + tooltip: Maximum number of tokens the model should generate in a single response for the WebUI. + ui_temperature: + label: Temperature (UI) + type: number + default: 0.8 + min: 0 + max: 2 + step: 0.1 + tooltip: Controls the randomness of the output. Higher values (e.g., 0.8-1.0) make output more creative, lower values (e.g., 0.2) make it more deterministic. diff --git a/workflow/yamls/k8s/triton-vllm/general_v0.1.yaml b/workflow/yamls/k8s/triton-vllm/general_v0.1.yaml new file mode 100644 index 00000000..2f4a704f --- /dev/null +++ b/workflow/yamls/k8s/triton-vllm/general_v0.1.yaml @@ -0,0 +1,741 @@ +permissions: + - '*' +sessions: + session: + useTLS: false + redirect: true + useCustomDomain: true +app: + target: inputs.k8s.cluster +jobs: + auth_k8s: + steps: + - name: Authenticate kubectl + early-cancel: any-job-failed + run: pw kube auth ${{ inputs.k8s.cluster }} + prepare_k8s_pvc: + needs: + - auth_k8s + steps: + - name: Creating New PVC YAML + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === New }} + run: | + if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then + pvc_name="${{ inputs.k8s.volumes.pvc_name }}" + else + job_number=$(pwd | rev | cut -d "/" -f1 | rev) + workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) + pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc + fi + pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} + if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then + default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)' | awk '{print $1}') + if [ -z "${default_class}" ]; then + echo "ERROR: No default storage class found. 
Available storage classes:" + kubectl get storageclass -n ${{ inputs.k8s.namespace }} + exit 1 + fi + storageClassName="storageClassName: $default_class" + else + storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" + fi + echo "${pvc_name}" > pvc_name + cat < pvc.yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ${pvc_name} + namespace: ${{ inputs.k8s.namespace }} + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: ${{ inputs.k8s.volumes.pvc_storage_size }} + ${storageClassName} + EOF + cat pvc.yaml + - name: Dry Run PVC + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === New }} + run: | + echo "Performing dry run for PVC..." + kubectl apply -f pvc.yaml --dry-run=client + - name: Dummy + early-cancel: any-job-failed + run: echo Dummy + prepare_k8s_deployment: + needs: + - prepare_k8s_pvc + steps: + - name: Defining App Name + early-cancel: any-job-failed + run: | + job_number=$(pwd | rev | cut -d "/" -f1 | rev) + workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) + app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) + echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS + - name: Creating Deployment and Service YAML + early-cancel: any-job-failed + run: | + if [[ "${{ inputs.k8s.triton_resources.limits.select_gpu }}" == "Custom" ]]; then + gpu_limits="${{ inputs.k8s.triton_resources.limits.gpu_resource_key }}: ${{ inputs.k8s.triton_resources.limits.number_of_gpus }}" + else + gpu_limits="${{ inputs.k8s.triton_resources.limits.select_gpu }}: ${{ inputs.k8s.triton_resources.limits.number_of_gpus }}" + fi + gpu_check_limits="nvidia.com/gpu: 1" + + tensor_parallel_size=${{ inputs.triton_k8s.tensor_parallel_size }} + + if kubectl get runtimeclass nvidia &>/dev/null; then + runtimeClassName="runtimeClassName: nvidia" + else + runtimeClassName="" + fi + + if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then + pvc_name=${{ inputs.k8s.volumes.pvc_existing }} + else + pvc_name=$(cat pvc_name) + fi + + cat < app.yaml + --- + # Deployment for Triton Inference Server + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + template: + metadata: + labels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + spec: + runtimeClassName: nvidia + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" + + initContainers: + - name: check-gpu-memory + image: nhuytan/gpu-python:3.10-cuda12.1 + resources: + limits: + ${gpu_check_limits} + env: + - name: HF_TOKEN + value: "${{ inputs.triton_k8s.hf_token }}" + - name: MODEL_NAME + value: "${{ inputs.triton_k8s.model }}" + - name: MAX_MODEL_LEN + value: "${{ inputs.triton_k8s.max_model_len }}" + - name: MAX_NUM_SEQS + value: "${{ inputs.triton_k8s.max_num_seqs }}" + - name: GPU_MEMORY_UTILIZATION + value: "${{ inputs.triton_k8s.gpu_memory_utilization }}" + command: + - sh + - -c + - | + cat << 'EOF' > /tmp/check_gpu.py + import os + import subprocess + import sys + from transformers import AutoConfig, AutoModelForCausalLM + from accelerate.utils import calculate_maximum_sizes + import torch + + def main(): + model_name = os.getenv("MODEL_NAME") + hf_token = os.getenv("HF_TOKEN") + max_model_len = 
int(os.getenv("MAX_MODEL_LEN", "2048")) + max_num_seqs = int(os.getenv("MAX_NUM_SEQS", "4")) + gpu_mem_util = float(os.getenv("GPU_MEMORY_UTILIZATION", "0.9")) + + print(f"Checking GPU memory for model: {model_name}") + + try: + config = AutoConfig.from_pretrained(model_name, token=hf_token) + model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16) + total_bytes, _ = calculate_maximum_sizes(model) + model_weights_gb = total_bytes / (1024**3) + del model + torch.cuda.empty_cache() + print(f"Fetched config and estimated model size") + except Exception as e: + print(f"Failed to get config or model: {e}") + sys.exit(1) + + head_dim = config.hidden_size // config.num_attention_heads + kv_heads = getattr(config, "num_key_value_heads", config.num_attention_heads) + kv_cache_bytes = max_model_len * max_num_seqs * 2 * config.num_hidden_layers * kv_heads * head_dim * 2 + kv_cache_gb = kv_cache_bytes / (1024**3) + + activation_gb = max_model_len * max_num_seqs * (18 * config.hidden_size + 4 * config.intermediate_size) * 2 / (1024**3) + + overhead_gb = 1.0 + + total_needed_gb = (model_weights_gb + kv_cache_gb + activation_gb + overhead_gb) / gpu_mem_util + + print(f"Estimated model size: {model_weights_gb:.1f} GB") + print(f"Estimated KV cache: {kv_cache_gb:.1f} GB") + print(f"Estimated activations: {activation_gb:.1f} GB") + print(f"Total estimated needed: {total_needed_gb:.1f} GB (after util factor {gpu_mem_util})") + + try: + output = subprocess.check_output(['nvidia-smi', '--query-gpu=memory.total', '--format=csv,nounits,noheader']) + gpu_total_gb = float(output.decode().strip().split('\n')[0]) / 1024 + print(f"Available GPU memory: {gpu_total_gb:.1f} GB") + except Exception as e: + print(f"Failed to run nvidia-smi: {e}") + sys.exit(1) + + if total_needed_gb > gpu_total_gb: + scale = gpu_total_gb / total_needed_gb + suggested_max_model_len = max(int(max_model_len * scale * 0.5), 512) + suggested_max_num_seqs = max(int(max_num_seqs * scale * 0.5), 1) + print(f"Not enough memory. 
Need ~{total_needed_gb:.1f} GB, but only have {gpu_total_gb:.1f} GB.") + print(f"Suggest lowering: max_model_len → {suggested_max_model_len}, max_num_seqs → {suggested_max_num_seqs}") + sys.exit(1) + + print("Enough GPU memory, ready to deploy.") + + if __name__ == '__main__': + main() + EOF + python3.10 /tmp/check_gpu.py || { echo "Script failed"; exit 1; } + - name: set-permissions + image: busybox + command: ["sh", "-c", "chmod -R 777 ${{ inputs.k8s.volumes.pvc_mount_path }}"] + securityContext: + runAsUser: 0 + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + - name: init-model-repository + image: busybox + command: ["sh", "-c", "mkdir -p ${{ inputs.k8s.volumes.pvc_mount_path }}/vllm_model/1 && echo '{\"model\": \"${{ inputs.triton_k8s.model }}\", \"gpu_memory_utilization\": ${{ inputs.triton_k8s.gpu_memory_utilization }}, \"max_num_seqs\": ${{ inputs.triton_k8s.max_num_seqs }}, \"max_model_len\": ${{ inputs.triton_k8s.max_model_len }} ,\"tensor_parallel_size\": ${tensor_parallel_size}}' > ${{ inputs.k8s.volumes.pvc_mount_path }}/vllm_model/1/model.json && echo 'backend: \"vllm\"\ninstance_group [\n {\n count: 1\n kind: KIND_GPU\n }\n]' > ${{ inputs.k8s.volumes.pvc_mount_path }}/vllm_model/config.pbtxt"] + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + containers: + - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + image: nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3 + args: ["tritonserver", "--model-store=${{ inputs.k8s.volumes.pvc_mount_path }}", "--model-control-mode=POLL"] + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "compute,utility" + - name: HF_TOKEN + value: "${{ inputs.triton_k8s.hf_token }}" + resources: + requests: + memory: "${{ inputs.k8s.triton_resources.requests.memory }}" + cpu: "${{ inputs.k8s.triton_resources.requests.cpu }}" + limits: + memory: "${{ inputs.k8s.triton_resources.limits.memory }}" + cpu: "${{ inputs.k8s.triton_resources.limits.cpu }}" + ${gpu_limits} + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + volumes: + - name: storage + persistentVolumeClaim: + claimName: ${pvc_name} + --- + # Service for Triton + apiVersion: v1 + kind: Service + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + name: http + - protocol: TCP + port: 8001 + targetPort: 8001 + name: grpc + - protocol: TCP + port: 8002 + targetPort: 8002 + name: metrics + --- + # Deployment for Gradio UI + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + template: + metadata: + labels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + spec: + containers: + - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + image: nhuytan/gradio-ui:latest + env: + - name: UI_MAX_TOKENS + value: "${{inputs.webui_k8s.ui_max_tokens}}" + - name: UI_TEMPERATURE + value: "${{inputs.webui_k8s.ui_temperature}}" + command: + - sh + - -c + - | + 
python -c " + import gradio as gr + import requests + import os + + # Get default values from shell environment variables + default_max_tokens = int(os.getenv('UI_MAX_TOKENS', '150')) + default_temperature = float(os.getenv('UI_TEMPERATURE', '0.8')) + + def chat(message, history, max_tokens, temperature): + + url = 'http://${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton.${{ inputs.k8s.namespace }}.svc.cluster.local.:8000/v2/models/vllm_model/generate' + + + payload = { + 'text_input': message, + 'parameters': { + 'stream': False, + 'temperature': temperature, + 'max_tokens': max_tokens + } + } + response = requests.post(url, json=payload) + return response.json()['text_output'] + # Create Gradio components for input parameters + max_tokens_slider = gr.Slider( + minimum=1, + maximum=4096, # Or whatever is appropriate for your model + value=default_max_tokens, + step=1, + label='Max Output Tokens') + + temperature_slider = gr.Slider( + minimum=0.0, + maximum=2.0, + value=default_temperature, + step=0.1, + label='Temperature') + + # Pass the components to ChatInterface + gr.ChatInterface( + chat, + additional_inputs=[max_tokens_slider, temperature_slider], + examples=[['What is the capital of Vietnam?', default_max_tokens, default_temperature],[ 'Tell me a short story.',200,0.7], ['Explain AI in simple terms.', 100, 0.5]]).launch(server_port=7860)" + ports: + - containerPort: 7860 + name: ui + resources: + requests: + memory: "${{ inputs.k8s.webui_resources.requests.memory }}" + cpu: "${{ inputs.k8s.webui_resources.requests.cpu }}" + limits: + memory: "${{ inputs.k8s.webui_resources.limits.memory }}" + cpu: "${{ inputs.k8s.webui_resources.limits.cpu }}" + --- + # Service for Gradio UI + apiVersion: v1 + kind: Service + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + ports: + - protocol: TCP + port: 7860 + targetPort: 7860 + name: ui + EOF + cat app.yaml + - name: Dry Run Deployment + early-cancel: any-job-failed + run: | + echo "Performing dry run for deployment..." + kubectl apply -f app.yaml --dry-run=client + apply_k8s_deployment: + needs: + - prepare_k8s_deployment + steps: + - name: Load outputs + run: cat OUTPUTS >> $OUTPUTS + - name: Apply PVC + if: ${{ inputs.k8s.volumes.pvc === New }} + run: kubectl apply -f pvc.yaml + - name: Apply Deployment and Service + run: kubectl apply -f app.yaml + cleanup: | + kubectl delete -f app.yamltouch app.deleted + - name: Wait for Deployment to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + log() { + while true; do + echo + echo "[INFO] $(date) - Checking deployment status for ${app_name}-triton in namespace ${namespace}..." + kubectl get deployment "${app_name}-triton" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" + echo "[INFO] $(date) - Pods status:" + kubectl get pods -l app="${app_name}-triton" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" + pod_name=$(kubectl get pods -l app="${app_name}-triton" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [[ -n "$pod_name" ]]; then + echo "[INFO] $(date) - Describing pod ${pod_name}..." + kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" + echo "[INFO] $(date) - Checking initContainer logs..." 
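+              # Show logs from the setup initContainers to help diagnose a stalled rollout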
+ kubectl logs "${pod_name}" -n "${namespace}" -c set-permissions 2>/dev/null || echo "[WARN] No logs for set-permissions" + kubectl logs "${pod_name}" -n "${namespace}" -c init-model-repository 2>/dev/null || echo "[WARN] No logs for init-model-repository" + echo "[INFO] $(date) - Checking pod status..." + kubectl get pod "${pod_name}" -n "${namespace}" -o yaml | grep -A10 "status:" || echo "[WARN] Unable to get pod status" + fi + echo "---------------------------------------------" + sleep 10 + done + } + log & + log_pid=$! + trap "kill ${log_pid}" EXIT + set -x + kubectl wait --for=condition=available --timeout=600s deployment/${app_name}-triton -n ${namespace} + exit_code=$? + if [[ $exit_code -ne 0 ]]; then + echo "[ERROR] Deployment ${app_name}-triton failed to become available. Check pod events and initContainer logs above." + exit $exit_code + fi + kubectl get deployment ${app_name}-triton -n ${namespace} -o wide + kubectl describe deployment ${app_name}-triton -n ${namespace} + exit ${exit_code} + - name: Wait for Pod to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + echo "Waiting for pod to be ready..." + kubectl wait --for=condition=Ready pod -l app=${app_name}-triton -n ${namespace} --timeout=600s + pod=$(kubectl get pods -n ${namespace} -l app=${app_name}-triton --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") + echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS + touch pod.running + - name: Stream Triton Logs + early-cancel: any-job-failed + run: | + kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }}-triton -n ${{ inputs.k8s.namespace }} & + triton_stream_pid=$? + echo ${triton_stream_pid} > triton_stream.pid + - name: Stream WebUI Logs + early-cancel: any-job-failed + run: | + kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }}-ui -n ${{ inputs.k8s.namespace }} & + webui_stream_pid=$? + echo ${webui_stream_pid} > webui_stream.pid + create_k8s_session: + needs: + - apply_k8s_deployment + steps: + - name: Wait until the Kubernetes deployment reaches its final stage + early-cancel: any-job-failed + run: | + while true; do + if [ -f "app.deleted" ]; then + echo "File app.deleted was detected. Exiting..." + exit 0 + elif [ -f "pod.running" ]; then + echo "Pod is ready" + break + fi + sleep 2 + done + - name: Get Service Name + early-cancel: any-job-failed + run: | + source OUTPUTS + echo "service_name=${app_name}-ui" | tee -a $OUTPUTS + - name: Expose port + early-cancel: any-job-failed + uses: parallelworks/update-session + with: + remotePort: '7860' + name: ${{ sessions.session }} + targetInfo: + name: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + resourceType: services + resourceName: ${{ needs.create_k8s_session.outputs.service_name }} + keep_alive: + needs: + - create_k8s_session + steps: + - name: Keep Session Running + early-cancel: any-job-failed + run: tail -f /dev/null + cleanup: | + echo "Cleaning up resources for keep_alive job..." 
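+          # OUTPUTS carries app_name from the deployment job; source it so the deletes target the right resources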
+ source OUTPUTS + kubectl delete deployment ${app_name}-triton -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete service ${app_name}-triton -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete deployment ${app_name}-ui -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete service ${app_name}-ui -n ${{ inputs.k8s.namespace }} --ignore-not-found +'on': + execute: + inputs: + k8s: + type: group + label: Kubernetes Settings + items: + cluster: + label: Kubernetes cluster + type: kubernetes-clusters + namespace: + label: Namespace + type: kubernetes-namespaces + clusterName: ${{ inputs.k8s.cluster }} + volumes: + type: group + label: Triton Volumes + collapsed: true + tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. + items: + pvc: + label: Persistent Volume Claim + type: dropdown + default: Existing + options: + - value: Existing + label: Select Existing PVC + - value: New + label: Create New PVC + pvc_mount_path: + label: Mount Path + type: string + default: /models + pvc_existing: + label: Select PVC Name + type: kubernetes-pvc + clusterName: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + hidden: ${{ inputs.k8s.volumes.pvc !== Existing }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + pvc_storage_size: + label: Enter PVC Size + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + default: 100Gi + pvc_storage_class: + label: Enter PVC Storage Class + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: true + tooltip: Leave blank to use the default storage class configured in the cluster. + pvc_persist: + label: Persist PVC After Completion + type: boolean + default: false + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. + pvc_name: + label: Enter PVC Name + type: string + hidden: ${{ inputs.k8s.volumes.pvc_persist === false || inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + triton_resources: + type: group + label: Triton Resources + collapsed: true + items: + requests: + type: group + label: Requests + items: + memory: + label: Memory + type: string + default: 8Gi + tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 8Gi). + cpu: + label: CPU + type: string + default: '4' + tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 4, 100m). + limits: + type: group + label: Limits + items: + memory: + label: Memory + type: string + default: 16Gi + tooltip: Set the maximum memory the pod can use (e.g., 8Gi, 16Gi). + cpu: + label: CPU + type: string + default: '8' + tooltip: Set the maximum CPU the pod can use (e.g., 4, 8, 500m). + select_gpu: + label: Select GPU Device + type: dropdown + tooltip: Choose the type of GPU device for the deployment, if needed. + default: nvidia.com/gpu + options: + - value: nvidia.com/gpu + label: Nvidia GPU + - value: amd.com/gpu + label: AMD GPU + - value: Custom + label: Custom GPU Resource Key + gpu_resource_key: + label: Custom GPU Resource Key + type: string + hidden: ${{ inputs.k8s.triton_resources.limits.select_gpu !== Custom }} + ignore: ${{ .hidden }} + tooltip: Specify a custom GPU resource key for Kubernetes. 
+ number_of_gpus: + label: Number of GPUs + type: number + step: 1 + default: 1 + min: 1 + tooltip: Specify the number of GPUs to allocate for the deployment. + webui_resources: + type: group + label: WebUI Resources + collapsed: true + items: + requests: + type: group + label: Requests + items: + memory: + label: Memory + type: string + default: 2Gi + tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 2Gi). + cpu: + label: CPU + type: string + default: '2' + tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 2, 100m). + limits: + type: group + label: Limits + items: + memory: + label: Memory + type: string + default: 4Gi + tooltip: Set the maximum memory the pod can use (e.g., 2Gi, 4Gi). + cpu: + label: CPU + type: string + default: '4' + tooltip: Set the maximum CPU the pod can use (e.g., 2, 4, 500m). + triton_k8s: + type: group + label: Triton Settings + collapsed: true + items: + model: + label: Model Name + type: string + default: meta-llama/Meta-Llama-3.1-8B-Instruct + tooltip: Specify the Hugging Face model to use with vLLM (e.g., meta-llama/Meta-Llama-3.1-8B-Instruct). + hf_token: + label: Hugging Face Token (hf_...) + type: password + optional: false + tooltip: Your Hugging Face API token for accessing private or gated models (e.g., Llama). + gpu_memory_utilization: + label: GPU Memory Utilization + type: number + default: 0.9 + min: 0.1 + max: 0.9 + tooltip: Specify the fraction of GPU memory to utilize (0.1 to 0.9). + max_num_seqs: + label: Max Number of Sequences + type: number + default: 4 + min: 1 + tooltip: Specify the maximum number of sequences in a batch.(concurrent requests processed per batch in vLLM) + max_model_len: + label: Max Model Length + type: number + default: 1024 + min: 512 + tooltip: Maximum model length for sequences.(Define max token length for input sequences, limiting context size and KV cache memory) + tensor_parallel_size: + label: Tensor Parallel Size + type: number + default: 1 + min: 1 + tooltip: Specify the number of GPUs for tensor parallelism. + webui_k8s: + type: group + label: WebUI Settings + collapsed: true + items: + image: + label: WebUI Image + type: string + default: python:3.10-slim + image_port: + label: WebUI Port + type: number + default: 7860 + ui_max_tokens: + label: Max Output Tokens (UI) + type: number + default: 150 + min: 1 + max: 4096 + tooltip: Maximum number of tokens the model should generate in a single response for the WebUI. + ui_temperature: + label: Temperature (UI) + type: number + default: 0.8 + min: 0 + max: 2 + step: 0.1 + tooltip: Controls the randomness of the output. Higher values (e.g., 0.8-1.0) make output more creative, lower values (e.g., 0.2) make it more deterministic. 
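+# Note: the vLLM options above (gpu_memory_utilization, max_num_seqs, max_model_len,
+# tensor_parallel_size) are engine arguments that Triton's vLLM backend typically reads
+# from a model.json placed in the model repository mounted at pvc_mount_path. Assuming
+# that standard layout (the namespace, pod, container, and model names below are
+# placeholders, not values produced by this workflow), the generated file can be
+# inspected on the running pod with:
+#
+#   kubectl exec -n <namespace> <triton-pod> -c <triton-container> -- \
+#     cat <pvc_mount_path>/<model_name>/1/model.json
+#
+# Verify the exact repository layout against the tritonserver vLLM image used here.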
From 88a7eff2b9ee86146ea5c503225da71d4dae9e67 Mon Sep 17 00:00:00 2001 From: nhuytan1 Date: Mon, 11 Aug 2025 12:05:47 -0500 Subject: [PATCH 3/5] add dry run --- workflow/yamls/k8s/ollama/general.yaml | 540 ++++++++++++++++++ workflow/yamls/k8s/sam2-model/general.yaml | 7 +- .../triton-vllm/general_removecheckgpu.yaml | 50 +- 3 files changed, 595 insertions(+), 2 deletions(-) create mode 100644 workflow/yamls/k8s/ollama/general.yaml diff --git a/workflow/yamls/k8s/ollama/general.yaml b/workflow/yamls/k8s/ollama/general.yaml new file mode 100644 index 00000000..ab0de1a4 --- /dev/null +++ b/workflow/yamls/k8s/ollama/general.yaml @@ -0,0 +1,540 @@ +permissions: + - '*' +sessions: + session: + useTLS: false + redirect: true + useCustomDomain: true +app: + target: inputs.k8s.cluster +jobs: + auth_k8s: + steps: + - name: Authenticate kubectl + early-cancel: any-job-failed + run: pw kube auth ${{ inputs.k8s.cluster }} + prepare_k8s_pvc: + needs: + - auth_k8s + steps: + - name: Creating New PVC YAML + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === New }} + run: | + if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then + pvc_name="${{ inputs.k8s.volumes.pvc_name }}" + else + job_number=$(pwd | rev | cut -d "/" -f1 | rev) + workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) + pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc + fi + pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} + if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then + default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') + if [ $? -ne 0 ]; then + echo "WARNING: Could not obtain default storageClass with command:" + echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" + echo " You might need to provide a storage class input" + elif [ -z "${default_class}" ]; then + echo "ERROR: No default storage class found. You must specify one explicitly." + exit 1 + fi + else + storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" + fi + echo "${pvc_name}" > pvc_name + cat < pvc.yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ${pvc_name} + namespace: ${{ inputs.k8s.namespace }} + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: ${{ inputs.k8s.volumes.pvc_storage_size }} + ${storageClassName} + EOF + cat pvc.yaml + - name: Dry Run PVC + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === New }} + run: | + echo "Performing dry run..." 
+ kubectl apply -f pvc.yaml --dry-run=client + - name: Dummy + early-cancel: any-job-failed + run: echo Dummy + prepare_k8s_deployment: + needs: + - prepare_k8s_pvc + steps: + - name: Defining App Name + early-cancel: any-job-failed + run: | + job_number=$(pwd | rev | cut -d "/" -f1 | rev) + workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) + app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) + echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS + - name: Creating Deployment and Service YAML + early-cancel: any-job-failed + run: | + if [[ "${{ inputs.k8s.ollama_resources.limits.select_gpu }}" == "Custom" ]]; then + gpu_limits="${{ inputs.k8s.ollama_resources.limits.gpu_resource_key }}: ${{ inputs.k8s.ollama_resources.limits.number_of_gpus }}" + elif [[ "${{ inputs.k8s.ollama_resources.limits.select_gpu }}" != "None" ]]; then + gpu_limits="${{ inputs.k8s.ollama_resources.limits.select_gpu }}: ${{ inputs.k8s.ollama_resources.limits.number_of_gpus }}" + fi + # Attach RuntimeClass if it's available and using NVIDIA + if kubectl get runtimeclass nvidia &>/dev/null; then + echo "nvidia RuntimeClass is available" + runtimeClassName="runtimeClassName: nvidia" + fi + + if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then + pvc_name=${{ inputs.k8s.volumes.pvc_existing }} + else + pvc_name=$(cat pvc_name) + fi + + cat < app.yaml + --- + # Deployment for ollama + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} + template: + metadata: + labels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} + spec: + runtimeClassName: nvidia + initContainers: + - name: set-permissions + image: busybox + command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] + securityContext: + runAsUser: 0 + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + containers: + - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} + image: ${{ inputs.ollama_k8s.image }} + ports: + - containerPort: ${{ inputs.ollama_k8s.image_port }} + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "compute,utility" + resources: + requests: + memory: "${{ inputs.k8s.ollama_resources.requests.memory }}" + cpu: "${{ inputs.k8s.ollama_resources.requests.cpu }}" + limits: + memory: "${{ inputs.k8s.ollama_resources.limits.memory }}" + cpu: "${{ inputs.k8s.ollama_resources.limits.cpu }}" + ${gpu_limits} + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + volumes: + - name: storage + persistentVolumeClaim: + claimName: ${pvc_name} # Assumes PVC name is provided as an input + --- + # Service for ollama + apiVersion: v1 + kind: Service + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} + ports: + - protocol: TCP + port: ${{ inputs.ollama_k8s.image_port }} + targetPort: ${{ inputs.ollama_k8s.image_port }} + + --- + # Deployment for openwebui + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: 
+ app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui + template: + metadata: + labels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui + spec: + containers: + - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui + image: ${{ inputs.openwebui_k8s.image }} + ports: + - containerPort: ${{ inputs.openwebui_k8s.image_port }} + env: + - name: OLLAMA_BASE_URL + value: "http://${{ needs.prepare_k8s_deployment.outputs.app_name }}.${{ inputs.k8s.namespace }}.svc.cluster.local.:${{ inputs.ollama_k8s.image_port }}" + - name: WEBUI_AUTH + value: "False" + resources: + requests: + memory: "${{ inputs.k8s.openwebui_resources.requests.memory }}" + cpu: "${{ inputs.k8s.openwebui_resources.requests.cpu }}" + limits: + memory: "${{ inputs.k8s.openwebui_resources.limits.memory }}" + cpu: "${{ inputs.k8s.openwebui_resources.limits.cpu }}" + + --- + # Service for openwebui + apiVersion: v1 + kind: Service + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui + ports: + - protocol: TCP + port: ${{ inputs.openwebui_k8s.image_port }} + targetPort: ${{ inputs.openwebui_k8s.image_port }} + EOF + - name: Dry Run Deployment + early-cancel: any-job-failed + run: | + echo "Performing dry run..." + kubectl apply -f app.yaml --dry-run=client + apply_k8s_deployment: + needs: + - prepare_k8s_deployment + steps: + - name: Load outputs + early-cancel: any-job-failed + run: cat OUTPUTS >> $OUTPUTS + - name: Apply PVC + if: ${{ inputs.k8s.volumes.pvc === New }} + run: kubectl apply -f pvc.yaml + cleanup: | + if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then + kubectl delete -f pvc.yaml + touch pvc.deleted + fi + - name: Apply Deployment and Service + run: kubectl apply -f app.yaml + cleanup: | + kubectl delete -f app.yaml + touch app.deleted + - name: Wait for Deployment to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + + log() { + while true; do + echo + echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." + kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" + + echo; echo "[INFO] $(date) - Pods status:" + kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" + + pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [[ -n "$pod_name" ]]; then + echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." + kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" + fi + + echo "---------------------------------------------" + sleep 10 + done + } + + log & + log_pid=$! + trap "kill ${log_pid}" EXIT + set -x + kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} + exit_code=$? + kubectl get deployment ${app_name} -n ${namespace} -o wide + kubectl describe deployment ${app_name} -n ${namespace} + exit ${exit_code} + - name: Wait for Pod to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + echo "Waiting for pod to be ready..." 
+ kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s + pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") + echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS + touch pod.running + - name: Pull Ollama Models + early-cancel: any-job-failed + env: + pod_name: ${{ needs.apply_k8s_deployment.outputs.pod }} + namespace: ${{ inputs.k8s.namespace }} + run: | + set -x + kubectl -n $namespace exec $pod_name -- /bin/sh -c "ollama pull gpt-oss:20b " & + - name: Stream Ollama Logs + run: | + kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} & + ollama_stream_pid=$? + echo ${ollama_stream_pid} > ollama_stream.pid + cleanup: kill $(cat ollama_stream.pid) + - name: Stream OpenWebUI Logs + early-cancel: any-job-failed + run: | + kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }}-openwebui -n ${{ inputs.k8s.namespace }} + create_k8s_session: + needs: + - prepare_k8s_deployment + steps: + - name: Wait until the Kubernetes deployment reaches its final stage + early-cancel: any-job-failed + run: | + while true; do + if [ -f "app.deleted" ]; then + echo "File app.deleted was detected. Exiting..." + exit 0 + elif [ -f "pod.running" ]; then + echo "Pod is ready" + break + fi + sleep 2 + done + - name: Get Service Name + early-cancel: any-job-failed + run: | + source OUTPUTS + echo "service_name=${app_name}-lb" | tee -a $OUTPUTS + - name: Expose port + early-cancel: any-job-failed + uses: parallelworks/update-session + with: + remotePort: ${{ inputs.openwebui_k8s.image_port }} + name: ${{ sessions.session }} + targetInfo: + name: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + resourceType: services + resourceName: ${{ needs.create_k8s_session.outputs.service_name }} +'on': + execute: + inputs: + k8s: + type: group + label: Kubernetes Settings + items: + cluster: + label: Kubernetes cluster + type: kubernetes-clusters + namespace: + label: Namespace + type: kubernetes-namespaces + clusterName: ${{ inputs.k8s.cluster }} + volumes: + type: group + label: Ollama Volumes + collapsed: true + tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. + items: + pvc: + label: Persistent Volume Claim + type: dropdown + default: New + options: + - value: Existing + label: Select Existing PVC + - value: New + label: Create New PVC + pvc_mount_path: + label: Mount Path + type: string + default: /root/.ollama + pvc_existing: + label: Select PVC Name + type: kubernetes-pvc + clusterName: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + hidden: ${{ inputs.k8s.volumes.pvc !== Existing }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + pvc_storage_size: + label: Enter PVC Size + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + default: 20Gi + pvc_storage_class: + label: Enter PVC Storage Class + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: true + tooltip: Leave blank to use the default storage class configured in the cluster. + pvc_persist: + label: Persist PVC After Completion + type: boolean + default: false + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + tooltip: If true, the PVC will persist after the job is canceled or completed. 
If false, it will be deleted. + pvc_name: + label: Enter PVC Name + type: string + hidden: ${{ inputs.k8s.volumes.pvc_persist === false || inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + ollama_resources: + type: group + label: Ollama Resources + collapsed: true + items: + requests: + type: group + label: Requests + items: + memory: + label: Memory + type: string + default: 2Gi + tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). + cpu: + label: CPU + type: string + default: '2' + tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). + limits: + type: group + label: Limits + items: + memory: + label: Memory + type: string + default: 4Gi + tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). + cpu: + label: CPU + type: string + default: '4' + tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). + select_gpu: + label: Select GPU Device + type: dropdown + tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. + default: nvidia.com/gpu + options: + - value: None + label: None + - value: nvidia.com/gpu + label: Nvidia GPU + - value: amd.com/gpu + label: AMD GPU + - value: cloud-tpus.google.com/v3 + label: Google TPU + - value: Custom + label: Custom GPU Resource Key + gpu_resource_key: + label: Custom GPU Resource Key + type: string + hidden: ${{ inputs.k8s.ollama_resources.limits.select_gpu !== Custom }} + ignore: ${{ .hidden }} + tooltip: | + Specify a custom GPU resource key for Kubernetes, such as: + - nvidia.com/gpu + - amd.com/gpu + - cloud-tpus.google.com/v3 + - nvidia.com/mig-1g.5gb + - nvidia.com/mig-2g.10gb + - nvidia.com/mig-3g.20gb + number_of_gpus: + label: Number of GPUs + type: number + step: 1 + default: 1 + min: 1 + tooltip: Specify the number of GPUs to allocate for the deployment. + hidden: ${{ inputs.k8s.ollama_resources.limits.select_gpu === None }} + ignore: ${{ .hidden }} + openwebui_resources: + type: group + label: OpenWebUI Resources + collapsed: true + items: + requests: + type: group + label: Requests + items: + memory: + label: Memory + type: string + default: 2Gi + tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). + cpu: + label: CPU + type: string + default: '2' + tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). + limits: + type: group + label: Limits + items: + memory: + label: Memory + type: string + default: 4Gi + tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). + cpu: + label: CPU + type: string + default: '4' + tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). 
+ ollama_k8s: + type: group + label: Ollama Settings + collapsed: true + items: + image: + label: Ollama Image + type: string + default: ollama/ollama:latest + image_port: + label: Ollama Port + type: number + default: 11434 + openwebui_k8s: + type: group + label: OpenWebUI Settings + collapsed: true + items: + image: + label: OpenWebUI Image + type: string + default: ghcr.io/open-webui/open-webui:main + image_port: + label: OpenWebUI Port + type: number + default: 8080 diff --git a/workflow/yamls/k8s/sam2-model/general.yaml b/workflow/yamls/k8s/sam2-model/general.yaml index b2e05b1b..98ad7846 100644 --- a/workflow/yamls/k8s/sam2-model/general.yaml +++ b/workflow/yamls/k8s/sam2-model/general.yaml @@ -166,7 +166,12 @@ jobs: targetPort: 3000 type: ClusterIP EOF - apply_sam2: + - name: Dry Run Deployment + early-cancel: any-job-failed + run: | + echo "Performing dry run..." + kubectl apply -f app.yaml --dry-run=client + apply_sam2_deployment: needs: - prepare_sam2 steps: diff --git a/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml b/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml index 1faa57ad..68509fa3 100644 --- a/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml +++ b/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml @@ -143,7 +143,12 @@ jobs: containers: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton image: nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3 - args: ["tritonserver", "--model-store=${{ inputs.k8s.volumes.pvc_mount_path }}", "--model-control-mode=POLL"] + args: + - /bin/bash + - -c + - | + pip install --upgrade "vllm[gptoss]" --pre --extra-index-url https://wheels.vllm.ai/gpt-oss/ && \ + tritonserver --model-store=$MODEL_STORE --model-control-mode=POLL ports: - containerPort: 8000 name: http @@ -158,6 +163,8 @@ jobs: value: "compute,utility" - name: HF_TOKEN value: "${{ inputs.triton_k8s.hf_token }}" + - name: MODEL_STORE + value: "${{ inputs.k8s.volumes.pvc_mount_path }}" resources: requests: memory: "${{ inputs.k8s.triton_resources.requests.memory }}" @@ -306,12 +313,53 @@ jobs: - prepare_k8s_deployment steps: - name: Load outputs + early-cancel: any-job-failed run: cat OUTPUTS >> $OUTPUTS - name: Apply PVC + early-cancel: any-job-failed if: ${{ inputs.k8s.volumes.pvc === New }} run: kubectl apply -f pvc.yaml + cleanup: | + set -x + if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then + MAX_ATTEMPTS=5 + ATTEMPT=1 + while true; do + if kubectl delete -f pvc.yaml; then + echo "PVC deleted successfully" + touch pvc.deleted + break + elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then + echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" + exit 1 + else + echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." + sleep 5 + ((ATTEMPT++)) + fi + done + fi - name: Apply Deployment and Service run: kubectl apply -f app.yaml + early-cancel: any-job-failed + cleanup: | + set -x + MAX_ATTEMPTS=5 + ATTEMPT=1 + while true; do + if kubectl delete -f app.yaml; then + echo "Resources deleted successfully" + touch app.deleted + break + elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then + echo "Failed to delete resources after $MAX_ATTEMPTS attempts" + exit 1 + else + echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." 
+ sleep 5 + ((ATTEMPT++)) + fi + done - name: Wait for Deployment to be Ready early-cancel: any-job-failed env: From cab682296584cda2f24011d6add558cbfc296150 Mon Sep 17 00:00:00 2001 From: nhuytan1 Date: Mon, 11 Aug 2025 14:53:39 -0500 Subject: [PATCH 4/5] remove early-cancel when apply pvc --- workflow/yamls/k8s/sam2-model/general.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflow/yamls/k8s/sam2-model/general.yaml b/workflow/yamls/k8s/sam2-model/general.yaml index 98ad7846..60d3b741 100644 --- a/workflow/yamls/k8s/sam2-model/general.yaml +++ b/workflow/yamls/k8s/sam2-model/general.yaml @@ -102,7 +102,7 @@ jobs: labels: app: ${{ inputs.app.name }} spec: - ${runtimeClassName} + runtimeClassName: nvidia initContainers: - name: set-permissions image: busybox @@ -176,7 +176,6 @@ jobs: - prepare_sam2 steps: - name: Apply PVC - early-cancel: any-job-failed if: ${{ inputs.k8s.volumes.pvc === 'New' }} run: kubectl apply -f pvc.yaml cleanup: | From 94cf3ffe4327c0d71c65ee81736ee239270fd61b Mon Sep 17 00:00:00 2001 From: nhuytan1 Date: Wed, 13 Aug 2025 22:44:27 -0500 Subject: [PATCH 5/5] sam2: update general.yaml; add ollama general.md and general_v0.1.yaml --- workflow/yamls/k8s/ollama/general.md | 59 ++++ workflow/yamls/k8s/ollama/general_v0.1.yaml | 350 ++++++++++++++++++++ workflow/yamls/k8s/sam2-model/general.yaml | 8 +- 3 files changed, 415 insertions(+), 2 deletions(-) create mode 100644 workflow/yamls/k8s/ollama/general.md create mode 100644 workflow/yamls/k8s/ollama/general_v0.1.yaml diff --git a/workflow/yamls/k8s/ollama/general.md b/workflow/yamls/k8s/ollama/general.md new file mode 100644 index 00000000..c643cf56 --- /dev/null +++ b/workflow/yamls/k8s/ollama/general.md @@ -0,0 +1,59 @@ +# 🧠 Secure LLM Serving using Ollama on Kubernetes + +This workflow launches a **GPU-enabled Ollama server** on a Kubernetes cluster with a secure API gateway. Users can select a model (e.g., `mistral`, `qwen3`, `deepseek`), which will be pulled and served behind a public **Cloudflare Tunnel** with **API key protection**. The resulting endpoint is **OpenAI-compatible** and ready for use in tools like **LangChain**, **OpenWebUI**, or **Postman**. + +--- + +## 🚀 Quick Start + +- **Select a Kubernetes Cluster:** Choose your target K8s cluster. +- **Set Namespace:** Specify the namespace to deploy in (e.g., `default`, `summer2025interns`). +- **Choose Model:** Select a model like `mistral`, `qwen3:4b`, or `deepseek-r1:1.5b`. + > 🔍 **Browse available models** at [https://ollama.com/models](https://ollama.com/models) +- **Define Resources:** Pick a GPU-enabled preset or set custom CPU/RAM/GPU limits. +- **Run the Workflow:** Deploy and wait for the endpoint to be available. + +--- + +## 🔐 Accessing the API + +Once deployed, the system will: + +- ✅ Generate a **secure API key** +- ✅ Start an **OpenAI-compatible proxy** +- ✅ Launch a **Cloudflare Tunnel** to expose the endpoint publicly + +You will receive these credentials in the logs: + +- **API Key** +- **Public Endpoint (URL)** +- **Model Name** + +Use them to authenticate with any OpenAI-compatible frontend. + +--- + +## 🧩 AI Integration in Parallel Works + +After deployment, the workflow **automatically registers the new model endpoint** as an AI Provider in Parallel Works. This enables: + +- Seamless use in **AI Chat** workflows +- Easy model selection in downstream **pipelines** +- Reuse across teams and namespaces with API key control + +No manual setup needed — everything is handled during execution. 
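+
+---
+
+## 🔎 Retrieving Credentials Later
+
+The API key, tunnel URL, and model name are printed in the workflow logs. If you need to look them up again while the deployment is still running, they can also be read directly from the pod. The sketch below assumes the default deployment name `ollama-ui` and the `default` namespace; adjust both to match your inputs:
+
+```bash
+ns=default
+pod=$(kubectl get pods -n "$ns" -l app=ollama-ui -o jsonpath='{.items[0].metadata.name}')
+
+# API key written by the ollama container at startup
+kubectl exec -n "$ns" "$pod" -c ollama -- cat /tmp/api_key.txt
+
+# Public endpoint captured from the Cloudflare tunnel log
+kubectl exec -n "$ns" "$pod" -c ollama-proxy -- \
+  grep -o 'https://[a-zA-Z0-9.-]*\.trycloudflare\.com' /tmp/cloudflared-url.txt | head -n1
+```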
+ +--- + +## 📡 Integration Example + +Example `curl` request: + +```bash +curl https://.trycloudflare.com/v1/chat/completions \ + -H "Authorization: Bearer " \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mistral", + "messages": [{"role": "user", "content": "Hello!"}] + }' diff --git a/workflow/yamls/k8s/ollama/general_v0.1.yaml b/workflow/yamls/k8s/ollama/general_v0.1.yaml new file mode 100644 index 00000000..8f83711c --- /dev/null +++ b/workflow/yamls/k8s/ollama/general_v0.1.yaml @@ -0,0 +1,350 @@ +permissions: + - '*' +sessions: + session: + useTLS: false + redirect: true + useCustomDomain: true +app: + target: inputs.k8s.cluster +jobs: + get_pw_api_key: + steps: + - name: Authenticate with K8s + run: pw kube auth ${{ inputs.k8s.cluster }} + - name: Export PW_API_KEY from Parallel Works environment + run: | + source /etc/profile.d/parallelworks-env.sh + echo "PW_API_KEY found: ${PW_API_KEY:0:6}********" + echo "pw_api_key=$PW_API_KEY" >> OUTPUTS + + pw_api_key_b64=$(echo -n "$PW_API_KEY" | base64) + echo "pw_api_key_b64=$pw_api_key_b64" >> OUTPUTS + + + team=$(curl -s -H "Authorization: Basic $pw_api_key_b64" https://activate.parallel.works/api/v2/teams | grep -o '"id": *"[^"]*"' | head -n1 | cut -d':' -f2 | tr -d ' "') + echo "team=$team" >> OUTPUTS + + org=$(curl -s -H "Authorization: Basic $pw_api_key_b64" https://activate.parallel.works/api/auth/whoami/organization | tr -d '"') + echo "org=$org" >> OUTPUTS + + np=$(curl -s -H "Authorization: Basic $pw_api_key_b64" https://activate.parallel.works/api/auth/whoami | sed 's/^user://') + echo "np=$np" >> OUTPUTS + create_pvc: + needs: + - get_pw_api_key + steps: + - name: Authenticate Kubernetes + run: pw kube auth ${{ inputs.k8s.cluster }} + - name: Create PVC + run: | + cat < test-pvc.yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: test-pvc + namespace: ${{ inputs.k8s.namespace }} + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 30Gi + EOF + kubectl apply -f test-pvc.yaml + deploy_ollama_openwebui: + needs: + - create_pvc + steps: + - name: Authenticate Kubernetes + run: pw kube auth ${{ inputs.k8s.cluster }} + - name: Generate API Key and Deployment YAML + run: | + API_KEY=$(openssl rand -hex 16) + echo "Generated API_KEY=$API_KEY" + + cat < ollama-openwebui-deployment.yaml + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ inputs.app.name }} + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: + app: ${{ inputs.app.name }} + template: + metadata: + labels: + app: ${{ inputs.app.name }} + spec: + runtimeClassName: nvidia + volumes: + - name: shared-pvc + persistentVolumeClaim: + claimName: ${{ inputs.pvc.name }} + - name: shared-tmp + emptyDir: {} + containers: + - name: ollama + image: ollama/ollama + ports: + - containerPort: 11434 + resources: + limits: + nvidia.com/gpu: ${{ inputs.resources.gpu_count }} + cpu: "2" + memory: "4Gi" + volumeMounts: + - mountPath: /root/.ollama + name: shared-pvc + - mountPath: /tmp + name: shared-tmp + command: ["sh", "-c"] + args: + - | + set -ex + apt-get update && apt-get install -y curl + echo "$API_KEY" > /tmp/api_key.txt + ollama serve & + + until curl -s http://localhost:11434/api/tags > /dev/null; do + echo "Waiting for Ollama..." 
+ sleep 2 + done + + echo "Pulling model: ${{ inputs.app.model }}" + ollama pull ${{ inputs.app.model }} + + wait + + - name: ollama-proxy + image: nhuytan/ollama-apikey-proxy:latest + ports: + - containerPort: 8000 + env: + - name: OLLAMA_API_KEY + value: "$API_KEY" + - name: OLLAMA_BASE_URL + value: "http://localhost:11434" + volumeMounts: + - mountPath: /tmp + name: shared-tmp + command: ["sh", "-c"] + args: + - | + apt-get update && apt-get install -y curl + + curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -o /usr/local/bin/cloudflared + chmod +x /usr/local/bin/cloudflared + + echo "Starting API proxy..." + uvicorn proxy:app --host 0.0.0.0 --port 8000 & + + echo "Starting tunnel..." + cloudflared tunnel --url http://localhost:8000 --no-autoupdate > /tmp/cloudflared-url.txt 2>&1 & + + tail -f /dev/null + + - name: openwebui + image: ghcr.io/open-webui/open-webui:main + env: + - name: OLLAMA_BASE_URL + value: "http://localhost:11434" + - name: USE_OLLAMA_DOCKER + value: "true" + ports: + - containerPort: 8080 + resources: + limits: + cpu: "1" + memory: "2Gi" + volumeMounts: + - mountPath: /app/data + name: shared-pvc + lifecycle: + postStart: + exec: + command: + - sh + - -c + - | + until curl -s http://localhost:11434/api/tags > /dev/null; do + echo "Still waiting for Ollama..." + sleep 2 + done + EOF + - name: Apply Deployment + run: kubectl apply -f ollama-openwebui-deployment.yaml + - name: Create Service + run: | + cat < ollama-openwebui-service.yaml + apiVersion: v1 + kind: Service + metadata: + name: ${{ inputs.app.name }} + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ inputs.app.name }} + ports: + - name: openwebui + protocol: TCP + port: 8080 + targetPort: 8080 + - name: ollama + protocol: TCP + port: 11434 + targetPort: 11434 + - name: proxy + protocol: TCP + port: 8000 + targetPort: 8000 + type: LoadBalancer + EOF + kubectl apply -f ollama-openwebui-service.yaml + - name: Wait for Pod to be Ready + run: | + echo "Waiting for pod to be ready..." + pod=$(kubectl get pods -n ${{ inputs.k8s.namespace }} -l app=${{ inputs.app.name }} -o jsonpath='{.items[0].metadata.name}') + kubectl wait --for=condition=Ready pod/$pod -n ${{ inputs.k8s.namespace }} --timeout=300s + create_session: + needs: + - deploy_ollama_openwebui + steps: + - name: Debug Service + Pod + run: | + echo "Checking pod + service for session connection..." + kubectl get svc ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} + kubectl get pods -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} + - name: Wait for Cloudflare Tunnel URL + run: | + pod=$(kubectl get pods -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} -o jsonpath="{.items[0].metadata.name}") + + echo "model=${{inputs.app.model}}" >> OUTPUTS + + api_key=$(kubectl exec -n ${{ inputs.k8s.namespace }} "$pod" -c ollama -- sh -c 'cat /tmp/api_key.txt' || echo "") + echo "api_key=$api_key" >> OUTPUTS + + for i in $(seq 1 15); do + url=$(kubectl exec -n ${{ inputs.k8s.namespace }} "$pod" -c ollama-proxy -- sh -c "grep -o 'https://[a-zA-Z0-9.-]*\\.trycloudflare\\.com' /tmp/cloudflared-url.txt | head -n1") + if [ -n "$url" ]; then + echo "tunnel_url=$url" >> OUTPUTS + break + fi + echo "Waiting for tunnel URL..." 
+ sleep 2 + done + - name: Display Information to put at AI + run: | + source OUTPUTS + echo "Model: $model" + echo "API Key: $api_key" + echo "Tunnel URL: $tunnel_url" + + echo "PW_API_KEY: $pw_api_key_b64" + echo "TEAM": $team + echo "Organization": $org + echo "Namespace": $np + - name: Expose Session + uses: parallelworks/update-session + with: + remotePort: '8080' + name: ${{ sessions.session }} + slug: '' + targetInfo: + name: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + resourceType: services + resourceName: ${{ inputs.app.name }} + keep_alive: + needs: + - create_session + steps: + - name: Display Information for AI - backup + run: | + source OUTPUTS + echo "Model: $model" + echo "API Key: $api_key" + echo "Tunnel URL: $tunnel_url" + + echo "PW_API_KEY: $pw_api_key_b64" + echo "TEAM": $team + echo "Organization": $org + echo "Namespace": $np + - name: Register AI Chat Provider + run: | + source OUTPUTS + + echo "Registering AI Chat Provider..." + safe_model=$(echo "$model" | tr -cd 'a-z0-9') + unique_suffix=$(date +%s | tail -c 6) + aichat_name="${safe_model}-${unique_suffix}" + echo "aichat_name=$aichat_name" + + curl -s -X POST "https://activate.parallel.works/api/organizations/$org/namespaces/$np/aichat-providers" \ + -H "Authorization: Basic $pw_api_key_b64" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "'"$aichat_name"'", + "description": "'"$model"'", + "tags": "", + "csp": "custom", + "team": "'"$team"'", + "variables": { + "endpoint": "'"$tunnel_url"'", + "apiKey": "'"$api_key"'", + "model": "'"$model"'" + } + }' + - name: Keep Session Running + run: tail -f /dev/null + cleanup: | + echo "Cleaning up resources..." + kubectl delete deployment ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete service ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --ignore-not-found +'on': + execute: + inputs: + k8s: + type: group + label: Kubernetes Settings + items: + cluster: + label: Kubernetes Cluster + type: kubernetes-clusters + namespace: + label: Namespace + type: kubernetes-namespaces + clusterName: ${{ inputs.k8s.cluster }} + app: + type: group + label: App Settings + items: + name: + label: Deployment Name + type: string + default: ollama-ui + model: + label: Ollama Model to Pull + type: string + default: mistral + pvc: + type: group + label: Shared Volume + items: + name: + label: PVC Name + type: string + default: test-pvc + resources: + type: group + label: GPU Settings + items: + gpu_count: + label: Number of GPUs + type: number + default: 1 + min: 1 + step: 1 diff --git a/workflow/yamls/k8s/sam2-model/general.yaml b/workflow/yamls/k8s/sam2-model/general.yaml index 60d3b741..93b81d72 100644 --- a/workflow/yamls/k8s/sam2-model/general.yaml +++ b/workflow/yamls/k8s/sam2-model/general.yaml @@ -291,15 +291,18 @@ jobs: sleep 2 done - name: Get SLUG + early-cancel: any-job-failed run: | echo "slug=" >> $OUTPUTS - name: Debug Service + Pod + early-cancel: any-job-failed run: | echo "Checking pod + service for session connection..." kubectl get svc ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} kubectl get pods -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} kubectl describe svc ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} - name: Wait for Port 3000 to be Ready + early-cancel: any-job-failed run: | echo "Polling HTTP response from localhost:3000 inside container..." 
pod=$(kubectl get pods -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} -o jsonpath="{.items[0].metadata.name}") @@ -312,6 +315,7 @@ jobs: sleep 2 done - name: Expose Session + early-cancel: any-job-failed uses: parallelworks/update-session with: remotePort: '3000' @@ -378,7 +382,7 @@ jobs: hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} ignore: ${{ .hidden }} optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. + tooltip: Leave blank to use the default storage class configured in the cluster. Use 'local-path' if default does not work pvc_persist: label: Persist PVC After Completion type: boolean @@ -399,7 +403,7 @@ jobs: label: App Settings items: name: - label: Deployment Name + label: App Name type: string default: sam2demo resources: