From 4de2d1c84f4572c7f093aab9f732d02fca7351d8 Mon Sep 17 00:00:00 2001
From: nhuytan1
Date: Mon, 28 Jul 2025 11:22:53 -0500
Subject: [PATCH 1/5] add sam2-model code from local machine

---
 workflow/yamls/k8s/sam2-model/general.md   |  48 +++
 workflow/yamls/k8s/sam2-model/general.yaml | 332 +++++++++++++++++++++
 2 files changed, 380 insertions(+)
 create mode 100644 workflow/yamls/k8s/sam2-model/general.md
 create mode 100644 workflow/yamls/k8s/sam2-model/general.yaml

diff --git a/workflow/yamls/k8s/sam2-model/general.md b/workflow/yamls/k8s/sam2-model/general.md
new file mode 100644
index 00000000..4421fae3
--- /dev/null
+++ b/workflow/yamls/k8s/sam2-model/general.md
@@ -0,0 +1,48 @@
+# Video Object Tracking using SAM2 model on Kubernetes
+
+This workflow launches a GPU-powered video object tracking interface on a Kubernetes cluster. Users can upload a video, select an object in the first frame, and run the tracking process. Once complete, both the tracked and stacked output videos are available for download.
+
+## Quick Start
+
+- **Select a Kubernetes Cluster:** Choose your target K8s cluster.
+- **Set Namespace:** Specify the namespace to deploy in (e.g., `default`, `summer2025interns`).
+- **Choose Number of GPUs:** Define how many GPUs (or MIG instances) to allocate for the workload.
+- **Run the Workflow:** Launch the interface and wait for the deployment to be ready.
+
+---
+
+## Using the Web Interface
+
+Once the UI is available, follow these steps:
+
+- **Upload a Video:**
+  - Accepted formats: `.mp4`, `.mov`
+  - Recommended: less than 15 seconds and under 1080p resolution for best performance
+
+- **Select an Object:**
+  - Use the interactive canvas to click on the target object in the first frame
+  - This initializes the tracking point for segmentation
+
+- **Run Tracking:**
+  - Start the segmentation and tracking pipeline
+  - The system processes the video on the GPU (or falls back to CPU if needed)
+
+---
+
+## GPU Acceleration & MIG
+
+For best performance, the workflow runs on GPU-enabled nodes.
+MIG (Multi-Instance GPU) support allows multiple jobs to run concurrently with isolated memory and compute slices.
+This ensures efficient resource usage when running multiple video tracking sessions in parallel.
+
+---
+
+## Output
+
+Once processing completes:
+
+- **Tracked Video:** Shows the object followed across frames with a visual overlay
+- **Stacked Video:** Displays input/output side-by-side for comparison
+- Both files will be available for download directly from the interface
+
+---
diff --git a/workflow/yamls/k8s/sam2-model/general.yaml b/workflow/yamls/k8s/sam2-model/general.yaml
new file mode 100644
index 00000000..06e03521
--- /dev/null
+++ b/workflow/yamls/k8s/sam2-model/general.yaml
@@ -0,0 +1,332 @@
+permissions:
+  - '*'
+sessions:
+  session:
+    useTLS: false
+    redirect: true
+    useCustomDomain: true
+app:
+  target: inputs.k8s.cluster
+jobs:
+  auth_k8s:
+    steps:
+      - name: Authenticate kubectl
+        run: pw kube auth ${{ inputs.k8s.cluster }}
+  prepare_k8s_pvc:
+    needs:
+      - auth_k8s
+    steps:
+      - name: Creating New PVC YAML
+        if: ${{ inputs.k8s.volumes.pvc === 'New' }}
+        run: |
+          pvc_name="${{ inputs.k8s.volumes.pvc_name }}"
+          pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }}
+          if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then
+            default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)' | awk '{print $1}')
+            if [ $? -ne 0 ]; then
+              echo "WARNING: Could not obtain default storageClass with command:"
+              echo "         kubectl get storageclass -n ${{ inputs.k8s.namespace }}"
+              echo "         Using empty storageClassName"
+              storageClassName=""
+            elif [ -z "${default_class}" ]; then
+              echo "ERROR: No default storage class found. You must specify one explicitly."
+              exit 1
+            else
+              storageClassName="storageClassName: ${default_class}"
+            fi
+          else
+            storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}"
+          fi
+          echo "${pvc_name}" > pvc_name
+          cat <<EOF > test-pvc.yaml
+          apiVersion: v1
+          kind: PersistentVolumeClaim
+          metadata:
+            name: ${pvc_name}
+            namespace: ${{ inputs.k8s.namespace }}
+          spec:
+            accessModes:
+              - ReadWriteOnce
+            resources:
+              requests:
+                storage: ${{ inputs.k8s.volumes.pvc_storage_size }}
+            ${storageClassName}
+          EOF
+          cat test-pvc.yaml
+      - name: Dry Run PVC
+        if: ${{ inputs.k8s.volumes.pvc === 'New' }}
+        run: |
+          echo "Performing dry run..."
+          kubectl apply -f test-pvc.yaml --dry-run=client
+      - name: Apply PVC
+        if: ${{ inputs.k8s.volumes.pvc === 'New' }}
+        run: kubectl apply -f test-pvc.yaml
+        cleanup: |
+          if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then
+            kubectl delete -f test-pvc.yaml
+            touch pvc.deleted
+          fi
+  deploy_sam2:
+    needs:
+      - prepare_k8s_pvc
+    steps:
+      - name: Generate Deployment YAML
+        run: |
+          if [[ "${{ inputs.resources.gpu_type }}" == "Custom" ]]; then
+            gpu_limits="${{ inputs.resources.gpu_resource_key }}: ${{ inputs.resources.gpu_count }}"
+          elif [[ "${{ inputs.resources.gpu_type }}" != "None" ]]; then
+            gpu_limits="${{ inputs.resources.gpu_type }}: ${{ inputs.resources.gpu_count }}"
+          fi
+          if kubectl get runtimeclass nvidia &>/dev/null; then
+            echo "nvidia RuntimeClass is available"
+            runtimeClassName="runtimeClassName: nvidia"
+          fi
+          if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then
+            pvc_name=${{ inputs.k8s.volumes.pvc_existing }}
+          else
+            pvc_name=$(cat pvc_name)
+          fi
+          cat <<EOF > sam2-deployment.yaml
+          apiVersion: apps/v1
+          kind: Deployment
+          metadata:
+            name: ${{ inputs.app.name }}
+            namespace: ${{ inputs.k8s.namespace }}
+          spec:
+            replicas: 1
+            selector:
+              matchLabels:
+                app: ${{ inputs.app.name }}
+            template:
+              metadata:
+                labels:
+                  app: ${{ inputs.app.name }}
+              spec:
+                ${runtimeClassName}
+                initContainers:
+                  - name: set-permissions
+                    image: busybox
+                    command: ["sh", "-c", "chmod -R 777 /models"]
+                    securityContext:
+                      runAsUser: 0
+                    volumeMounts:
+                      - name: model-storage
+                        mountPath: /models
+                containers:
+                  - name: ${{ inputs.app.name }}
+                    image: nhuytan/sam2-video-tracker:test
+                    imagePullPolicy: Always
+                    ports:
+                      - containerPort: 3000
+                    command: ["sh", "-c"]
+                    args: ["pnpm install && echo 'Starting Next.js server...'
&& pnpm start && tail -f /dev/null"] + resources: + limits: + ${gpu_limits} + requests: + ${gpu_limits} + env: + - name: TORCH_DEVICE + value: "cuda" + - name: SAFE_MODE + value: "true" + - name: PYTORCH_CUDA_ALLOC_CONF + value: "max_split_size_mb:128,garbage_collection_threshold:0.8" + - name: NODE_ENV + value: "production" + - name: NEXT_TELEMETRY_DISABLED + value: "1" + - name: PYTHONPATH + value: "/app" + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "compute,utility" + - name: PYTHONUNBUFFERED + value: "1" + volumeMounts: + - name: model-storage + mountPath: /models + volumes: + - name: model-storage + persistentVolumeClaim: + claimName: ${pvc_name} + --- + apiVersion: v1 + kind: Service + metadata: + name: ${{ inputs.app.name }} + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ inputs.app.name }} + ports: + - protocol: TCP + port: 3000 + targetPort: 3000 + type: ClusterIP + EOF + - name: Apply Deployment + run: kubectl apply -f sam2-deployment.yaml + - name: Wait for Pod to be Ready + run: | + echo "Waiting for pod to be ready..." + kubectl wait --for=condition=Ready pod -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --timeout=300s + create_session: + needs: + - deploy_sam2 + steps: + - name: Get SLUG + run: | + echo "slug=" >> $OUTPUTS + - name: Debug Service + Pod + run: | + echo "Checking pod + service for session connection..." + kubectl get svc ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} + kubectl get pods -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} + kubectl describe svc ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} + - name: Wait for Port 3000 to be Ready + run: | + echo "Polling HTTP response from localhost:3000 inside container..." + pod=$(kubectl get pods -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} -o jsonpath="{.items[0].metadata.name}") + for i in {1..30}; do + echo "Checking if port 3000 is responding (attempt $i)..." + if kubectl exec -n ${{ inputs.k8s.namespace }} "$pod" -- sh -c "curl -s http://localhost:3000 >/dev/null"; then + echo " Port 3000 is now responding!" + break + fi + sleep 2 + done + - name: Expose Session + uses: parallelworks/update-session + with: + remotePort: '3000' + name: ${{ sessions.session }} + slug: ${{ needs.create_session.outputs.slug }} + targetInfo: + name: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + resourceType: services + resourceName: ${{ inputs.app.name }} + keep_alive: + needs: + - create_session + steps: + - name: Keep Session Running + run: tail -f /dev/null + cleanup: | + echo "Cleaning up resources..." + kubectl delete deployment ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete service ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --ignore-not-found + if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then + kubectl delete pvc ${pvc_name} -n ${{ inputs.k8s.namespace }} --ignore-not-found + fi +'on': + execute: + inputs: + k8s: + type: group + label: Kubernetes Settings + items: + cluster: + label: Kubernetes Cluster + type: kubernetes-clusters + namespace: + label: Namespace + type: kubernetes-namespaces + clusterName: ${{ inputs.k8s.cluster }} + default: summer2025interns + volumes: + type: group + label: Storage Settings + collapsed: true + tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
+ items: + pvc: + label: Persistent Volume Claim + type: dropdown + default: New + options: + - value: Existing + label: Select Existing PVC + - value: New + label: Create New PVC + pvc_mount_path: + label: Mount Path + type: string + default: /models + pvc_existing: + label: Select PVC Name + type: kubernetes-pvc + clusterName: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + hidden: ${{ inputs.k8s.volumes.pvc !== 'Existing' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + default: sam2 + pvc_storage_size: + label: Enter PVC Size + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + default: 50Gi + pvc_storage_class: + label: Enter PVC Storage Class + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: true + tooltip: Leave blank to use the default storage class configured in the cluster. + pvc_persist: + label: Persist PVC After Completion + type: boolean + default: false + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. + pvc_name: + label: Enter PVC Name + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + default: sam2 + app: + type: group + label: App Settings + items: + name: + label: Deployment Name + type: string + default: sam2demo + resources: + type: group + label: GPU Settings + items: + gpu_type: + label: Select GPU Device + type: dropdown + default: nvidia.com/gpu + options: + - value: None + label: None + - value: nvidia.com/gpu + label: Nvidia GPU + - value: Custom + label: Custom GPU Resource Key + gpu_resource_key: + label: Custom GPU Resource Key + type: string + hidden: ${{ inputs.resources.gpu_type !== 'Custom' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + gpu_count: + label: Number of GPUs + type: number + default: 1 + min: 1 + step: 1 + hidden: ${{ inputs.resources.gpu_type === 'None' }} + ignore: ${{ .hidden }} From f77227d08a04aca2a9e2ae626fa8aef504fdf9ee Mon Sep 17 00:00:00 2001 From: nhuytan1 Date: Mon, 4 Aug 2025 11:29:16 -0500 Subject: [PATCH 2/5] Fix bug --- workflow/yamls/k8s/sam2-model/general.yaml | 159 +++- .../triton-vllm/general_removecheckgpu.yaml | 652 +++++++++++++++ .../yamls/k8s/triton-vllm/general_v0.1.yaml | 741 ++++++++++++++++++ 3 files changed, 1521 insertions(+), 31 deletions(-) create mode 100644 workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml create mode 100644 workflow/yamls/k8s/triton-vllm/general_v0.1.yaml diff --git a/workflow/yamls/k8s/sam2-model/general.yaml b/workflow/yamls/k8s/sam2-model/general.yaml index 06e03521..b2e05b1b 100644 --- a/workflow/yamls/k8s/sam2-model/general.yaml +++ b/workflow/yamls/k8s/sam2-model/general.yaml @@ -11,12 +11,14 @@ jobs: auth_k8s: steps: - name: Authenticate kubectl + early-cancel: any-job-failed run: pw kube auth ${{ inputs.k8s.cluster }} prepare_k8s_pvc: needs: - auth_k8s steps: - name: Creating New PVC YAML + early-cancel: any-job-failed if: ${{ inputs.k8s.volumes.pvc === 'New' }} run: | pvc_name="${{ inputs.k8s.volumes.pvc_name }}" @@ -38,7 +40,7 @@ jobs: storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" fi echo "${pvc_name}" > pvc_name - cat < test-pvc.yaml + cat < pvc.yaml apiVersion: v1 kind: PersistentVolumeClaim metadata: @@ -52,25 +54,23 @@ jobs: storage: ${{ 
inputs.k8s.volumes.pvc_storage_size }} ${storageClassName} EOF - cat test-pvc.yaml + cat pvc.yaml - name: Dry Run PVC + early-cancel: any-job-failed if: ${{ inputs.k8s.volumes.pvc === 'New' }} run: | echo "Performing dry run..." - kubectl apply -f test-pvc.yaml --dry-run=client - - name: Apply PVC - if: ${{ inputs.k8s.volumes.pvc === 'New' }} - run: kubectl apply -f test-pvc.yaml - cleanup: | - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - kubectl delete -f test-pvc.yaml - touch pvc.deleted - fi - deploy_sam2: + kubectl apply -f pvc.yaml --dry-run=client + - name: Dummy + early-cancel: any-job-failed + run: echo Dummy + + prepare_sam2: needs: - prepare_k8s_pvc steps: - - name: Generate Deployment YAML + - name: Create Deployment and Service YAML + early-cancel: any-job-failed run: | if [[ "${{ inputs.resources.gpu_type }}" == "Custom" ]]; then gpu_limits="${{ inputs.resources.gpu_resource_key }}: ${{ inputs.resources.gpu_count }}" @@ -86,7 +86,7 @@ jobs: else pvc_name=$(cat pvc_name) fi - cat < sam2-deployment.yaml + cat < app.yaml apiVersion: apps/v1 kind: Deployment metadata: @@ -166,16 +166,126 @@ jobs: targetPort: 3000 type: ClusterIP EOF - - name: Apply Deployment - run: kubectl apply -f sam2-deployment.yaml + apply_sam2: + needs: + - prepare_sam2 + steps: + - name: Apply PVC + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === 'New' }} + run: kubectl apply -f pvc.yaml + cleanup: | + set -x + if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then + MAX_ATTEMPTS=5 + ATTEMPT=1 + while true; do + if kubectl delete -f pvc.yaml; then + echo "PVC deleted successfully" + touch pvc.deleted + break + elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then + echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" + exit 1 + else + echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." + sleep 5 + ((ATTEMPT++)) + fi + done + fi + - name: Apply Deployment and Service + run: kubectl apply -f app.yaml + cleanup: | + set -x + MAX_ATTEMPTS=5 + ATTEMPT=1 + while true; do + if kubectl delete -f app.yaml; then + echo "Resources deleted successfully" + touch app.deleted + break + elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then + echo "Failed to delete resources after $MAX_ATTEMPTS attempts" + exit 1 + else + echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." + sleep 5 + ((ATTEMPT++)) + fi + done + + - name: Wait for Deployment to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ inputs.app.name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + + log() { + while true; do + echo + echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." + kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" + + echo; echo "[INFO] $(date) - Pods status:" + kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" + + pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [[ -n "$pod_name" ]]; then + echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." + kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" + fi + + echo "---------------------------------------------" + sleep 20 + done + } + + log & + log_pid=$! + trap "kill ${log_pid}" EXIT + set -x + kubectl wait --for=condition=available --timeout=1200s deployment/${app_name} -n ${namespace} + exit_code=$? 
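+          # Dump the final deployment state for the workflow log, then exit with the result of the kubectl wait above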
+ kubectl get deployment ${app_name} -n ${namespace} -o wide + kubectl describe deployment ${app_name} -n ${namespace} + exit ${exit_code} + - name: Wait for Pod to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ inputs.app.name }} + namespace: ${{ inputs.k8s.namespace }} run: | echo "Waiting for pod to be ready..." - kubectl wait --for=condition=Ready pod -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --timeout=300s + kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s + sam2_pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") + echo "sam2_pod=$sam2_pod" | tee -a $OUTPUTS | tee -a OUTPUTS + touch pod.running + + - name: Stream Logs + early-cancel: any-job-failed + run: | + kubectl logs -f deployment/${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} + echo Existing create_session: needs: - - deploy_sam2 + - prepare_sam2 steps: + - name: Wait until the Kubernetes deployment reaches its final stage + early-cancel: any-job-failed + run: | + while true; do + if [ -f "app.deleted" ]; then + echo "File app.deleted was detected. Exiting..." + exit 0 + elif [ -f "pod.running" ]; then + echo "Pod is ready" + break + fi + sleep 2 + done - name: Get SLUG run: | echo "slug=" >> $OUTPUTS @@ -208,19 +318,6 @@ jobs: namespace: ${{ inputs.k8s.namespace }} resourceType: services resourceName: ${{ inputs.app.name }} - keep_alive: - needs: - - create_session - steps: - - name: Keep Session Running - run: tail -f /dev/null - cleanup: | - echo "Cleaning up resources..." - kubectl delete deployment ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --ignore-not-found - kubectl delete service ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --ignore-not-found - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - kubectl delete pvc ${pvc_name} -n ${{ inputs.k8s.namespace }} --ignore-not-found - fi 'on': execute: inputs: diff --git a/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml b/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml new file mode 100644 index 00000000..1faa57ad --- /dev/null +++ b/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml @@ -0,0 +1,652 @@ +permissions: + - '*' +sessions: + session: + useTLS: false + redirect: true + useCustomDomain: true +app: + target: inputs.k8s.cluster +jobs: + auth_k8s: + steps: + - name: Authenticate kubectl + early-cancel: any-job-failed + run: pw kube auth ${{ inputs.k8s.cluster }} + prepare_k8s_pvc: + needs: + - auth_k8s + steps: + - name: Creating New PVC YAML + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === New }} + run: | + if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then + pvc_name="${{ inputs.k8s.volumes.pvc_name }}" + else + job_number=$(pwd | rev | cut -d "/" -f1 | rev) + workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) + pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc + fi + pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} + if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then + default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)' | awk '{print $1}') + if [ -z "${default_class}" ]; then + echo "ERROR: No default storage class found. 
Available storage classes:" + kubectl get storageclass -n ${{ inputs.k8s.namespace }} + exit 1 + fi + storageClassName="storageClassName: $default_class" + else + storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" + fi + echo "${pvc_name}" > pvc_name + cat < pvc.yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ${pvc_name} + namespace: ${{ inputs.k8s.namespace }} + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: ${{ inputs.k8s.volumes.pvc_storage_size }} + ${storageClassName} + EOF + cat pvc.yaml + - name: Dry Run PVC + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === New }} + run: | + echo "Performing dry run for PVC..." + kubectl apply -f pvc.yaml --dry-run=client + - name: Dummy + early-cancel: any-job-failed + run: echo Dummy + prepare_k8s_deployment: + needs: + - prepare_k8s_pvc + steps: + - name: Defining App Name + early-cancel: any-job-failed + run: | + job_number=$(pwd | rev | cut -d "/" -f1 | rev) + workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) + app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) + echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS + - name: Creating Deployment and Service YAML + early-cancel: any-job-failed + run: | + if [[ "${{ inputs.k8s.triton_resources.limits.select_gpu }}" == "Custom" ]]; then + gpu_limits="${{ inputs.k8s.triton_resources.limits.gpu_resource_key }}: ${{ inputs.k8s.triton_resources.limits.number_of_gpus }}" + else + gpu_limits="${{ inputs.k8s.triton_resources.limits.select_gpu }}: ${{ inputs.k8s.triton_resources.limits.number_of_gpus }}" + fi + gpu_check_limits="nvidia.com/gpu: 1" + + tensor_parallel_size=${{ inputs.triton_k8s.tensor_parallel_size }} + + if kubectl get runtimeclass nvidia &>/dev/null; then + runtimeClassName="runtimeClassName: nvidia" + else + runtimeClassName="" + fi + + if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then + pvc_name=${{ inputs.k8s.volumes.pvc_existing }} + else + pvc_name=$(cat pvc_name) + fi + + cat < app.yaml + --- + # Deployment for Triton Inference Server + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + template: + metadata: + labels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + spec: + runtimeClassName: nvidia + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" + + initContainers: + - name: set-permissions + image: busybox + command: ["sh", "-c", "chmod -R 777 ${{ inputs.k8s.volumes.pvc_mount_path }}"] + securityContext: + runAsUser: 0 + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + - name: init-model-repository + image: busybox + command: ["sh", "-c", "mkdir -p ${{ inputs.k8s.volumes.pvc_mount_path }}/vllm_model/1 && echo '{\"model\": \"${{ inputs.triton_k8s.model }}\", \"gpu_memory_utilization\": ${{ inputs.triton_k8s.gpu_memory_utilization }}, \"max_num_seqs\": ${{ inputs.triton_k8s.max_num_seqs }}, \"max_model_len\": ${{ inputs.triton_k8s.max_model_len }} ,\"tensor_parallel_size\": ${tensor_parallel_size}}' > ${{ inputs.k8s.volumes.pvc_mount_path }}/vllm_model/1/model.json && echo 'backend: \"vllm\"\ninstance_group [\n {\n count: 1\n kind: KIND_GPU\n }\n]' > ${{ 
inputs.k8s.volumes.pvc_mount_path }}/vllm_model/config.pbtxt"] + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + containers: + - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + image: nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3 + args: ["tritonserver", "--model-store=${{ inputs.k8s.volumes.pvc_mount_path }}", "--model-control-mode=POLL"] + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "compute,utility" + - name: HF_TOKEN + value: "${{ inputs.triton_k8s.hf_token }}" + resources: + requests: + memory: "${{ inputs.k8s.triton_resources.requests.memory }}" + cpu: "${{ inputs.k8s.triton_resources.requests.cpu }}" + limits: + memory: "${{ inputs.k8s.triton_resources.limits.memory }}" + cpu: "${{ inputs.k8s.triton_resources.limits.cpu }}" + ${gpu_limits} + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + volumes: + - name: storage + persistentVolumeClaim: + claimName: ${pvc_name} + --- + # Service for Triton + apiVersion: v1 + kind: Service + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + name: http + - protocol: TCP + port: 8001 + targetPort: 8001 + name: grpc + - protocol: TCP + port: 8002 + targetPort: 8002 + name: metrics + --- + # Deployment for Gradio UI + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + template: + metadata: + labels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + spec: + containers: + - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + image: nhuytan/gradio-ui:latest + env: + - name: UI_MAX_TOKENS + value: "${{inputs.webui_k8s.ui_max_tokens}}" + - name: UI_TEMPERATURE + value: "${{inputs.webui_k8s.ui_temperature}}" + command: + - sh + - -c + - | + python -c " + import gradio as gr + import requests + import os + + # Get default values from shell environment variables + default_max_tokens = int(os.getenv('UI_MAX_TOKENS', '150')) + default_temperature = float(os.getenv('UI_TEMPERATURE', '0.8')) + + def chat(message, history, max_tokens, temperature): + + url = 'http://${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton.${{ inputs.k8s.namespace }}.svc.cluster.local.:8000/v2/models/vllm_model/generate' + + + payload = { + 'text_input': message, + 'parameters': { + 'stream': False, + 'temperature': temperature, + 'max_tokens': max_tokens + } + } + response = requests.post(url, json=payload) + return response.json()['text_output'] + # Create Gradio components for input parameters + max_tokens_slider = gr.Slider( + minimum=1, + maximum=4096, # Or whatever is appropriate for your model + value=default_max_tokens, + step=1, + label='Max Output Tokens') + + temperature_slider = gr.Slider( + minimum=0.0, + maximum=2.0, + value=default_temperature, + step=0.1, + label='Temperature') + + # Pass the components to ChatInterface + gr.ChatInterface( + chat, + additional_inputs=[max_tokens_slider, temperature_slider], + examples=[['What is the 
capital of Vietnam?', default_max_tokens, default_temperature],[ 'Tell me a short story.',200,0.7], ['Explain AI in simple terms.', 100, 0.5]]).launch(server_port=7860)" + ports: + - containerPort: 7860 + name: ui + resources: + requests: + memory: "${{ inputs.k8s.webui_resources.requests.memory }}" + cpu: "${{ inputs.k8s.webui_resources.requests.cpu }}" + limits: + memory: "${{ inputs.k8s.webui_resources.limits.memory }}" + cpu: "${{ inputs.k8s.webui_resources.limits.cpu }}" + --- + # Service for Gradio UI + apiVersion: v1 + kind: Service + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + ports: + - protocol: TCP + port: 7860 + targetPort: 7860 + name: ui + EOF + cat app.yaml + - name: Dry Run Deployment + early-cancel: any-job-failed + run: | + echo "Performing dry run for deployment..." + kubectl apply -f app.yaml --dry-run=client + apply_k8s_deployment: + needs: + - prepare_k8s_deployment + steps: + - name: Load outputs + run: cat OUTPUTS >> $OUTPUTS + - name: Apply PVC + if: ${{ inputs.k8s.volumes.pvc === New }} + run: kubectl apply -f pvc.yaml + - name: Apply Deployment and Service + run: kubectl apply -f app.yaml + - name: Wait for Deployment to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + log() { + while true; do + echo + echo "[INFO] $(date) - Checking deployment status for ${app_name}-triton in namespace ${namespace}..." + kubectl get deployment "${app_name}-triton" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" + echo "[INFO] $(date) - Pods status:" + kubectl get pods -l app="${app_name}-triton" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" + pod_name=$(kubectl get pods -l app="${app_name}-triton" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [[ -n "$pod_name" ]]; then + echo "[INFO] $(date) - Describing pod ${pod_name}..." + kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" + echo "[INFO] $(date) - Checking initContainer logs..." + kubectl logs "${pod_name}" -n "${namespace}" -c set-permissions 2>/dev/null || echo "[WARN] No logs for set-permissions" + kubectl logs "${pod_name}" -n "${namespace}" -c init-model-repository 2>/dev/null || echo "[WARN] No logs for init-model-repository" + echo "[INFO] $(date) - Checking pod status..." + kubectl get pod "${pod_name}" -n "${namespace}" -o yaml | grep -A10 "status:" || echo "[WARN] Unable to get pod status" + fi + echo "---------------------------------------------" + sleep 10 + done + } + log & + log_pid=$! + trap "kill ${log_pid}" EXIT + set -x + kubectl wait --for=condition=available --timeout=600s deployment/${app_name}-triton -n ${namespace} + exit_code=$? + if [[ $exit_code -ne 0 ]]; then + echo "[ERROR] Deployment ${app_name}-triton failed to become available. Check pod events and initContainer logs above." + exit $exit_code + fi + kubectl get deployment ${app_name}-triton -n ${namespace} -o wide + kubectl describe deployment ${app_name}-triton -n ${namespace} + exit ${exit_code} + - name: Wait for Pod to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + echo "Waiting for pod to be ready..." 
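+          # Block (up to 10 minutes) until the Triton pod reports Ready, then record its name and signal readiness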
+ kubectl wait --for=condition=Ready pod -l app=${app_name}-triton -n ${namespace} --timeout=600s + pod=$(kubectl get pods -n ${namespace} -l app=${app_name}-triton --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") + echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS + touch pod.running + - name: Stream Triton Logs + early-cancel: any-job-failed + run: | + kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }}-triton -n ${{ inputs.k8s.namespace }} & + triton_stream_pid=$? + echo ${triton_stream_pid} > triton_stream.pid + - name: Stream WebUI Logs + early-cancel: any-job-failed + run: | + kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }}-ui -n ${{ inputs.k8s.namespace }} & + webui_stream_pid=$? + echo ${webui_stream_pid} > webui_stream.pid + create_k8s_session: + needs: + - apply_k8s_deployment + steps: + - name: Wait until the Kubernetes deployment reaches its final stage + early-cancel: any-job-failed + run: | + while true; do + if [ -f "app.deleted" ]; then + echo "File app.deleted was detected. Exiting..." + exit 0 + elif [ -f "pod.running" ]; then + echo "Pod is ready" + break + fi + sleep 2 + done + - name: Get Service Name + early-cancel: any-job-failed + run: | + source OUTPUTS + echo "service_name=${app_name}-ui" | tee -a $OUTPUTS + - name: Expose port + early-cancel: any-job-failed + uses: parallelworks/update-session + with: + remotePort: '7860' + name: ${{ sessions.session }} + targetInfo: + name: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + resourceType: services + resourceName: ${{ needs.create_k8s_session.outputs.service_name }} + keep_alive: + needs: + - create_k8s_session + steps: + - name: Keep Session Running + early-cancel: any-job-failed + run: tail -f /dev/null + cleanup: | + echo "Cleaning up resources for keep_alive job..." + source OUTPUTS + kubectl delete deployment ${app_name}-triton -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete service ${app_name}-triton -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete deployment ${app_name}-ui -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete service ${app_name}-ui -n ${{ inputs.k8s.namespace }} --ignore-not-found +'on': + execute: + inputs: + k8s: + type: group + label: Kubernetes Settings + items: + cluster: + label: Kubernetes cluster + type: kubernetes-clusters + namespace: + label: Namespace + type: kubernetes-namespaces + clusterName: ${{ inputs.k8s.cluster }} + volumes: + type: group + label: Triton Volumes + collapsed: true + tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
+ items: + pvc: + label: Persistent Volume Claim + type: dropdown + default: Existing + options: + - value: Existing + label: Select Existing PVC + - value: New + label: Create New PVC + pvc_mount_path: + label: Mount Path + type: string + default: /models + pvc_existing: + label: Select PVC Name + type: kubernetes-pvc + clusterName: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + hidden: ${{ inputs.k8s.volumes.pvc !== Existing }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + pvc_storage_size: + label: Enter PVC Size + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + default: 100Gi + pvc_storage_class: + label: Enter PVC Storage Class + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: true + tooltip: Leave blank to use the default storage class configured in the cluster. + pvc_persist: + label: Persist PVC After Completion + type: boolean + default: false + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. + pvc_name: + label: Enter PVC Name + type: string + hidden: ${{ inputs.k8s.volumes.pvc_persist === false || inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + triton_resources: + type: group + label: Triton Resources + collapsed: true + items: + requests: + type: group + label: Requests + items: + memory: + label: Memory + type: string + default: 8Gi + tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 8Gi). + cpu: + label: CPU + type: string + default: '4' + tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 4, 100m). + limits: + type: group + label: Limits + items: + memory: + label: Memory + type: string + default: 16Gi + tooltip: Set the maximum memory the pod can use (e.g., 8Gi, 16Gi). + cpu: + label: CPU + type: string + default: '8' + tooltip: Set the maximum CPU the pod can use (e.g., 4, 8, 500m). + select_gpu: + label: Select GPU Device + type: dropdown + tooltip: Choose the type of GPU device for the deployment, if needed. + default: nvidia.com/gpu + options: + - value: nvidia.com/gpu + label: Nvidia GPU + - value: amd.com/gpu + label: AMD GPU + - value: Custom + label: Custom GPU Resource Key + gpu_resource_key: + label: Custom GPU Resource Key + type: string + hidden: ${{ inputs.k8s.triton_resources.limits.select_gpu !== Custom }} + ignore: ${{ .hidden }} + tooltip: Specify a custom GPU resource key for Kubernetes. + number_of_gpus: + label: Number of GPUs + type: number + step: 1 + default: 1 + min: 1 + tooltip: Specify the number of GPUs to allocate for the deployment. + webui_resources: + type: group + label: WebUI Resources + collapsed: true + items: + requests: + type: group + label: Requests + items: + memory: + label: Memory + type: string + default: 2Gi + tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 2Gi). + cpu: + label: CPU + type: string + default: '2' + tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 2, 100m). + limits: + type: group + label: Limits + items: + memory: + label: Memory + type: string + default: 4Gi + tooltip: Set the maximum memory the pod can use (e.g., 2Gi, 4Gi). + cpu: + label: CPU + type: string + default: '4' + tooltip: Set the maximum CPU the pod can use (e.g., 2, 4, 500m). 
+ triton_k8s: + type: group + label: Triton Settings + collapsed: true + items: + model: + label: Model Name + type: string + default: meta-llama/Meta-Llama-3.1-8B-Instruct + tooltip: Specify the Hugging Face model to use with vLLM (e.g., meta-llama/Meta-Llama-3.1-8B-Instruct). + hf_token: + label: Hugging Face Token (hf_...) + type: password + optional: false + tooltip: Your Hugging Face API token for accessing private or gated models (e.g., Llama). + gpu_memory_utilization: + label: GPU Memory Utilization + type: number + default: 0.9 + min: 0.1 + max: 0.9 + tooltip: Specify the fraction of GPU memory to utilize (0.1 to 0.9). + max_num_seqs: + label: Max Number of Sequences + type: number + default: 4 + min: 1 + tooltip: Specify the maximum number of sequences in a batch.(concurrent requests processed per batch in vLLM) + max_model_len: + label: Max Model Length + type: number + default: 1024 + min: 512 + tooltip: Maximum model length for sequences.(Define max token length for input sequences, limiting context size and KV cache memory) + tensor_parallel_size: + label: Tensor Parallel Size + type: number + default: 1 + min: 1 + tooltip: Specify the number of GPUs for tensor parallelism. + webui_k8s: + type: group + label: WebUI Settings + collapsed: true + items: + image: + label: WebUI Image + type: string + default: python:3.10-slim + image_port: + label: WebUI Port + type: number + default: 7860 + ui_max_tokens: + label: Max Output Tokens (UI) + type: number + default: 150 + min: 1 + max: 4096 + tooltip: Maximum number of tokens the model should generate in a single response for the WebUI. + ui_temperature: + label: Temperature (UI) + type: number + default: 0.8 + min: 0 + max: 2 + step: 0.1 + tooltip: Controls the randomness of the output. Higher values (e.g., 0.8-1.0) make output more creative, lower values (e.g., 0.2) make it more deterministic. diff --git a/workflow/yamls/k8s/triton-vllm/general_v0.1.yaml b/workflow/yamls/k8s/triton-vllm/general_v0.1.yaml new file mode 100644 index 00000000..2f4a704f --- /dev/null +++ b/workflow/yamls/k8s/triton-vllm/general_v0.1.yaml @@ -0,0 +1,741 @@ +permissions: + - '*' +sessions: + session: + useTLS: false + redirect: true + useCustomDomain: true +app: + target: inputs.k8s.cluster +jobs: + auth_k8s: + steps: + - name: Authenticate kubectl + early-cancel: any-job-failed + run: pw kube auth ${{ inputs.k8s.cluster }} + prepare_k8s_pvc: + needs: + - auth_k8s + steps: + - name: Creating New PVC YAML + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === New }} + run: | + if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then + pvc_name="${{ inputs.k8s.volumes.pvc_name }}" + else + job_number=$(pwd | rev | cut -d "/" -f1 | rev) + workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) + pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc + fi + pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} + if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then + default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)' | awk '{print $1}') + if [ -z "${default_class}" ]; then + echo "ERROR: No default storage class found. 
Available storage classes:" + kubectl get storageclass -n ${{ inputs.k8s.namespace }} + exit 1 + fi + storageClassName="storageClassName: $default_class" + else + storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" + fi + echo "${pvc_name}" > pvc_name + cat < pvc.yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ${pvc_name} + namespace: ${{ inputs.k8s.namespace }} + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: ${{ inputs.k8s.volumes.pvc_storage_size }} + ${storageClassName} + EOF + cat pvc.yaml + - name: Dry Run PVC + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === New }} + run: | + echo "Performing dry run for PVC..." + kubectl apply -f pvc.yaml --dry-run=client + - name: Dummy + early-cancel: any-job-failed + run: echo Dummy + prepare_k8s_deployment: + needs: + - prepare_k8s_pvc + steps: + - name: Defining App Name + early-cancel: any-job-failed + run: | + job_number=$(pwd | rev | cut -d "/" -f1 | rev) + workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) + app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) + echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS + - name: Creating Deployment and Service YAML + early-cancel: any-job-failed + run: | + if [[ "${{ inputs.k8s.triton_resources.limits.select_gpu }}" == "Custom" ]]; then + gpu_limits="${{ inputs.k8s.triton_resources.limits.gpu_resource_key }}: ${{ inputs.k8s.triton_resources.limits.number_of_gpus }}" + else + gpu_limits="${{ inputs.k8s.triton_resources.limits.select_gpu }}: ${{ inputs.k8s.triton_resources.limits.number_of_gpus }}" + fi + gpu_check_limits="nvidia.com/gpu: 1" + + tensor_parallel_size=${{ inputs.triton_k8s.tensor_parallel_size }} + + if kubectl get runtimeclass nvidia &>/dev/null; then + runtimeClassName="runtimeClassName: nvidia" + else + runtimeClassName="" + fi + + if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then + pvc_name=${{ inputs.k8s.volumes.pvc_existing }} + else + pvc_name=$(cat pvc_name) + fi + + cat < app.yaml + --- + # Deployment for Triton Inference Server + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + template: + metadata: + labels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + spec: + runtimeClassName: nvidia + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" + + initContainers: + - name: check-gpu-memory + image: nhuytan/gpu-python:3.10-cuda12.1 + resources: + limits: + ${gpu_check_limits} + env: + - name: HF_TOKEN + value: "${{ inputs.triton_k8s.hf_token }}" + - name: MODEL_NAME + value: "${{ inputs.triton_k8s.model }}" + - name: MAX_MODEL_LEN + value: "${{ inputs.triton_k8s.max_model_len }}" + - name: MAX_NUM_SEQS + value: "${{ inputs.triton_k8s.max_num_seqs }}" + - name: GPU_MEMORY_UTILIZATION + value: "${{ inputs.triton_k8s.gpu_memory_utilization }}" + command: + - sh + - -c + - | + cat << 'EOF' > /tmp/check_gpu.py + import os + import subprocess + import sys + from transformers import AutoConfig, AutoModelForCausalLM + from accelerate.utils import calculate_maximum_sizes + import torch + + def main(): + model_name = os.getenv("MODEL_NAME") + hf_token = os.getenv("HF_TOKEN") + max_model_len = 
int(os.getenv("MAX_MODEL_LEN", "2048")) + max_num_seqs = int(os.getenv("MAX_NUM_SEQS", "4")) + gpu_mem_util = float(os.getenv("GPU_MEMORY_UTILIZATION", "0.9")) + + print(f"Checking GPU memory for model: {model_name}") + + try: + config = AutoConfig.from_pretrained(model_name, token=hf_token) + model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16) + total_bytes, _ = calculate_maximum_sizes(model) + model_weights_gb = total_bytes / (1024**3) + del model + torch.cuda.empty_cache() + print(f"Fetched config and estimated model size") + except Exception as e: + print(f"Failed to get config or model: {e}") + sys.exit(1) + + head_dim = config.hidden_size // config.num_attention_heads + kv_heads = getattr(config, "num_key_value_heads", config.num_attention_heads) + kv_cache_bytes = max_model_len * max_num_seqs * 2 * config.num_hidden_layers * kv_heads * head_dim * 2 + kv_cache_gb = kv_cache_bytes / (1024**3) + + activation_gb = max_model_len * max_num_seqs * (18 * config.hidden_size + 4 * config.intermediate_size) * 2 / (1024**3) + + overhead_gb = 1.0 + + total_needed_gb = (model_weights_gb + kv_cache_gb + activation_gb + overhead_gb) / gpu_mem_util + + print(f"Estimated model size: {model_weights_gb:.1f} GB") + print(f"Estimated KV cache: {kv_cache_gb:.1f} GB") + print(f"Estimated activations: {activation_gb:.1f} GB") + print(f"Total estimated needed: {total_needed_gb:.1f} GB (after util factor {gpu_mem_util})") + + try: + output = subprocess.check_output(['nvidia-smi', '--query-gpu=memory.total', '--format=csv,nounits,noheader']) + gpu_total_gb = float(output.decode().strip().split('\n')[0]) / 1024 + print(f"Available GPU memory: {gpu_total_gb:.1f} GB") + except Exception as e: + print(f"Failed to run nvidia-smi: {e}") + sys.exit(1) + + if total_needed_gb > gpu_total_gb: + scale = gpu_total_gb / total_needed_gb + suggested_max_model_len = max(int(max_model_len * scale * 0.5), 512) + suggested_max_num_seqs = max(int(max_num_seqs * scale * 0.5), 1) + print(f"Not enough memory. 
Need ~{total_needed_gb:.1f} GB, but only have {gpu_total_gb:.1f} GB.") + print(f"Suggest lowering: max_model_len → {suggested_max_model_len}, max_num_seqs → {suggested_max_num_seqs}") + sys.exit(1) + + print("Enough GPU memory, ready to deploy.") + + if __name__ == '__main__': + main() + EOF + python3.10 /tmp/check_gpu.py || { echo "Script failed"; exit 1; } + - name: set-permissions + image: busybox + command: ["sh", "-c", "chmod -R 777 ${{ inputs.k8s.volumes.pvc_mount_path }}"] + securityContext: + runAsUser: 0 + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + - name: init-model-repository + image: busybox + command: ["sh", "-c", "mkdir -p ${{ inputs.k8s.volumes.pvc_mount_path }}/vllm_model/1 && echo '{\"model\": \"${{ inputs.triton_k8s.model }}\", \"gpu_memory_utilization\": ${{ inputs.triton_k8s.gpu_memory_utilization }}, \"max_num_seqs\": ${{ inputs.triton_k8s.max_num_seqs }}, \"max_model_len\": ${{ inputs.triton_k8s.max_model_len }} ,\"tensor_parallel_size\": ${tensor_parallel_size}}' > ${{ inputs.k8s.volumes.pvc_mount_path }}/vllm_model/1/model.json && echo 'backend: \"vllm\"\ninstance_group [\n {\n count: 1\n kind: KIND_GPU\n }\n]' > ${{ inputs.k8s.volumes.pvc_mount_path }}/vllm_model/config.pbtxt"] + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + containers: + - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + image: nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3 + args: ["tritonserver", "--model-store=${{ inputs.k8s.volumes.pvc_mount_path }}", "--model-control-mode=POLL"] + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "compute,utility" + - name: HF_TOKEN + value: "${{ inputs.triton_k8s.hf_token }}" + resources: + requests: + memory: "${{ inputs.k8s.triton_resources.requests.memory }}" + cpu: "${{ inputs.k8s.triton_resources.requests.cpu }}" + limits: + memory: "${{ inputs.k8s.triton_resources.limits.memory }}" + cpu: "${{ inputs.k8s.triton_resources.limits.cpu }}" + ${gpu_limits} + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + volumes: + - name: storage + persistentVolumeClaim: + claimName: ${pvc_name} + --- + # Service for Triton + apiVersion: v1 + kind: Service + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + name: http + - protocol: TCP + port: 8001 + targetPort: 8001 + name: grpc + - protocol: TCP + port: 8002 + targetPort: 8002 + name: metrics + --- + # Deployment for Gradio UI + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + template: + metadata: + labels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + spec: + containers: + - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + image: nhuytan/gradio-ui:latest + env: + - name: UI_MAX_TOKENS + value: "${{inputs.webui_k8s.ui_max_tokens}}" + - name: UI_TEMPERATURE + value: "${{inputs.webui_k8s.ui_temperature}}" + command: + - sh + - -c + - | + 
python -c " + import gradio as gr + import requests + import os + + # Get default values from shell environment variables + default_max_tokens = int(os.getenv('UI_MAX_TOKENS', '150')) + default_temperature = float(os.getenv('UI_TEMPERATURE', '0.8')) + + def chat(message, history, max_tokens, temperature): + + url = 'http://${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton.${{ inputs.k8s.namespace }}.svc.cluster.local.:8000/v2/models/vllm_model/generate' + + + payload = { + 'text_input': message, + 'parameters': { + 'stream': False, + 'temperature': temperature, + 'max_tokens': max_tokens + } + } + response = requests.post(url, json=payload) + return response.json()['text_output'] + # Create Gradio components for input parameters + max_tokens_slider = gr.Slider( + minimum=1, + maximum=4096, # Or whatever is appropriate for your model + value=default_max_tokens, + step=1, + label='Max Output Tokens') + + temperature_slider = gr.Slider( + minimum=0.0, + maximum=2.0, + value=default_temperature, + step=0.1, + label='Temperature') + + # Pass the components to ChatInterface + gr.ChatInterface( + chat, + additional_inputs=[max_tokens_slider, temperature_slider], + examples=[['What is the capital of Vietnam?', default_max_tokens, default_temperature],[ 'Tell me a short story.',200,0.7], ['Explain AI in simple terms.', 100, 0.5]]).launch(server_port=7860)" + ports: + - containerPort: 7860 + name: ui + resources: + requests: + memory: "${{ inputs.k8s.webui_resources.requests.memory }}" + cpu: "${{ inputs.k8s.webui_resources.requests.cpu }}" + limits: + memory: "${{ inputs.k8s.webui_resources.limits.memory }}" + cpu: "${{ inputs.k8s.webui_resources.limits.cpu }}" + --- + # Service for Gradio UI + apiVersion: v1 + kind: Service + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-ui + ports: + - protocol: TCP + port: 7860 + targetPort: 7860 + name: ui + EOF + cat app.yaml + - name: Dry Run Deployment + early-cancel: any-job-failed + run: | + echo "Performing dry run for deployment..." + kubectl apply -f app.yaml --dry-run=client + apply_k8s_deployment: + needs: + - prepare_k8s_deployment + steps: + - name: Load outputs + run: cat OUTPUTS >> $OUTPUTS + - name: Apply PVC + if: ${{ inputs.k8s.volumes.pvc === New }} + run: kubectl apply -f pvc.yaml + - name: Apply Deployment and Service + run: kubectl apply -f app.yaml + cleanup: | + kubectl delete -f app.yamltouch app.deleted + - name: Wait for Deployment to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + log() { + while true; do + echo + echo "[INFO] $(date) - Checking deployment status for ${app_name}-triton in namespace ${namespace}..." + kubectl get deployment "${app_name}-triton" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" + echo "[INFO] $(date) - Pods status:" + kubectl get pods -l app="${app_name}-triton" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" + pod_name=$(kubectl get pods -l app="${app_name}-triton" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [[ -n "$pod_name" ]]; then + echo "[INFO] $(date) - Describing pod ${pod_name}..." + kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" + echo "[INFO] $(date) - Checking initContainer logs..." 
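+              # Show logs from the setup initContainers to help diagnose a stalled rollout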
+ kubectl logs "${pod_name}" -n "${namespace}" -c set-permissions 2>/dev/null || echo "[WARN] No logs for set-permissions" + kubectl logs "${pod_name}" -n "${namespace}" -c init-model-repository 2>/dev/null || echo "[WARN] No logs for init-model-repository" + echo "[INFO] $(date) - Checking pod status..." + kubectl get pod "${pod_name}" -n "${namespace}" -o yaml | grep -A10 "status:" || echo "[WARN] Unable to get pod status" + fi + echo "---------------------------------------------" + sleep 10 + done + } + log & + log_pid=$! + trap "kill ${log_pid}" EXIT + set -x + kubectl wait --for=condition=available --timeout=600s deployment/${app_name}-triton -n ${namespace} + exit_code=$? + if [[ $exit_code -ne 0 ]]; then + echo "[ERROR] Deployment ${app_name}-triton failed to become available. Check pod events and initContainer logs above." + exit $exit_code + fi + kubectl get deployment ${app_name}-triton -n ${namespace} -o wide + kubectl describe deployment ${app_name}-triton -n ${namespace} + exit ${exit_code} + - name: Wait for Pod to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + echo "Waiting for pod to be ready..." + kubectl wait --for=condition=Ready pod -l app=${app_name}-triton -n ${namespace} --timeout=600s + pod=$(kubectl get pods -n ${namespace} -l app=${app_name}-triton --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") + echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS + touch pod.running + - name: Stream Triton Logs + early-cancel: any-job-failed + run: | + kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }}-triton -n ${{ inputs.k8s.namespace }} & + triton_stream_pid=$? + echo ${triton_stream_pid} > triton_stream.pid + - name: Stream WebUI Logs + early-cancel: any-job-failed + run: | + kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }}-ui -n ${{ inputs.k8s.namespace }} & + webui_stream_pid=$? + echo ${webui_stream_pid} > webui_stream.pid + create_k8s_session: + needs: + - apply_k8s_deployment + steps: + - name: Wait until the Kubernetes deployment reaches its final stage + early-cancel: any-job-failed + run: | + while true; do + if [ -f "app.deleted" ]; then + echo "File app.deleted was detected. Exiting..." + exit 0 + elif [ -f "pod.running" ]; then + echo "Pod is ready" + break + fi + sleep 2 + done + - name: Get Service Name + early-cancel: any-job-failed + run: | + source OUTPUTS + echo "service_name=${app_name}-ui" | tee -a $OUTPUTS + - name: Expose port + early-cancel: any-job-failed + uses: parallelworks/update-session + with: + remotePort: '7860' + name: ${{ sessions.session }} + targetInfo: + name: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + resourceType: services + resourceName: ${{ needs.create_k8s_session.outputs.service_name }} + keep_alive: + needs: + - create_k8s_session + steps: + - name: Keep Session Running + early-cancel: any-job-failed + run: tail -f /dev/null + cleanup: | + echo "Cleaning up resources for keep_alive job..." 
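+          # OUTPUTS carries app_name from the deployment job; source it so the deletes target the right resources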
+ source OUTPUTS + kubectl delete deployment ${app_name}-triton -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete service ${app_name}-triton -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete deployment ${app_name}-ui -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete service ${app_name}-ui -n ${{ inputs.k8s.namespace }} --ignore-not-found +'on': + execute: + inputs: + k8s: + type: group + label: Kubernetes Settings + items: + cluster: + label: Kubernetes cluster + type: kubernetes-clusters + namespace: + label: Namespace + type: kubernetes-namespaces + clusterName: ${{ inputs.k8s.cluster }} + volumes: + type: group + label: Triton Volumes + collapsed: true + tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. + items: + pvc: + label: Persistent Volume Claim + type: dropdown + default: Existing + options: + - value: Existing + label: Select Existing PVC + - value: New + label: Create New PVC + pvc_mount_path: + label: Mount Path + type: string + default: /models + pvc_existing: + label: Select PVC Name + type: kubernetes-pvc + clusterName: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + hidden: ${{ inputs.k8s.volumes.pvc !== Existing }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + pvc_storage_size: + label: Enter PVC Size + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + default: 100Gi + pvc_storage_class: + label: Enter PVC Storage Class + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: true + tooltip: Leave blank to use the default storage class configured in the cluster. + pvc_persist: + label: Persist PVC After Completion + type: boolean + default: false + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. + pvc_name: + label: Enter PVC Name + type: string + hidden: ${{ inputs.k8s.volumes.pvc_persist === false || inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + triton_resources: + type: group + label: Triton Resources + collapsed: true + items: + requests: + type: group + label: Requests + items: + memory: + label: Memory + type: string + default: 8Gi + tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 8Gi). + cpu: + label: CPU + type: string + default: '4' + tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 4, 100m). + limits: + type: group + label: Limits + items: + memory: + label: Memory + type: string + default: 16Gi + tooltip: Set the maximum memory the pod can use (e.g., 8Gi, 16Gi). + cpu: + label: CPU + type: string + default: '8' + tooltip: Set the maximum CPU the pod can use (e.g., 4, 8, 500m). + select_gpu: + label: Select GPU Device + type: dropdown + tooltip: Choose the type of GPU device for the deployment, if needed. + default: nvidia.com/gpu + options: + - value: nvidia.com/gpu + label: Nvidia GPU + - value: amd.com/gpu + label: AMD GPU + - value: Custom + label: Custom GPU Resource Key + gpu_resource_key: + label: Custom GPU Resource Key + type: string + hidden: ${{ inputs.k8s.triton_resources.limits.select_gpu !== Custom }} + ignore: ${{ .hidden }} + tooltip: Specify a custom GPU resource key for Kubernetes. 
+ number_of_gpus: + label: Number of GPUs + type: number + step: 1 + default: 1 + min: 1 + tooltip: Specify the number of GPUs to allocate for the deployment. + webui_resources: + type: group + label: WebUI Resources + collapsed: true + items: + requests: + type: group + label: Requests + items: + memory: + label: Memory + type: string + default: 2Gi + tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 2Gi). + cpu: + label: CPU + type: string + default: '2' + tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 2, 100m). + limits: + type: group + label: Limits + items: + memory: + label: Memory + type: string + default: 4Gi + tooltip: Set the maximum memory the pod can use (e.g., 2Gi, 4Gi). + cpu: + label: CPU + type: string + default: '4' + tooltip: Set the maximum CPU the pod can use (e.g., 2, 4, 500m). + triton_k8s: + type: group + label: Triton Settings + collapsed: true + items: + model: + label: Model Name + type: string + default: meta-llama/Meta-Llama-3.1-8B-Instruct + tooltip: Specify the Hugging Face model to use with vLLM (e.g., meta-llama/Meta-Llama-3.1-8B-Instruct). + hf_token: + label: Hugging Face Token (hf_...) + type: password + optional: false + tooltip: Your Hugging Face API token for accessing private or gated models (e.g., Llama). + gpu_memory_utilization: + label: GPU Memory Utilization + type: number + default: 0.9 + min: 0.1 + max: 0.9 + tooltip: Specify the fraction of GPU memory to utilize (0.1 to 0.9). + max_num_seqs: + label: Max Number of Sequences + type: number + default: 4 + min: 1 + tooltip: Specify the maximum number of sequences in a batch.(concurrent requests processed per batch in vLLM) + max_model_len: + label: Max Model Length + type: number + default: 1024 + min: 512 + tooltip: Maximum model length for sequences.(Define max token length for input sequences, limiting context size and KV cache memory) + tensor_parallel_size: + label: Tensor Parallel Size + type: number + default: 1 + min: 1 + tooltip: Specify the number of GPUs for tensor parallelism. + webui_k8s: + type: group + label: WebUI Settings + collapsed: true + items: + image: + label: WebUI Image + type: string + default: python:3.10-slim + image_port: + label: WebUI Port + type: number + default: 7860 + ui_max_tokens: + label: Max Output Tokens (UI) + type: number + default: 150 + min: 1 + max: 4096 + tooltip: Maximum number of tokens the model should generate in a single response for the WebUI. + ui_temperature: + label: Temperature (UI) + type: number + default: 0.8 + min: 0 + max: 2 + step: 0.1 + tooltip: Controls the randomness of the output. Higher values (e.g., 0.8-1.0) make output more creative, lower values (e.g., 0.2) make it more deterministic. 
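+# Note: the vLLM options above (gpu_memory_utilization, max_num_seqs, max_model_len,
+# tensor_parallel_size) are engine arguments that Triton's vLLM backend typically reads
+# from a model.json placed in the model repository mounted at pvc_mount_path. Assuming
+# that standard layout (the namespace, pod, container, and model names below are
+# placeholders, not values produced by this workflow), the generated file can be
+# inspected on the running pod with:
+#
+#   kubectl exec -n <namespace> <triton-pod> -c <triton-container> -- \
+#     cat <pvc_mount_path>/<model_name>/1/model.json
+#
+# Verify the exact repository layout against the tritonserver vLLM image used here.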
From 88a7eff2b9ee86146ea5c503225da71d4dae9e67 Mon Sep 17 00:00:00 2001 From: nhuytan1 Date: Mon, 11 Aug 2025 12:05:47 -0500 Subject: [PATCH 3/5] add dry run --- workflow/yamls/k8s/ollama/general.yaml | 540 ++++++++++++++++++ workflow/yamls/k8s/sam2-model/general.yaml | 7 +- .../triton-vllm/general_removecheckgpu.yaml | 50 +- 3 files changed, 595 insertions(+), 2 deletions(-) create mode 100644 workflow/yamls/k8s/ollama/general.yaml diff --git a/workflow/yamls/k8s/ollama/general.yaml b/workflow/yamls/k8s/ollama/general.yaml new file mode 100644 index 00000000..ab0de1a4 --- /dev/null +++ b/workflow/yamls/k8s/ollama/general.yaml @@ -0,0 +1,540 @@ +permissions: + - '*' +sessions: + session: + useTLS: false + redirect: true + useCustomDomain: true +app: + target: inputs.k8s.cluster +jobs: + auth_k8s: + steps: + - name: Authenticate kubectl + early-cancel: any-job-failed + run: pw kube auth ${{ inputs.k8s.cluster }} + prepare_k8s_pvc: + needs: + - auth_k8s + steps: + - name: Creating New PVC YAML + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === New }} + run: | + if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then + pvc_name="${{ inputs.k8s.volumes.pvc_name }}" + else + job_number=$(pwd | rev | cut -d "/" -f1 | rev) + workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) + pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc + fi + pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} + if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then + default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') + if [ $? -ne 0 ]; then + echo "WARNING: Could not obtain default storageClass with command:" + echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" + echo " You might need to provide a storage class input" + elif [ -z "${default_class}" ]; then + echo "ERROR: No default storage class found. You must specify one explicitly." + exit 1 + fi + else + storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" + fi + echo "${pvc_name}" > pvc_name + cat < pvc.yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ${pvc_name} + namespace: ${{ inputs.k8s.namespace }} + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: ${{ inputs.k8s.volumes.pvc_storage_size }} + ${storageClassName} + EOF + cat pvc.yaml + - name: Dry Run PVC + early-cancel: any-job-failed + if: ${{ inputs.k8s.volumes.pvc === New }} + run: | + echo "Performing dry run..." 
+ kubectl apply -f pvc.yaml --dry-run=client + - name: Dummy + early-cancel: any-job-failed + run: echo Dummy + prepare_k8s_deployment: + needs: + - prepare_k8s_pvc + steps: + - name: Defining App Name + early-cancel: any-job-failed + run: | + job_number=$(pwd | rev | cut -d "/" -f1 | rev) + workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) + app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) + echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS + - name: Creating Deployment and Service YAML + early-cancel: any-job-failed + run: | + if [[ "${{ inputs.k8s.ollama_resources.limits.select_gpu }}" == "Custom" ]]; then + gpu_limits="${{ inputs.k8s.ollama_resources.limits.gpu_resource_key }}: ${{ inputs.k8s.ollama_resources.limits.number_of_gpus }}" + elif [[ "${{ inputs.k8s.ollama_resources.limits.select_gpu }}" != "None" ]]; then + gpu_limits="${{ inputs.k8s.ollama_resources.limits.select_gpu }}: ${{ inputs.k8s.ollama_resources.limits.number_of_gpus }}" + fi + # Attach RuntimeClass if it's available and using NVIDIA + if kubectl get runtimeclass nvidia &>/dev/null; then + echo "nvidia RuntimeClass is available" + runtimeClassName="runtimeClassName: nvidia" + fi + + if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then + pvc_name=${{ inputs.k8s.volumes.pvc_existing }} + else + pvc_name=$(cat pvc_name) + fi + + cat < app.yaml + --- + # Deployment for ollama + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} + template: + metadata: + labels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} + spec: + runtimeClassName: nvidia + initContainers: + - name: set-permissions + image: busybox + command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] + securityContext: + runAsUser: 0 + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + containers: + - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} + image: ${{ inputs.ollama_k8s.image }} + ports: + - containerPort: ${{ inputs.ollama_k8s.image_port }} + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "compute,utility" + resources: + requests: + memory: "${{ inputs.k8s.ollama_resources.requests.memory }}" + cpu: "${{ inputs.k8s.ollama_resources.requests.cpu }}" + limits: + memory: "${{ inputs.k8s.ollama_resources.limits.memory }}" + cpu: "${{ inputs.k8s.ollama_resources.limits.cpu }}" + ${gpu_limits} + volumeMounts: + - name: storage + mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} + volumes: + - name: storage + persistentVolumeClaim: + claimName: ${pvc_name} # Assumes PVC name is provided as an input + --- + # Service for ollama + apiVersion: v1 + kind: Service + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} + ports: + - protocol: TCP + port: ${{ inputs.ollama_k8s.image_port }} + targetPort: ${{ inputs.ollama_k8s.image_port }} + + --- + # Deployment for openwebui + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: 
+ app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui + template: + metadata: + labels: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui + spec: + containers: + - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui + image: ${{ inputs.openwebui_k8s.image }} + ports: + - containerPort: ${{ inputs.openwebui_k8s.image_port }} + env: + - name: OLLAMA_BASE_URL + value: "http://${{ needs.prepare_k8s_deployment.outputs.app_name }}.${{ inputs.k8s.namespace }}.svc.cluster.local.:${{ inputs.ollama_k8s.image_port }}" + - name: WEBUI_AUTH + value: "False" + resources: + requests: + memory: "${{ inputs.k8s.openwebui_resources.requests.memory }}" + cpu: "${{ inputs.k8s.openwebui_resources.requests.cpu }}" + limits: + memory: "${{ inputs.k8s.openwebui_resources.limits.memory }}" + cpu: "${{ inputs.k8s.openwebui_resources.limits.cpu }}" + + --- + # Service for openwebui + apiVersion: v1 + kind: Service + metadata: + name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui + ports: + - protocol: TCP + port: ${{ inputs.openwebui_k8s.image_port }} + targetPort: ${{ inputs.openwebui_k8s.image_port }} + EOF + - name: Dry Run Deployment + early-cancel: any-job-failed + run: | + echo "Performing dry run..." + kubectl apply -f app.yaml --dry-run=client + apply_k8s_deployment: + needs: + - prepare_k8s_deployment + steps: + - name: Load outputs + early-cancel: any-job-failed + run: cat OUTPUTS >> $OUTPUTS + - name: Apply PVC + if: ${{ inputs.k8s.volumes.pvc === New }} + run: kubectl apply -f pvc.yaml + cleanup: | + if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then + kubectl delete -f pvc.yaml + touch pvc.deleted + fi + - name: Apply Deployment and Service + run: kubectl apply -f app.yaml + cleanup: | + kubectl delete -f app.yaml + touch app.deleted + - name: Wait for Deployment to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + + log() { + while true; do + echo + echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." + kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" + + echo; echo "[INFO] $(date) - Pods status:" + kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" + + pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [[ -n "$pod_name" ]]; then + echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." + kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" + fi + + echo "---------------------------------------------" + sleep 10 + done + } + + log & + log_pid=$! + trap "kill ${log_pid}" EXIT + set -x + kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} + exit_code=$? + kubectl get deployment ${app_name} -n ${namespace} -o wide + kubectl describe deployment ${app_name} -n ${namespace} + exit ${exit_code} + - name: Wait for Pod to be Ready + early-cancel: any-job-failed + env: + app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} + namespace: ${{ inputs.k8s.namespace }} + run: | + echo "Waiting for pod to be ready..." 
+ kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s + pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") + echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS + touch pod.running + - name: Pull Ollama Models + early-cancel: any-job-failed + env: + pod_name: ${{ needs.apply_k8s_deployment.outputs.pod }} + namespace: ${{ inputs.k8s.namespace }} + run: | + set -x + kubectl -n $namespace exec $pod_name -- /bin/sh -c "ollama pull gpt-oss:20b " & + - name: Stream Ollama Logs + run: | + kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} & + ollama_stream_pid=$? + echo ${ollama_stream_pid} > ollama_stream.pid + cleanup: kill $(cat ollama_stream.pid) + - name: Stream OpenWebUI Logs + early-cancel: any-job-failed + run: | + kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }}-openwebui -n ${{ inputs.k8s.namespace }} + create_k8s_session: + needs: + - prepare_k8s_deployment + steps: + - name: Wait until the Kubernetes deployment reaches its final stage + early-cancel: any-job-failed + run: | + while true; do + if [ -f "app.deleted" ]; then + echo "File app.deleted was detected. Exiting..." + exit 0 + elif [ -f "pod.running" ]; then + echo "Pod is ready" + break + fi + sleep 2 + done + - name: Get Service Name + early-cancel: any-job-failed + run: | + source OUTPUTS + echo "service_name=${app_name}-lb" | tee -a $OUTPUTS + - name: Expose port + early-cancel: any-job-failed + uses: parallelworks/update-session + with: + remotePort: ${{ inputs.openwebui_k8s.image_port }} + name: ${{ sessions.session }} + targetInfo: + name: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + resourceType: services + resourceName: ${{ needs.create_k8s_session.outputs.service_name }} +'on': + execute: + inputs: + k8s: + type: group + label: Kubernetes Settings + items: + cluster: + label: Kubernetes cluster + type: kubernetes-clusters + namespace: + label: Namespace + type: kubernetes-namespaces + clusterName: ${{ inputs.k8s.cluster }} + volumes: + type: group + label: Ollama Volumes + collapsed: true + tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. + items: + pvc: + label: Persistent Volume Claim + type: dropdown + default: New + options: + - value: Existing + label: Select Existing PVC + - value: New + label: Create New PVC + pvc_mount_path: + label: Mount Path + type: string + default: /root/.ollama + pvc_existing: + label: Select PVC Name + type: kubernetes-pvc + clusterName: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + hidden: ${{ inputs.k8s.volumes.pvc !== Existing }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + pvc_storage_size: + label: Enter PVC Size + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + default: 20Gi + pvc_storage_class: + label: Enter PVC Storage Class + type: string + hidden: ${{ inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: true + tooltip: Leave blank to use the default storage class configured in the cluster. + pvc_persist: + label: Persist PVC After Completion + type: boolean + default: false + hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + tooltip: If true, the PVC will persist after the job is canceled or completed. 
If false, it will be deleted. + pvc_name: + label: Enter PVC Name + type: string + hidden: ${{ inputs.k8s.volumes.pvc_persist === false || inputs.k8s.volumes.pvc !== New }} + ignore: ${{ .hidden }} + optional: ${{ .hidden }} + ollama_resources: + type: group + label: Ollama Resources + collapsed: true + items: + requests: + type: group + label: Requests + items: + memory: + label: Memory + type: string + default: 2Gi + tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). + cpu: + label: CPU + type: string + default: '2' + tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). + limits: + type: group + label: Limits + items: + memory: + label: Memory + type: string + default: 4Gi + tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). + cpu: + label: CPU + type: string + default: '4' + tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). + select_gpu: + label: Select GPU Device + type: dropdown + tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. + default: nvidia.com/gpu + options: + - value: None + label: None + - value: nvidia.com/gpu + label: Nvidia GPU + - value: amd.com/gpu + label: AMD GPU + - value: cloud-tpus.google.com/v3 + label: Google TPU + - value: Custom + label: Custom GPU Resource Key + gpu_resource_key: + label: Custom GPU Resource Key + type: string + hidden: ${{ inputs.k8s.ollama_resources.limits.select_gpu !== Custom }} + ignore: ${{ .hidden }} + tooltip: | + Specify a custom GPU resource key for Kubernetes, such as: + - nvidia.com/gpu + - amd.com/gpu + - cloud-tpus.google.com/v3 + - nvidia.com/mig-1g.5gb + - nvidia.com/mig-2g.10gb + - nvidia.com/mig-3g.20gb + number_of_gpus: + label: Number of GPUs + type: number + step: 1 + default: 1 + min: 1 + tooltip: Specify the number of GPUs to allocate for the deployment. + hidden: ${{ inputs.k8s.ollama_resources.limits.select_gpu === None }} + ignore: ${{ .hidden }} + openwebui_resources: + type: group + label: OpenWebUI Resources + collapsed: true + items: + requests: + type: group + label: Requests + items: + memory: + label: Memory + type: string + default: 2Gi + tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). + cpu: + label: CPU + type: string + default: '2' + tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). + limits: + type: group + label: Limits + items: + memory: + label: Memory + type: string + default: 4Gi + tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). + cpu: + label: CPU + type: string + default: '4' + tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). 
+ ollama_k8s: + type: group + label: Ollama Settings + collapsed: true + items: + image: + label: Ollama Image + type: string + default: ollama/ollama:latest + image_port: + label: Ollama Port + type: number + default: 11434 + openwebui_k8s: + type: group + label: OpenWebUI Settings + collapsed: true + items: + image: + label: OpenWebUI Image + type: string + default: ghcr.io/open-webui/open-webui:main + image_port: + label: OpenWebUI Port + type: number + default: 8080 diff --git a/workflow/yamls/k8s/sam2-model/general.yaml b/workflow/yamls/k8s/sam2-model/general.yaml index b2e05b1b..98ad7846 100644 --- a/workflow/yamls/k8s/sam2-model/general.yaml +++ b/workflow/yamls/k8s/sam2-model/general.yaml @@ -166,7 +166,12 @@ jobs: targetPort: 3000 type: ClusterIP EOF - apply_sam2: + - name: Dry Run Deployment + early-cancel: any-job-failed + run: | + echo "Performing dry run..." + kubectl apply -f app.yaml --dry-run=client + apply_sam2_deployment: needs: - prepare_sam2 steps: diff --git a/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml b/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml index 1faa57ad..68509fa3 100644 --- a/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml +++ b/workflow/yamls/k8s/triton-vllm/general_removecheckgpu.yaml @@ -143,7 +143,12 @@ jobs: containers: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-triton image: nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3 - args: ["tritonserver", "--model-store=${{ inputs.k8s.volumes.pvc_mount_path }}", "--model-control-mode=POLL"] + args: + - /bin/bash + - -c + - | + pip install --upgrade "vllm[gptoss]" --pre --extra-index-url https://wheels.vllm.ai/gpt-oss/ && \ + tritonserver --model-store=$MODEL_STORE --model-control-mode=POLL ports: - containerPort: 8000 name: http @@ -158,6 +163,8 @@ jobs: value: "compute,utility" - name: HF_TOKEN value: "${{ inputs.triton_k8s.hf_token }}" + - name: MODEL_STORE + value: "${{ inputs.k8s.volumes.pvc_mount_path }}" resources: requests: memory: "${{ inputs.k8s.triton_resources.requests.memory }}" @@ -306,12 +313,53 @@ jobs: - prepare_k8s_deployment steps: - name: Load outputs + early-cancel: any-job-failed run: cat OUTPUTS >> $OUTPUTS - name: Apply PVC + early-cancel: any-job-failed if: ${{ inputs.k8s.volumes.pvc === New }} run: kubectl apply -f pvc.yaml + cleanup: | + set -x + if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then + MAX_ATTEMPTS=5 + ATTEMPT=1 + while true; do + if kubectl delete -f pvc.yaml; then + echo "PVC deleted successfully" + touch pvc.deleted + break + elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then + echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" + exit 1 + else + echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." + sleep 5 + ((ATTEMPT++)) + fi + done + fi - name: Apply Deployment and Service run: kubectl apply -f app.yaml + early-cancel: any-job-failed + cleanup: | + set -x + MAX_ATTEMPTS=5 + ATTEMPT=1 + while true; do + if kubectl delete -f app.yaml; then + echo "Resources deleted successfully" + touch app.deleted + break + elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then + echo "Failed to delete resources after $MAX_ATTEMPTS attempts" + exit 1 + else + echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." 
+ sleep 5 + ((ATTEMPT++)) + fi + done - name: Wait for Deployment to be Ready early-cancel: any-job-failed env: From cab682296584cda2f24011d6add558cbfc296150 Mon Sep 17 00:00:00 2001 From: nhuytan1 Date: Mon, 11 Aug 2025 14:53:39 -0500 Subject: [PATCH 4/5] remove early-cancel when apply pvc --- workflow/yamls/k8s/sam2-model/general.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflow/yamls/k8s/sam2-model/general.yaml b/workflow/yamls/k8s/sam2-model/general.yaml index 98ad7846..60d3b741 100644 --- a/workflow/yamls/k8s/sam2-model/general.yaml +++ b/workflow/yamls/k8s/sam2-model/general.yaml @@ -102,7 +102,7 @@ jobs: labels: app: ${{ inputs.app.name }} spec: - ${runtimeClassName} + runtimeClassName: nvidia initContainers: - name: set-permissions image: busybox @@ -176,7 +176,6 @@ jobs: - prepare_sam2 steps: - name: Apply PVC - early-cancel: any-job-failed if: ${{ inputs.k8s.volumes.pvc === 'New' }} run: kubectl apply -f pvc.yaml cleanup: | From 94cf3ffe4327c0d71c65ee81736ee239270fd61b Mon Sep 17 00:00:00 2001 From: nhuytan1 Date: Wed, 13 Aug 2025 22:44:27 -0500 Subject: [PATCH 5/5] sam2: update general.yaml; add ollama general.md and general_v0.1.yaml --- workflow/yamls/k8s/ollama/general.md | 59 ++++ workflow/yamls/k8s/ollama/general_v0.1.yaml | 350 ++++++++++++++++++++ workflow/yamls/k8s/sam2-model/general.yaml | 8 +- 3 files changed, 415 insertions(+), 2 deletions(-) create mode 100644 workflow/yamls/k8s/ollama/general.md create mode 100644 workflow/yamls/k8s/ollama/general_v0.1.yaml diff --git a/workflow/yamls/k8s/ollama/general.md b/workflow/yamls/k8s/ollama/general.md new file mode 100644 index 00000000..c643cf56 --- /dev/null +++ b/workflow/yamls/k8s/ollama/general.md @@ -0,0 +1,59 @@ +# 🧠 Secure LLM Serving using Ollama on Kubernetes + +This workflow launches a **GPU-enabled Ollama server** on a Kubernetes cluster with a secure API gateway. Users can select a model (e.g., `mistral`, `qwen3`, `deepseek`), which will be pulled and served behind a public **Cloudflare Tunnel** with **API key protection**. The resulting endpoint is **OpenAI-compatible** and ready for use in tools like **LangChain**, **OpenWebUI**, or **Postman**. + +--- + +## 🚀 Quick Start + +- **Select a Kubernetes Cluster:** Choose your target K8s cluster. +- **Set Namespace:** Specify the namespace to deploy in (e.g., `default`, `summer2025interns`). +- **Choose Model:** Select a model like `mistral`, `qwen3:4b`, or `deepseek-r1:1.5b`. + > 🔍 **Browse available models** at [https://ollama.com/models](https://ollama.com/models) +- **Define Resources:** Pick a GPU-enabled preset or set custom CPU/RAM/GPU limits. +- **Run the Workflow:** Deploy and wait for the endpoint to be available. + +--- + +## 🔐 Accessing the API + +Once deployed, the system will: + +- ✅ Generate a **secure API key** +- ✅ Start an **OpenAI-compatible proxy** +- ✅ Launch a **Cloudflare Tunnel** to expose the endpoint publicly + +You will receive these credentials in the logs: + +- **API Key** +- **Public Endpoint (URL)** +- **Model Name** + +Use them to authenticate with any OpenAI-compatible frontend. + +--- + +## 🧩 AI Integration in Parallel Works + +After deployment, the workflow **automatically registers the new model endpoint** as an AI Provider in Parallel Works. This enables: + +- Seamless use in **AI Chat** workflows +- Easy model selection in downstream **pipelines** +- Reuse across teams and namespaces with API key control + +No manual setup needed — everything is handled during execution. 
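+
+---
+
+## 🔎 Retrieving Credentials Later
+
+The API key, tunnel URL, and model name are printed in the workflow logs. If you need to look them up again while the deployment is still running, they can also be read directly from the pod. The sketch below assumes the default deployment name `ollama-ui` and the `default` namespace; adjust both to match your inputs:
+
+```bash
+ns=default
+pod=$(kubectl get pods -n "$ns" -l app=ollama-ui -o jsonpath='{.items[0].metadata.name}')
+
+# API key written by the ollama container at startup
+kubectl exec -n "$ns" "$pod" -c ollama -- cat /tmp/api_key.txt
+
+# Public endpoint captured from the Cloudflare tunnel log
+kubectl exec -n "$ns" "$pod" -c ollama-proxy -- \
+  grep -o 'https://[a-zA-Z0-9.-]*\.trycloudflare\.com' /tmp/cloudflared-url.txt | head -n1
+```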
+ +--- + +## 📡 Integration Example + +Example `curl` request: + +```bash +curl https://.trycloudflare.com/v1/chat/completions \ + -H "Authorization: Bearer " \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mistral", + "messages": [{"role": "user", "content": "Hello!"}] + }' diff --git a/workflow/yamls/k8s/ollama/general_v0.1.yaml b/workflow/yamls/k8s/ollama/general_v0.1.yaml new file mode 100644 index 00000000..8f83711c --- /dev/null +++ b/workflow/yamls/k8s/ollama/general_v0.1.yaml @@ -0,0 +1,350 @@ +permissions: + - '*' +sessions: + session: + useTLS: false + redirect: true + useCustomDomain: true +app: + target: inputs.k8s.cluster +jobs: + get_pw_api_key: + steps: + - name: Authenticate with K8s + run: pw kube auth ${{ inputs.k8s.cluster }} + - name: Export PW_API_KEY from Parallel Works environment + run: | + source /etc/profile.d/parallelworks-env.sh + echo "PW_API_KEY found: ${PW_API_KEY:0:6}********" + echo "pw_api_key=$PW_API_KEY" >> OUTPUTS + + pw_api_key_b64=$(echo -n "$PW_API_KEY" | base64) + echo "pw_api_key_b64=$pw_api_key_b64" >> OUTPUTS + + + team=$(curl -s -H "Authorization: Basic $pw_api_key_b64" https://activate.parallel.works/api/v2/teams | grep -o '"id": *"[^"]*"' | head -n1 | cut -d':' -f2 | tr -d ' "') + echo "team=$team" >> OUTPUTS + + org=$(curl -s -H "Authorization: Basic $pw_api_key_b64" https://activate.parallel.works/api/auth/whoami/organization | tr -d '"') + echo "org=$org" >> OUTPUTS + + np=$(curl -s -H "Authorization: Basic $pw_api_key_b64" https://activate.parallel.works/api/auth/whoami | sed 's/^user://') + echo "np=$np" >> OUTPUTS + create_pvc: + needs: + - get_pw_api_key + steps: + - name: Authenticate Kubernetes + run: pw kube auth ${{ inputs.k8s.cluster }} + - name: Create PVC + run: | + cat < test-pvc.yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: test-pvc + namespace: ${{ inputs.k8s.namespace }} + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 30Gi + EOF + kubectl apply -f test-pvc.yaml + deploy_ollama_openwebui: + needs: + - create_pvc + steps: + - name: Authenticate Kubernetes + run: pw kube auth ${{ inputs.k8s.cluster }} + - name: Generate API Key and Deployment YAML + run: | + API_KEY=$(openssl rand -hex 16) + echo "Generated API_KEY=$API_KEY" + + cat < ollama-openwebui-deployment.yaml + apiVersion: apps/v1 + kind: Deployment + metadata: + name: ${{ inputs.app.name }} + namespace: ${{ inputs.k8s.namespace }} + spec: + replicas: 1 + selector: + matchLabels: + app: ${{ inputs.app.name }} + template: + metadata: + labels: + app: ${{ inputs.app.name }} + spec: + runtimeClassName: nvidia + volumes: + - name: shared-pvc + persistentVolumeClaim: + claimName: ${{ inputs.pvc.name }} + - name: shared-tmp + emptyDir: {} + containers: + - name: ollama + image: ollama/ollama + ports: + - containerPort: 11434 + resources: + limits: + nvidia.com/gpu: ${{ inputs.resources.gpu_count }} + cpu: "2" + memory: "4Gi" + volumeMounts: + - mountPath: /root/.ollama + name: shared-pvc + - mountPath: /tmp + name: shared-tmp + command: ["sh", "-c"] + args: + - | + set -ex + apt-get update && apt-get install -y curl + echo "$API_KEY" > /tmp/api_key.txt + ollama serve & + + until curl -s http://localhost:11434/api/tags > /dev/null; do + echo "Waiting for Ollama..." 
+ sleep 2 + done + + echo "Pulling model: ${{ inputs.app.model }}" + ollama pull ${{ inputs.app.model }} + + wait + + - name: ollama-proxy + image: nhuytan/ollama-apikey-proxy:latest + ports: + - containerPort: 8000 + env: + - name: OLLAMA_API_KEY + value: "$API_KEY" + - name: OLLAMA_BASE_URL + value: "http://localhost:11434" + volumeMounts: + - mountPath: /tmp + name: shared-tmp + command: ["sh", "-c"] + args: + - | + apt-get update && apt-get install -y curl + + curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -o /usr/local/bin/cloudflared + chmod +x /usr/local/bin/cloudflared + + echo "Starting API proxy..." + uvicorn proxy:app --host 0.0.0.0 --port 8000 & + + echo "Starting tunnel..." + cloudflared tunnel --url http://localhost:8000 --no-autoupdate > /tmp/cloudflared-url.txt 2>&1 & + + tail -f /dev/null + + - name: openwebui + image: ghcr.io/open-webui/open-webui:main + env: + - name: OLLAMA_BASE_URL + value: "http://localhost:11434" + - name: USE_OLLAMA_DOCKER + value: "true" + ports: + - containerPort: 8080 + resources: + limits: + cpu: "1" + memory: "2Gi" + volumeMounts: + - mountPath: /app/data + name: shared-pvc + lifecycle: + postStart: + exec: + command: + - sh + - -c + - | + until curl -s http://localhost:11434/api/tags > /dev/null; do + echo "Still waiting for Ollama..." + sleep 2 + done + EOF + - name: Apply Deployment + run: kubectl apply -f ollama-openwebui-deployment.yaml + - name: Create Service + run: | + cat < ollama-openwebui-service.yaml + apiVersion: v1 + kind: Service + metadata: + name: ${{ inputs.app.name }} + namespace: ${{ inputs.k8s.namespace }} + spec: + selector: + app: ${{ inputs.app.name }} + ports: + - name: openwebui + protocol: TCP + port: 8080 + targetPort: 8080 + - name: ollama + protocol: TCP + port: 11434 + targetPort: 11434 + - name: proxy + protocol: TCP + port: 8000 + targetPort: 8000 + type: LoadBalancer + EOF + kubectl apply -f ollama-openwebui-service.yaml + - name: Wait for Pod to be Ready + run: | + echo "Waiting for pod to be ready..." + pod=$(kubectl get pods -n ${{ inputs.k8s.namespace }} -l app=${{ inputs.app.name }} -o jsonpath='{.items[0].metadata.name}') + kubectl wait --for=condition=Ready pod/$pod -n ${{ inputs.k8s.namespace }} --timeout=300s + create_session: + needs: + - deploy_ollama_openwebui + steps: + - name: Debug Service + Pod + run: | + echo "Checking pod + service for session connection..." + kubectl get svc ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} + kubectl get pods -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} + - name: Wait for Cloudflare Tunnel URL + run: | + pod=$(kubectl get pods -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} -o jsonpath="{.items[0].metadata.name}") + + echo "model=${{inputs.app.model}}" >> OUTPUTS + + api_key=$(kubectl exec -n ${{ inputs.k8s.namespace }} "$pod" -c ollama -- sh -c 'cat /tmp/api_key.txt' || echo "") + echo "api_key=$api_key" >> OUTPUTS + + for i in $(seq 1 15); do + url=$(kubectl exec -n ${{ inputs.k8s.namespace }} "$pod" -c ollama-proxy -- sh -c "grep -o 'https://[a-zA-Z0-9.-]*\\.trycloudflare\\.com' /tmp/cloudflared-url.txt | head -n1") + if [ -n "$url" ]; then + echo "tunnel_url=$url" >> OUTPUTS + break + fi + echo "Waiting for tunnel URL..." 
+ sleep 2 + done + - name: Display Information to put at AI + run: | + source OUTPUTS + echo "Model: $model" + echo "API Key: $api_key" + echo "Tunnel URL: $tunnel_url" + + echo "PW_API_KEY: $pw_api_key_b64" + echo "TEAM": $team + echo "Organization": $org + echo "Namespace": $np + - name: Expose Session + uses: parallelworks/update-session + with: + remotePort: '8080' + name: ${{ sessions.session }} + slug: '' + targetInfo: + name: ${{ inputs.k8s.cluster }} + namespace: ${{ inputs.k8s.namespace }} + resourceType: services + resourceName: ${{ inputs.app.name }} + keep_alive: + needs: + - create_session + steps: + - name: Display Information for AI - backup + run: | + source OUTPUTS + echo "Model: $model" + echo "API Key: $api_key" + echo "Tunnel URL: $tunnel_url" + + echo "PW_API_KEY: $pw_api_key_b64" + echo "TEAM": $team + echo "Organization": $org + echo "Namespace": $np + - name: Register AI Chat Provider + run: | + source OUTPUTS + + echo "Registering AI Chat Provider..." + safe_model=$(echo "$model" | tr -cd 'a-z0-9') + unique_suffix=$(date +%s | tail -c 6) + aichat_name="${safe_model}-${unique_suffix}" + echo "aichat_name=$aichat_name" + + curl -s -X POST "https://activate.parallel.works/api/organizations/$org/namespaces/$np/aichat-providers" \ + -H "Authorization: Basic $pw_api_key_b64" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "'"$aichat_name"'", + "description": "'"$model"'", + "tags": "", + "csp": "custom", + "team": "'"$team"'", + "variables": { + "endpoint": "'"$tunnel_url"'", + "apiKey": "'"$api_key"'", + "model": "'"$model"'" + } + }' + - name: Keep Session Running + run: tail -f /dev/null + cleanup: | + echo "Cleaning up resources..." + kubectl delete deployment ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --ignore-not-found + kubectl delete service ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} --ignore-not-found +'on': + execute: + inputs: + k8s: + type: group + label: Kubernetes Settings + items: + cluster: + label: Kubernetes Cluster + type: kubernetes-clusters + namespace: + label: Namespace + type: kubernetes-namespaces + clusterName: ${{ inputs.k8s.cluster }} + app: + type: group + label: App Settings + items: + name: + label: Deployment Name + type: string + default: ollama-ui + model: + label: Ollama Model to Pull + type: string + default: mistral + pvc: + type: group + label: Shared Volume + items: + name: + label: PVC Name + type: string + default: test-pvc + resources: + type: group + label: GPU Settings + items: + gpu_count: + label: Number of GPUs + type: number + default: 1 + min: 1 + step: 1 diff --git a/workflow/yamls/k8s/sam2-model/general.yaml b/workflow/yamls/k8s/sam2-model/general.yaml index 60d3b741..93b81d72 100644 --- a/workflow/yamls/k8s/sam2-model/general.yaml +++ b/workflow/yamls/k8s/sam2-model/general.yaml @@ -291,15 +291,18 @@ jobs: sleep 2 done - name: Get SLUG + early-cancel: any-job-failed run: | echo "slug=" >> $OUTPUTS - name: Debug Service + Pod + early-cancel: any-job-failed run: | echo "Checking pod + service for session connection..." kubectl get svc ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} kubectl get pods -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} kubectl describe svc ${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} - name: Wait for Port 3000 to be Ready + early-cancel: any-job-failed run: | echo "Polling HTTP response from localhost:3000 inside container..." 
pod=$(kubectl get pods -l app=${{ inputs.app.name }} -n ${{ inputs.k8s.namespace }} -o jsonpath="{.items[0].metadata.name}") @@ -312,6 +315,7 @@ jobs: sleep 2 done - name: Expose Session + early-cancel: any-job-failed uses: parallelworks/update-session with: remotePort: '3000' @@ -378,7 +382,7 @@ jobs: hidden: ${{ inputs.k8s.volumes.pvc !== 'New' }} ignore: ${{ .hidden }} optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. + tooltip: Leave blank to use the default storage class configured in the cluster. Use 'local-path' if default does not work pvc_persist: label: Persist PVC After Completion type: boolean @@ -399,7 +403,7 @@ jobs: label: App Settings items: name: - label: Deployment Name + label: App Name type: string default: sam2demo resources: