From 78c53a59ec80969127a82b25d9b54f9c79bf595d Mon Sep 17 00:00:00 2001 From: vinayK34 Date: Thu, 14 May 2026 13:22:33 +0530 Subject: [PATCH 1/5] Initial Commit for SGLang Helm Chart added to EI Signed-off-by: vinayK34 --- core/helm-charts/sglang/Chart.yaml | 10 + core/helm-charts/sglang/ci-values.yaml | 3 + .../helm-charts/sglang/templates/_helpers.tpl | 71 ++++++ .../sglang/templates/apisixroutes.yaml | 51 ++++ .../sglang/templates/configmap.yaml | 28 +++ .../sglang/templates/deployment.yaml | 235 ++++++++++++++++++ .../helm-charts/sglang/templates/ingress.yaml | 45 ++++ .../sglang/templates/ingress_eks.yaml | 50 ++++ core/helm-charts/sglang/templates/pvc.yaml | 15 ++ core/helm-charts/sglang/templates/route.yaml | 57 +++++ core/helm-charts/sglang/templates/secret.yaml | 17 ++ .../helm-charts/sglang/templates/service.yaml | 18 ++ .../sglang/templates/servicemonitor.yaml | 20 ++ core/helm-charts/sglang/values.yaml | 221 ++++++++++++++++ core/helm-charts/sglang/xeon-values.yaml | 163 ++++++++++++ 15 files changed, 1004 insertions(+) create mode 100644 core/helm-charts/sglang/Chart.yaml create mode 100644 core/helm-charts/sglang/ci-values.yaml create mode 100644 core/helm-charts/sglang/templates/_helpers.tpl create mode 100644 core/helm-charts/sglang/templates/apisixroutes.yaml create mode 100644 core/helm-charts/sglang/templates/configmap.yaml create mode 100644 core/helm-charts/sglang/templates/deployment.yaml create mode 100644 core/helm-charts/sglang/templates/ingress.yaml create mode 100644 core/helm-charts/sglang/templates/ingress_eks.yaml create mode 100644 core/helm-charts/sglang/templates/pvc.yaml create mode 100644 core/helm-charts/sglang/templates/route.yaml create mode 100644 core/helm-charts/sglang/templates/secret.yaml create mode 100644 core/helm-charts/sglang/templates/service.yaml create mode 100644 core/helm-charts/sglang/templates/servicemonitor.yaml create mode 100644 core/helm-charts/sglang/values.yaml create mode 100644 
core/helm-charts/sglang/xeon-values.yaml diff --git a/core/helm-charts/sglang/Chart.yaml b/core/helm-charts/sglang/Chart.yaml new file mode 100644 index 00000000..e9d95308 --- /dev/null +++ b/core/helm-charts/sglang/Chart.yaml @@ -0,0 +1,10 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v2 +name: sglang +description: The Helm chart for sglang Inference Server +type: application +version: 0-latest +# The sglang version +appVersion: "0.5" diff --git a/core/helm-charts/sglang/ci-values.yaml b/core/helm-charts/sglang/ci-values.yaml new file mode 100644 index 00000000..8211daac --- /dev/null +++ b/core/helm-charts/sglang/ci-values.yaml @@ -0,0 +1,3 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +values.yaml diff --git a/core/helm-charts/sglang/templates/_helpers.tpl b/core/helm-charts/sglang/templates/_helpers.tpl new file mode 100644 index 00000000..a457d1a6 --- /dev/null +++ b/core/helm-charts/sglang/templates/_helpers.tpl @@ -0,0 +1,71 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "sglang.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "sglang.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. 
+*/}} +{{- define "sglang.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Convert chart name to a string suitable as metric prefix +*/}} +{{- define "sglang.metricPrefix" -}} +{{- include "sglang.fullname" . | replace "-" "_" | regexFind "[a-zA-Z_:][a-zA-Z0-9_:]*" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "sglang.labels" -}} +helm.sh/chart: {{ include "sglang.chart" . }} +{{ include "sglang.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "sglang.selectorLabels" -}} +app.kubernetes.io/name: {{ include "sglang.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "sglang.serviceAccountName" -}} +{{- if .Values.global.sharedSAName }} +{{- .Values.global.sharedSAName }} +{{- else if .Values.serviceAccount.create }} +{{- default (include "sglang.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/core/helm-charts/sglang/templates/apisixroutes.yaml b/core/helm-charts/sglang/templates/apisixroutes.yaml new file mode 100644 index 00000000..bc43557f --- /dev/null +++ b/core/helm-charts/sglang/templates/apisixroutes.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +{{- if .Values.apisix.enabled }} +{{- $modelName := (default .Values.LLM_MODEL_ID .Values.SERVED_MODEL_NAME) | splitList "/" | last }} +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + name: {{ include "sglang.fullname" . }}-apisixroute + namespace: default +spec: + http: + - name: {{ include "sglang.fullname" . 
}}-apisixroute + match: + hosts: + - {{ .Values.route.host | default .Values.ingress.host }} + paths: + # - /{{ $modelName }}/* + + backends: + - serviceName: {{ include "sglang.fullname" . }}-service + servicePort: {{- if .Values.route.enabled }} + sglang + {{- else }} + 80 + {{- end }} + plugins: + - name: openid-connect + enable: true + secretRef: {{ include "sglang.fullname" . }}-secret + config: + discovery: {{ .Values.oidc.discovery }} + {{- if or (eq .Values.platform "openshift") (eq .Values.platform "eks") }} + use_jwks: {{ .Values.oidc.use_jwks }} + {{- else }} + introspection_endpoint: {{ .Values.oidc.introspection_endpoint }} + introspection_endpoint_auth_method: client_secret_basic + {{- end }} + scope: openid profile email + bearer_only: true + realm: master + - name: proxy-rewrite + enable: true + config: + regex_uri: + + - /$1 + # - ^/{{ $modelName }}/(.*) + # - /$1 + headers: + Content-Type: application/json +{{- end }} \ No newline at end of file diff --git a/core/helm-charts/sglang/templates/configmap.yaml b/core/helm-charts/sglang/templates/configmap.yaml new file mode 100644 index 00000000..72c90521 --- /dev/null +++ b/core/helm-charts/sglang/templates/configmap.yaml @@ -0,0 +1,28 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +{{- $modelName := (default .Values.LLM_MODEL_ID .Values.SERVED_MODEL_NAME) }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "sglang.fullname" . }}-config + labels: + {{- include "sglang.labels" . 
| nindent 4 }} +data: + {{- if .Values.global.HF_ENDPOINT }} + HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} + {{- end }} + http_proxy: {{ .Values.global.http_proxy | quote }} + https_proxy: {{ .Values.global.https_proxy | quote }} + no_proxy: {{ .Values.global.no_proxy | quote }} + TORCHINDUCTOR_CACHE_DIR: "/tmp" + NUMBA_CACHE_DIR: "/tmp" + HF_HOME: "/data" + OUTLINES_CACHE_DIR: "/tmp/.cache/outlines" + {{- if .Values.runtime }} + runtime: {{ .Values.runtime | quote}} + {{- end }} + + {{- $modelConfig := (index .Values.modelConfigs $modelName | default dict).configMapValues | default .Values.defaultModelConfigs.configMapValues }} + {{- range $key, $value := $modelConfig }} + {{ $key }}: {{ $value | quote }} + {{- end }} \ No newline at end of file diff --git a/core/helm-charts/sglang/templates/deployment.yaml b/core/helm-charts/sglang/templates/deployment.yaml new file mode 100644 index 00000000..1d5c96c7 --- /dev/null +++ b/core/helm-charts/sglang/templates/deployment.yaml @@ -0,0 +1,235 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +{{- $modelName := (default .Values.LLM_MODEL_ID .Values.SERVED_MODEL_NAME) }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "sglang.fullname" . }} + labels: + {{- include "sglang.labels" . | nindent 4 }} +spec: + {{- if ne (int .Values.replicaCount) 1 }} + # remove if replica count should not be reset on pod update (e.g. with HPA) + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "sglang.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + + {{- if and .Values.cpu_balloon_annotation (not .Values.finetune.enabled) }} + balloon.balloons.resource-policy.nri.io: {{ .Values.cpu_balloon_annotation | quote }} + {{- end }} + + labels: + {{- include "sglang.selectorLabels" . 
| nindent 8 }} + {{- if not .Values.finetune.enabled }} + name: sglang + {{- end }} + spec: + # {{- with .Values.imagePullSecrets }} + # imagePullSecrets: + # {{- toYaml . | nindent 8 }} + # {{- end }} + # serviceAccountName: {{ include "sglang.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + {{- if .Values.finetune.enabled }} + initContainers: + - name: fetch-finetuned-model + image: "{{ .Values.finetune.images.minioMc.repository }}:{{ .Values.finetune.images.minioMc.tag }}" + imagePullPolicy: {{ .Values.finetune.images.minioMc.pullPolicy }} + command: ["/bin/sh", "-c"] + args: + - | + set -e + echo "[finetune-init] Starting model download for fileId: {{ .Values.finetune.fileId }}" + + # Install minio client and tar (alpine doesn't have them by default) + apk add --no-cache curl tar + curl -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc + chmod +x /usr/local/bin/mc + + # Configure MinIO client + mc alias set minio http://${MINIO_HOST}:${MINIO_PORT} ${MINIO_ACCESS_KEY} ${MINIO_SECRET_KEY} + + # Download model tarball + MODEL_PATH="dataprep/{{ .Values.finetune.fileId }}/{{ .Values.finetune.fileId }}.tar.gz" + echo "[finetune-init] Downloading from bucket: ${MODEL_PATH}" + mc cp "minio/${MODEL_PATH}" /tmp/model.tar.gz + + # Validate tarball (check for malicious content) + echo "[finetune-init] Validating tarball contents..." 
+ tar -tzf /tmp/model.tar.gz > /tmp/tar-contents.txt + + # Block dangerous file types and path traversal (pattern must be a single, properly closed single-quoted string) + if grep -qE '\.(sh|py|exe|so|dylib)$|\.\.\/' /tmp/tar-contents.txt; then + echo "[finetune-init] ERROR: Tarball contains prohibited file types or path traversal" + cat /tmp/tar-contents.txt + exit 1 + fi + + # Extract to /data (model-volume mount) + echo "[finetune-init] Extracting model to /data/{{ .Values.finetune.fileId }}" + mkdir -p /data/{{ .Values.finetune.fileId }} + tar -xzf /tmp/model.tar.gz -C /data/{{ .Values.finetune.fileId }} + + # Verify model files exist (search in subdirectories) + CONFIG_FILE=$(find /data/{{ .Values.finetune.fileId }} -name "config.json" -type f | head -1) + if [ -z "$CONFIG_FILE" ]; then + echo "[finetune-init] ERROR: config.json not found after extraction" + ls -laR /data/{{ .Values.finetune.fileId }} + exit 1 + fi + + # Get the actual model directory + MODEL_DIR=$(dirname "$CONFIG_FILE") + echo "[finetune-init] Model successfully fetched and validated" + echo "[finetune-init] Model located at: $MODEL_DIR" + ls -lh "$MODEL_DIR" + + # Print the final model path that should be used + echo "[finetune-init] sglang should use model path: $MODEL_DIR" + env: + - name: MC_CONFIG_DIR + value: "/tmp/.mc" + envFrom: + - secretRef: + name: {{ .Values.finetune.minioCredentialsSecret }} + volumeMounts: + - name: model-volume + mountPath: /data + - name: tmp + mountPath: /tmp + securityContext: + runAsUser: 0 + readOnlyRootFilesystem: false + {{- end }} + containers: + - name: {{ .Chart.Name }} + envFrom: + - configMapRef: + name: {{ include "sglang.fullname" . }}-config + {{- if .Values.global.extraEnvConfig }} + - configMapRef: + name: {{ .Values.global.extraEnvConfig }} + optional: true + {{- end }} + - secretRef: + name: {{ include "sglang.fullname" . 
}}-secret + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + {{- if .Values.image.pullPolicy }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- end }} + command: + - python + - -m + - sglang.launch_server + args: + {{- $modelConfig := (index .Values.modelConfigs $modelName | default dict) }} + {{- $modelArgs := $modelConfig.extraCmdArgs | default .Values.defaultModelConfigs.extraCmdArgs }} + {{- range $modelArgs }} + - {{ . | quote }} + {{- end }} + + # Parallelism: root-level value wins, then the per-model entry, then defaultModelConfigs + - "--tensor-parallel-size" + - {{ .Values.tensor_parallel_size | default (index .Values.modelConfigs $modelName | default dict).tensor_parallel_size | default .Values.defaultModelConfigs.tensor_parallel_size | quote}} + - "--pipeline-parallel-size" + - {{ .Values.pipeline_parallel_size | default (index .Values.modelConfigs $modelName | default dict).pipeline_parallel_size | default .Values.defaultModelConfigs.pipeline_parallel_size | quote}} + + {{- if .Values.finetune.enabled }} + - "--trust-remote-code" + - "--tokenizer-mode" + - "slow" + {{- end }} + - "--model" + - {{ .Values.LLM_MODEL_ID | quote }} + - --served-model-name + - {{ $modelName | quote }} + - "--port" + - {{ .Values.port | quote }} + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: {{ .Values.port }} + protocol: TCP + {{- if .Values.livenessProbe }} + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + {{- end }} + {{- if .Values.readinessProbe }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + {{- end }} + {{- if .Values.startupProbe }} + startupProbe: + {{- toYaml .Values.startupProbe | nindent 12 }} + {{- end }} + {{- if or .Values.cpu .Values.memory }} + resources: + + {{- if or .Values.cpu .Values.memory }} + 
requests: + {{- if .Values.cpu }} + cpu: {{ .Values.cpu | quote }} + {{- end }} + {{- if .Values.memory }} + memory: {{ .Values.memory | quote }} + {{- end }} + {{- end }} + + {{- end }} + + volumes: + - name: model-volume + {{- if .Values.pvc.enabled }} + persistentVolumeClaim: + claimName: {{ include "sglang.fullname" . }}-pvc + {{- else if .Values.global.modelUseHostPath }} + hostPath: + path: {{ .Values.global.modelUseHostPath }} + type: Directory + {{- else }} + emptyDir: {} + {{- end }} + - name: shm + emptyDir: + medium: Memory + sizeLimit: {{ .Values.shmSize }} + - name: tmp + emptyDir: {} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- if .Values.evenly_distributed }} + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + {{- include "sglang.selectorLabels" . | nindent 14 }} + {{- end }} + + # extra time to finish processing buffered requests on CPU before pod is forcibly terminated + terminationGracePeriodSeconds: 120 diff --git a/core/helm-charts/sglang/templates/ingress.yaml b/core/helm-charts/sglang/templates/ingress.yaml new file mode 100644 index 00000000..99560d5a --- /dev/null +++ b/core/helm-charts/sglang/templates/ingress.yaml @@ -0,0 +1,45 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# Please edit the object below. Lines beginning with a '#' will be ignored, +# and an empty file will abort the edit. If an error occurs while saving this file will be +# reopened with the relevant failures. 
+{{- if .Values.ingress.enabled}} +{{- $modelName := (default .Values.LLM_MODEL_ID .Values.SERVED_MODEL_NAME) | splitList "/" | last }} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + kubernetes.io/ingress.class: nginx + # nginx.ingress.kubernetes.io/rewrite-target: /{{ $modelName }}/$1 + nginx.ingress.kubernetes.io/rewrite-target: {{- if .Values.apisix.enabled }}{{- else }} /$1{{- end }} + name: {{ include "sglang.fullname" . }}-nginx-ingress + namespace: {{ if .Values.apisix.enabled }}auth-apisix{{ else }}{{ .Values.ingress.namespace }}{{ end }} +spec: + ingressClassName: nginx + rules: + - host: {{ .Values.ingress.host }} + http: + paths: + - backend: + service: + name: {{- if .Values.apisix.enabled }} + auth-apisix-gateway + {{- else }} + {{ include "sglang.fullname" . }}-service + {{- end }} + port: + number: 80 + # path: /{{ $modelName }}/(.*) + path: {{- if and .Values.apisix.enabled .Values.accelDevice }} + /{{ $modelName }}/(.*) + {{- else if and .Values.apisix.enabled (not .Values.accelDevice) }} + /{{ $modelName }}-sglangcpu/(.*) + {{- else if not .Values.apisix.enabled }} + /{{ $modelName }}/(.*) + {{- end }} + pathType: ImplementationSpecific + tls: + - hosts: + - {{ .Values.ingress.host }} + secretName: {{ .Values.ingress.secretname }} +{{- end }} \ No newline at end of file diff --git a/core/helm-charts/sglang/templates/ingress_eks.yaml b/core/helm-charts/sglang/templates/ingress_eks.yaml new file mode 100644 index 00000000..6f50dcc8 --- /dev/null +++ b/core/helm-charts/sglang/templates/ingress_eks.yaml @@ -0,0 +1,50 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +{{- if .Values.ingress.enabled }} +{{- $modelName := (default .Values.LLM_MODEL_ID .Values.SERVED_MODEL_NAME) | splitList "/" | last }} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "sglang.fullname" . 
}}-ingress + namespace: {{- if .Values.apisix.enabled }} + auth-apisix + {{- else }} + {{ .Values.ingress.namespace }} + {{- end }} + annotations: + alb.ingress.kubernetes.io/actions.ssl-redirect: '{"Type": "redirect", "RedirectConfig": { "Protocol": "HTTPS", "Port": "443", "StatusCode": "HTTP_301"}}' + {{- if .Values.aws_certificate_arn }} + alb.ingress.kubernetes.io/certificate-arn: '{{ .Values.aws_certificate_arn }}' + {{- end }} + alb.ingress.kubernetes.io/group.name: ei-eks + alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}, {"HTTPS": 443}]' + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip +spec: + ingressClassName: alb + rules: + - host: {{ .Values.ingress.host }} + http: + paths: + - backend: + service: + name: {{- if .Values.apisix.enabled }} + auth-apisix-gateway + {{- else }} + {{ include "sglang.fullname" . }}-service + {{- end }} + port: + number: 80 + path: {{- if and .Values.apisix.enabled .Values.accelDevice }} + /{{ $modelName }}/* + {{- else if and .Values.apisix.enabled (not .Values.accelDevice) }} + /{{ $modelName }}-sglangcpu/* + {{- else if not .Values.apisix.enabled }} + /{{ $modelName }}/* + {{- end }} + pathType: ImplementationSpecific + tls: + - hosts: + - {{ .Values.ingress.host }} + secretName: {{ .Values.ingress.secretname }} +{{- end }} \ No newline at end of file diff --git a/core/helm-charts/sglang/templates/pvc.yaml b/core/helm-charts/sglang/templates/pvc.yaml new file mode 100644 index 00000000..28c61ab8 --- /dev/null +++ b/core/helm-charts/sglang/templates/pvc.yaml @@ -0,0 +1,15 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +{{- if .Values.pvc.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "sglang.fullname" . 
}}-pvc + namespace: default +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 250Gi # Adjust this according to model storage size +{{- end }} \ No newline at end of file diff --git a/core/helm-charts/sglang/templates/route.yaml b/core/helm-charts/sglang/templates/route.yaml new file mode 100644 index 00000000..bfc8b1a2 --- /dev/null +++ b/core/helm-charts/sglang/templates/route.yaml @@ -0,0 +1,57 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +{{- if .Values.route.enabled }} +{{- $modelName := (default .Values.LLM_MODEL_ID .Values.SERVED_MODEL_NAME) | splitList "/" | last }} +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + {{- if .Values.apisix.enabled }} + annotations: + router.openshift.io/allow-hostname-claim: "true" + {{- end }} + labels: + {{- include "sglang.labels" . | nindent 4 }} + name: {{ include "sglang.fullname" . }}-route + namespace: {{- if .Values.apisix.enabled }} + auth-apisix + {{- else }} + default + {{- end }} +spec: + host: {{ .Values.route.host }} + {{- if .Values.apisix.enabled }} + path: + {{- end }} + to: + kind: Service + name: {{- if .Values.apisix.enabled }} + auth-apisix-gateway + {{- else }} + {{ include "sglang.fullname" . 
}}-service + {{- end }} + weight: 100 + port: + targetPort: {{ if .Values.apisix.enabled }}apisix-gateway{{ else }}sglang{{ end }} + {{- if .Values.route.tls }} + tls: + {{- if .Values.route.tls.termination }} + termination: {{ .Values.route.tls.termination }} + {{- end }} + {{- if .Values.route.tls.insecureEdgeTerminationPolicy }} + insecureEdgeTerminationPolicy: {{ .Values.route.tls.insecureEdgeTerminationPolicy }} + {{- end }} + {{- if .Values.route.tls.certificate }} + certificate: {{ .Values.route.tls.certificate }} + {{- end }} + {{- if .Values.route.tls.key }} + key: {{ .Values.route.tls.key }} + {{- end }} + {{- if .Values.route.tls.caCertificate }} + caCertificate: {{ .Values.route.tls.caCertificate }} + {{- end }} + {{- if .Values.route.tls.destinationCACertificate }} + destinationCACertificate: {{ .Values.route.tls.destinationCACertificate }} + {{- end }} + {{- end }} + wildcardPolicy: {{ .Values.route.wildcardPolicy | default "None" }} +{{- end }} diff --git a/core/helm-charts/sglang/templates/secret.yaml b/core/helm-charts/sglang/templates/secret.yaml new file mode 100644 index 00000000..c12d7d9c --- /dev/null +++ b/core/helm-charts/sglang/templates/secret.yaml @@ -0,0 +1,17 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "sglang.fullname" . }}-secret + labels: + {{- include "sglang.labels" . 
| nindent 4 }} +type: Opaque +data: + {{- if .Values.global.HUGGINGFACEHUB_API_TOKEN }} + HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | b64enc | quote }} + {{- end }} + {{- if .Values.apisix.enabled }} + client_id: {{ .Values.oidc.client_id | b64enc | quote }} + client_secret: {{ .Values.oidc.client_secret | b64enc | quote }} + {{- end }} \ No newline at end of file diff --git a/core/helm-charts/sglang/templates/service.yaml b/core/helm-charts/sglang/templates/service.yaml new file mode 100644 index 00000000..034ddbe8 --- /dev/null +++ b/core/helm-charts/sglang/templates/service.yaml @@ -0,0 +1,18 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "sglang.fullname" . }}-service + labels: + {{- include "sglang.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: 80 + targetPort: {{ .Values.port }} + protocol: TCP + name: sglang + selector: + {{- include "sglang.selectorLabels" . | nindent 4 }} \ No newline at end of file diff --git a/core/helm-charts/sglang/templates/servicemonitor.yaml b/core/helm-charts/sglang/templates/servicemonitor.yaml new file mode 100644 index 00000000..2f2b5b68 --- /dev/null +++ b/core/helm-charts/sglang/templates/servicemonitor.yaml @@ -0,0 +1,19 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +{{- if .Values.svcmonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "sglang.fullname" . }}-service + labels: + release: {{ .Values.global.prometheusRelease }} +spec: + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + selector: + matchLabels: + app.kubernetes.io/instance: {{ .Release.Name 
}} + endpoints: + - port: "sglang" +{{- end }} diff --git a/core/helm-charts/sglang/values.yaml b/core/helm-charts/sglang/values.yaml new file mode 100644 index 00000000..39ebaaad --- /dev/null +++ b/core/helm-charts/sglang/values.yaml @@ -0,0 +1,221 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Default values for sglang. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 + +# Enabling HPA will: +# - Ignore above replica count, as it will be controlled by HPA +# - Add example HPA scaling rules with custom metrics thresholds +# - Require custom metrics ConfigMap available in the main application chart +autoscaling: + maxReplicas: 4 + enabled: false + +# empty for CPU (longer latencies are tolerated before HPA scaling unaccelerated service) +accelDevice: "" + +port: 2080 +shmSize: 1Gi +image: + repository: lmsysorg/sglang + # Uncomment the following line to set desired image pull policy if needed, as one of Always, IfNotPresent, Never. + # pullPolicy: "" + # Overrides the image tag whose default is the chart appVersion. + tag: "v0.5.11-xeon" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + # Specifies whether a service account should be created + create: false + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} + +# CPU Balloon configuration for NRI resource policy +#cpu_balloon_annotation: "" + +podSecurityContext: + fsGroup: 1001 + runAsUser: 1001 + +securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + add: + - SYS_NICE + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault +# readOnlyRootFilesystem: true +# allowPrivilegeEscalation: false +# runAsNonRoot: true +# runAsUser: 1000 +# capabilities: +# drop: +# - ALL +# seccompProfile: +# type: RuntimeDefault + +service: + type: ClusterIP + +pvc: + enabled: true + +apisix: + enabled: false + +ingress: + enabled: false # Set to true to enable the Ingress resource + host: "" + namespace: default + secretname: "" + +route: + enabled: false + host: "" + tls: + termination: "edge" + insecureEdgeTerminationPolicy: "Redirect" + +# Platform type: openshift or vanilla kubernetes +platform: vanilla + +oidc: + realm: master + client_id: "" + client_secret: "" + discovery: http://keycloak.default.svc.cluster.local/realms/master/.well-known/openid-configuration + introspection_endpoint: http://keycloak.default.svc.cluster.local/realms/master/protocol/openid-connect/token/introspect + use_jwks: true + + +extraCmdArgs: [] + +# livenessProbe: +# httpGet: +# path: /health +# port: http +# initialDelaySeconds: 4600 # Start liveness checks after 1 hour (safe buffer) +# periodSeconds: 60 +# failureThreshold: 60 # Allows up to 30 minutes of failures before restart + +# readinessProbe: +# httpGet: +# path: /health +# port: http +# initialDelaySeconds: 4600 # Only mark ready after 1 hour +# periodSeconds: 60 + +# startupProbe: +# httpGet: +# path: /health +# port: http +# initialDelaySeconds: 3600 +# periodSeconds: 10 +# failureThreshold: 7200 # Allows up to 2 hours (10s * 7200 = 72000s) + +readinessProbe: + httpGet: + path: /health + port: http 
+ initialDelaySeconds: 120 + periodSeconds: 30 + failureThreshold: 98 +# startupProbe: +# httpGet: +# path: /health +# port: http +# initialDelaySeconds: 120 +# periodSeconds: 30 +# failureThreshold: 79 + +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: ei-inference-eligible + operator: In + values: ["true"] + +LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 +SERVED_MODEL_NAME: "" +# Environment variables for sglang (set in configmap): +# https://docs.sglang.ai/en/latest/getting_started/installation.html#environment-variables +sglang_CPU_KVCACHE_SPACE: "40" +sglang_RPC_TIMEOUT: "100000" +sglang_ALLOW_LONG_MAX_MODEL_LEN: "1" +sglang_ENGINE_ITERATION_TIMEOUT_S: "120" +sglang_CPU_NUM_OF_RESERVED_CPU: "0" +sglang_CONFIGURE_LOGGING: 1 +sglang_NO_USAGE_STATS: 1 +sglang_CPU_SGL_KERNEL: "1" +DO_NOT_TRACK: 1 + +# CPU optimization settings (can be updated by automation) +cpu_optimization_enabled: false +max_num_batched_tokens: "2048" +max_num_seqs: "256" + +# Parallelism configuration (root level for global access) +tensor_parallel_size: "2" +pipeline_parallel_size: "1" + +svcmonitor: + enabled: false + +global: + http_proxy: "" + https_proxy: "" + no_proxy: "" + HUGGINGFACEHUB_API_TOKEN: "" + # service account name to be shared with all parent/child charts. + # If set, it will overwrite serviceAccount.name. + # If set, and serviceAccount.create is false, it will assume this service account is already created by others. + sharedSAName: "" + + # Choose where to save your downloaded models + # Set modelUseHostPath for local directory, this is good for one node test. Example: + # modelUseHostPath: /mnt/opea-models + # Set modelUsePVC for PersistentVolumeClaim(PVC), which is suitable for multinode deployment. Example: + # modelUsePVC: model-volume + # You can only set one of the following var, the behavior is not defined is both are set. 
+ # By default, both variables are empty and the model is downloaded and saved to a tmp volume. + modelUseHostPath: "" + modelUsePVC: "" + + # Install Prometheus serviceMonitor for service + monitoring: false + + # Prometheus Helm install release name for serviceMonitor + prometheusRelease: observability +# Fine-tuning integration: when enabled, an init container fetches the fine-tuned +# model tarball from MinIO before sglang starts, and extra args are passed to sglang. +finetune: + enabled: false + fileId: "" + extractPath: "/models" + images: + minioMc: + repository: "alpine" + tag: "latest" + pullPolicy: "IfNotPresent" + minioCredentialsSecret: "minio-internal-creds" diff --git a/core/helm-charts/sglang/xeon-values.yaml b/core/helm-charts/sglang/xeon-values.yaml new file mode 100644 index 00000000..575548c2 --- /dev/null +++ b/core/helm-charts/sglang/xeon-values.yaml @@ -0,0 +1,163 @@ +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Xeon CPU-optimized override values for SGLang deployments +# This file contains lean, CPU-specific overrides for Intel Xeon processors +# Base values are inherited from values.yaml + +# CPU deployment - no accelerator device +accelDevice: "" + +# CPU Balloon configuration for NRI resource policy +cpu_balloon_annotation: "" + + +resources: + requests: + cpu: "{{ .Values.cpu }}" + memory: "{{ .Values.memory }}" + +# CPU-specific configurations +block_size: 128 +max_num_seqs: 256 +max_seq_len_to_capture: 2048 +d_type: "bfloat16" +max_model_len: 8192 + +# SGLang CPU image configuration +image: + repository: lmsysorg/sglang + tag: "v0.5.11-xeon" + pullPolicy: IfNotPresent + +# Node affinity for Xeon inference nodes +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: ei-inference-eligible + operator: In + values: ["true"] + +# SGLang CPU environment variables +SGLANG_CPU_SGL_KERNEL: "1" +SGLANG_NO_USAGE_STATS: 1 
+DO_NOT_TRACK: 1 + +LLM_MODEL_ID: "" + + +modelConfigs: + "meta-llama/Llama-3.1-8B-Instruct": + configMapValues: {} + extraCmdArgs: + [ + "--trust-remote-code", + "--disable-overlap-schedule", + "--device", + "cpu", + "--host", + "0.0.0.0" + ] + tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" + pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" + + "meta-llama/Llama-3.2-3B-Instruct": + configMapValues: {} + extraCmdArgs: + [ + "--trust-remote-code", + "--disable-overlap-schedule", + "--device", + "cpu", + "--host", + "0.0.0.0" + ] + tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" + pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" + + "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": + configMapValues: {} + extraCmdArgs: + [ + "--trust-remote-code", + "--disable-overlap-schedule", + "--device", + "cpu", + "--host", + "0.0.0.0" + ] + tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" + pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" + + "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": + configMapValues: {} + extraCmdArgs: + [ + "--trust-remote-code", + "--disable-overlap-schedule", + "--device", + "cpu", + "--host", + "0.0.0.0" + ] + tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" + pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" + + "Qwen/Qwen3-1.7B": + configMapValues: {} + extraCmdArgs: + [ + "--trust-remote-code", + "--disable-overlap-schedule", + "--device", + "cpu", + "--host", + "0.0.0.0" + ] + tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" + pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" + + "Qwen/Qwen3-4B-Instruct-2507": + configMapValues: {} + extraCmdArgs: + [ + "--trust-remote-code", + "--disable-overlap-schedule", + "--device", + "cpu", + "--host", + "0.0.0.0" + ] + tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" + pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" + + # Qwen/Qwen3-Coder-30B-A3B-Instruct — MoE 
tool-calling model (~80 GB RAM) + "Qwen/Qwen3-Coder-30B-A3B-Instruct": + configMapValues: {} + extraCmdArgs: + [ + "--trust-remote-code", + "--disable-overlap-schedule", + "--device", + "cpu", + "--host", + "0.0.0.0" + ] + tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" + pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" + +defaultModelConfigs: + configMapValues: {} + extraCmdArgs: + [ + "--trust-remote-code", + "--disable-overlap-schedule", + "--device", + "cpu", + "--host", + "0.0.0.0" + ] + tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" + pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" From e46482461abbb8d66505f215aab4f2cf31f099a4 Mon Sep 17 00:00:00 2001 From: vinayK34 Date: Thu, 14 May 2026 13:34:35 +0530 Subject: [PATCH 2/5] chore: clean up Gaudi and VLLM variables for SGLang in Xeon deployment templates Signed-off-by: vinayK34 --- .../sglang/templates/configmap.yaml | 51 +++++++++---------- core/helm-charts/sglang/values.yaml | 12 +---- core/helm-charts/sglang/xeon-values.yaml | 4 -- 3 files changed, 25 insertions(+), 42 deletions(-) diff --git a/core/helm-charts/sglang/templates/configmap.yaml b/core/helm-charts/sglang/templates/configmap.yaml index 72c90521..70cc5f22 100644 --- a/core/helm-charts/sglang/templates/configmap.yaml +++ b/core/helm-charts/sglang/templates/configmap.yaml @@ -1,28 +1,25 @@ -# Copyright (C) 2025-2026 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -{{- $modelName := (default .Values.LLM_MODEL_ID .Values.SERVED_MODEL_NAME) }} -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "sglang.fullname" . }}-config - labels: - {{- include "sglang.labels" . 
| nindent 4 }} -data: - {{- if .Values.global.HF_ENDPOINT }} - HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} - {{- end }} - http_proxy: {{ .Values.global.http_proxy | quote }} - https_proxy: {{ .Values.global.https_proxy | quote }} - no_proxy: {{ .Values.global.no_proxy | quote }} - TORCHINDUCTOR_CACHE_DIR: "/tmp" - NUMBA_CACHE_DIR: "/tmp" - HF_HOME: "/data" - OUTLINES_CACHE_DIR: "/tmp/.cache/outlines" - {{- if .Values.runtime }} - runtime: {{ .Values.runtime | quote}} - {{- end }} - - {{- $modelConfig := (index .Values.modelConfigs $modelName | default dict).configMapValues | default .Values.defaultModelConfigs.configMapValues }} - {{- range $key, $value := $modelConfig }} - {{ $key }}: {{ $value | quote }} +# Copyright (C) 2025-2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +{{- $modelName := (default .Values.LLM_MODEL_ID .Values.SERVED_MODEL_NAME) }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "sglang.fullname" . }}-config + labels: + {{- include "sglang.labels" . 
| nindent 4 }} +data: + {{- if .Values.global.HF_ENDPOINT }} + HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} + {{- end }} + http_proxy: {{ .Values.global.http_proxy | quote }} + https_proxy: {{ .Values.global.https_proxy | quote }} + no_proxy: {{ .Values.global.no_proxy | quote }} + TORCHINDUCTOR_CACHE_DIR: "/tmp" + NUMBA_CACHE_DIR: "/tmp" + HF_HOME: "/data" + OUTLINES_CACHE_DIR: "/tmp/.cache/outlines" + + {{- $modelConfig := (index .Values.modelConfigs $modelName | default dict).configMapValues | default .Values.defaultModelConfigs.configMapValues }} + {{- range $key, $value := $modelConfig }} + {{ $key }}: {{ $value | quote }} {{- end }} \ No newline at end of file diff --git a/core/helm-charts/sglang/values.yaml b/core/helm-charts/sglang/values.yaml index 39ebaaad..00800523 100644 --- a/core/helm-charts/sglang/values.yaml +++ b/core/helm-charts/sglang/values.yaml @@ -160,20 +160,10 @@ LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 SERVED_MODEL_NAME: "" # Environment variables for sglang (set in configmap): # https://docs.sglang.ai/en/latest/getting_started/installation.html#environment-variables -sglang_CPU_KVCACHE_SPACE: "40" -sglang_RPC_TIMEOUT: "100000" -sglang_ALLOW_LONG_MAX_MODEL_LEN: "1" -sglang_ENGINE_ITERATION_TIMEOUT_S: "120" -sglang_CPU_NUM_OF_RESERVED_CPU: "0" -sglang_CONFIGURE_LOGGING: 1 -sglang_NO_USAGE_STATS: 1 -sglang_CPU_SGL_KERNEL: "1" -DO_NOT_TRACK: 1 + # CPU optimization settings (can be updated by automation) cpu_optimization_enabled: false -max_num_batched_tokens: "2048" -max_num_seqs: "256" # Parallelism configuration (root level for global access) tensor_parallel_size: "2" diff --git a/core/helm-charts/sglang/xeon-values.yaml b/core/helm-charts/sglang/xeon-values.yaml index 575548c2..e0bb797b 100644 --- a/core/helm-charts/sglang/xeon-values.yaml +++ b/core/helm-charts/sglang/xeon-values.yaml @@ -41,13 +41,9 @@ affinity: values: ["true"] # vLLM CPU environment variables -SGLANG_CPU_SGL_KERNEL: "1" -SGLANG_NO_USAGE_STATS: 1 -DO_NOT_TRACK: 1 
LLM_MODEL_ID: "" - modelConfigs: "meta-llama/Llama-3.1-8B-Instruct": configMapValues: {} From 2288fc4cae74e9434b9379a0a22f336e61473cba Mon Sep 17 00:00:00 2001 From: vinayK34 Date: Thu, 14 May 2026 14:05:19 +0530 Subject: [PATCH 3/5] Created the documentation for EI Support for SGLang Signed-off-by: vinayK34 --- docs/sglang-model-deploy-guide.md | 207 ++++++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 docs/sglang-model-deploy-guide.md diff --git a/docs/sglang-model-deploy-guide.md b/docs/sglang-model-deploy-guide.md new file mode 100644 index 00000000..71d00c68 --- /dev/null +++ b/docs/sglang-model-deploy-guide.md @@ -0,0 +1,207 @@ +# SGLang on Intel Enterprise Inference - Single Node Deployment Guide + +This guide provides step-by-step instructions for deploying LLM models using the SGLang server on an existing Intel® AI for Enterprise Inference single-node cluster. + +> **IMPORTANT:** Unlike other inference engines in this repository, **SGLang natively supports ONLY Intel® Xeon® Processors (CPU)** in this helm chart implementation. Gaudi (HPU) accelerator support is NOT available for SGLang within this enterprise inference stack. Therefore, SGLang must be deployed using the explicit CPU paths and overrides as shown below. + +## Prerequisites +Before running the deployment, ensure you have completed all general [prerequisites](./prerequisites.md), and standard cluster deployments. SGLang deployment is performed via Helm directly and is not yet mapped into the main Ansible playbooks like vLLM. + +## Setup Steps + +### Step 1: Modify the hosts file +Since we are testing locally, we need to map a testing domain (`api.example.com`) to `localhost` in the `/etc/hosts` file. + +Run the following command to edit the hosts file: +```bash +sudo nano /etc/hosts +``` +Add this line at the end: +```text +127.0.0.1 api.example.com +``` +Save and exit (`CTRL+X`, then `Y` and `Enter`). 
+ +### Step 2: Generate a self-signed SSL certificate +Run the following command to create a self-signed SSL certificate that covers api.example.com and trace-api.example.com (used if deploying ingress routes): +```bash +mkdir -p ~/certs && cd ~/certs && \ +openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -days 365 -nodes \ + -subj "/CN=api.example.com" \ + -addext "subjectAltName = DNS:api.example.com, DNS:trace-api.example.com" +``` +Note: the -addext option requires OpenSSL >= 1.1.1. + +### Step 3: Configure the Automation config file +Move the single node preset inference config file to the working directory: + +```bash +cd ~/Enterprise-Inference +cp -f docs/examples/single-node/inference-config.cfg core/inventory/inference-config.cfg +``` + +### Step 4: Modify `inference-config.cfg` and set deploy_llm_models to off + +Since automatic LLM deployment via the playbook is supported natively for vLLM but not SGLang, turn off automatic model deployment: + +```bash +nano ~/Enterprise-Inference/core/inventory/inference-config.cfg +``` +Change `" deploy_llm_models=on "` -> `" deploy_llm_models=off "` + +Ensure the `cluster_url` field is set to the DNS used, and the paths to the certificate and key files are valid. The deployment options can be left unchanged. + +### Step 5: Update `hosts.yaml` File and run the Setup +Copy the single node preset hosts config file to the working directory: + +```bash +cp -f docs/examples/single-node/hosts.yaml core/inventory/hosts.yaml +``` + +> **Note** The `ansible_user` field is set to *ubuntu* by default. Change it to the actual username. 
+ +Export your Hugging Face API token and run the automation deployer: + +```bash +export HUGGINGFACE_TOKEN="Your_Hugging_Face_Token_ID" + +cd ~/Enterprise-Inference/core +chmod +x inference-stack-deploy.sh + +./inference-stack-deploy.sh --cpu-or-gpu "cpu" --hugging-face-token $HUGGINGFACE_TOKEN +``` + +**Select Option 1 and confirm the Yes/No prompt.** + +This will deploy the setup automatically. Once the cluster setup is complete, you can configure your SGLang endpoints. + +### Step 6: Configure Authentication (OIDC) via Keycloak + +If your cluster has Keycloak deployed for API security, you need to grab the auto-generated Client ID and Secret and configure them in the SGLang chart. + +```bash +cd ~/Enterprise-Inference/core/scripts +source generate-token.sh + +cd ~/Enterprise-Inference/core/helm-charts/sglang/ + +echo $KEYCLOAK_CLIENT_ID # Prints your keycloak client ID +echo $KEYCLOAK_CLIENT_SECRET # Prints your keycloak client secret +``` + +Open the `values.yaml` file to configure OIDC: +```bash +nano ~/Enterprise-Inference/core/helm-charts/sglang/values.yaml +``` + +Update the OIDC block: +```yaml +oidc: + realm: master + client_id: "<your-keycloak-client-id>" + client_secret: "<your-keycloak-client-secret>" + discovery: "http://keycloak.default.svc.cluster.local/realms/master/.well-known/openid-configuration" + introspection_endpoint: "http://keycloak.default.svc.cluster.local/realms/master/protocol/openid-connect/token/introspect" + use_jwks: true +``` + +#### Host Configuration +If you depend on APISIX for routing, make sure you update the host parameter under `apisixRoute`: +```yaml +apisixRoute: + enabled: true + namespace: default + name: "" + host: "api.example.com" # Update this to your configured DNS +``` + +## Optimized Model List +A list of popular LLMs are optimized and run efficiently on CPU, including the most notable open-source models like Llama series, Qwen series, and DeepSeek series like DeepSeek-R1 and DeepSeek-V3.1-Terminus. 
+ +| Model Name | BF16 | W8A8_INT8 | FP8 | +| --- | --- | --- | --- | +| DeepSeek-R1 | | meituan/DeepSeek-R1-Channel-INT8 | deepseek-ai/DeepSeek-R1 | +| DeepSeek-V3.1-Terminus | | IntervitensInc/DeepSeek-V3.1-Terminus-Channel-int8 | deepseek-ai/DeepSeek-V3.1-Terminus | +| Llama-3.2-3B | meta-llama/Llama-3.2-3B-Instruct | RedHatAI/Llama-3.2-3B-quantized.w8a8 | | +| Llama-3.1-8B | meta-llama/Llama-3.1-8B-Instruct | RedHatAI/Meta-Llama-3.1-8B-quantized.w8a8 | | +| QwQ-32B | | RedHatAI/QwQ-32B-quantized.w8a8 | | +| DeepSeek-Distilled-Llama | | RedHatAI/DeepSeek-R1-Distill-Llama-70B-quantized.w8a8 | | +| Qwen3-235B | | | Qwen/Qwen3-235B-A22B-FP8 | + +> **Note:** The model identifiers listed in the table above have been verified on 6th Gen Intel® Xeon® P-core platforms. + +## Deploying LLMs with SGLang (Xeon Only) + +To deploy SGLang, you must pass the `xeon-values.yaml` file so it adopts the CPU-specific resource scaling and parameter logic. + +### Example: Deploying Meta Llama-3.2-3B-Instruct +Deploying Llama-3 leveraging CPU execution: +```bash +cd ~/Enterprise-Inference/core/helm-charts/sglang/ + +helm install sglang-llama3 . \ + -f xeon-values.yaml \ + --set LLM_MODEL_ID="meta-llama/Llama-3.2-3B-Instruct" \ + --set global.HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACE_TOKEN +``` + +### Example: Deploying Neural-Chat +Deploying Intel's Neural Chat: +```bash +helm install sglang-neural-chat . \ + -f xeon-values.yaml \ + --set LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" \ + --set global.HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACE_TOKEN +``` + +**Note:** Since models are pulled remotely (unless attached to a host path / PVC), the deployment may take several minutes to download depending on the model's weight topology. 
+ +Verify the deployment pods are running: +```bash +kubectl get pods -l app.kubernetes.io/instance=sglang-llama3 +``` + +## Accessing the Deployed Models + +First, obtain a Bearer token: +```bash +cd ~/Enterprise-Inference/core/scripts +source generate-token.sh +cd - + +export CLIENTID=$KEYCLOAK_CLIENT_ID +export CLIENT_SECRET=$KEYCLOAK_CLIENT_SECRET +export BASE_URL=https://api.example.com +export TOKEN_URL=${BASE_URL}/token +export TOKEN=$(curl -k -X POST ${TOKEN_URL} -H 'Content-Type: application/x-www-form-urlencoded' -d "grant_type=client_credentials&client_id=${CLIENTID}&client_secret=${CLIENT_SECRET}" | jq -r .access_token) + +echo "Access Token: $TOKEN" +``` + +### Test via External URL (Chat Completions) + +SGLang exposes an OpenAI-compatible API server, so you can call it exactly as you would any other OpenAI-compatible endpoint. + +```bash +# Inferencing with Llama-3.2-3B +curl -k ${BASE_URL}/llama-3/v1/chat/completions \ + -X POST \ + -d '{"messages": [{"role": "system","content": "You are a helpful AI assistant."},{"role": "user","content": "What is AI inference?"}],"model": "llama-3","max_tokens": 64,"temperature": 0.5}' \ + -H 'Content-Type: application/json' \ + -H "Authorization: Bearer $TOKEN" \ + -sS +``` + +## Undeployment + +To completely remove the deployment, run `helm uninstall`: + +```bash +# Uninstall the Helm release +helm uninstall sglang-llama3 +helm uninstall sglang-neural-chat + +# Verify removal +helm list | grep sglang +kubectl get pods | grep sglang +``` From 52e33594ab15e2024d758037520b393fc9234fc8 Mon Sep 17 00:00:00 2001 From: vinayK34 Date: Wed, 27 May 2026 11:56:42 +0530 Subject: [PATCH 4/5] vLLM args removed --- core/helm-charts/sglang/xeon-values.yaml | 88 +++--------------------- 1 file changed, 8 insertions(+), 80 deletions(-) diff --git a/core/helm-charts/sglang/xeon-values.yaml b/core/helm-charts/sglang/xeon-values.yaml index e0bb797b..a79a0305 100644 --- a/core/helm-charts/sglang/xeon-values.yaml +++ 
b/core/helm-charts/sglang/xeon-values.yaml @@ -1,14 +1,15 @@ # Copyright (C) 2025-2026 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -# Xeon CPU-optimized override values for vLLM deployments +# Xeon CPU-optimized override values for SGLang deployments # This file contains lean, CPU-specific overrides for Intel Xeon processors # Base values are inherited from values.yaml # CPU deployment - no accelerator device +# SGLang model deployment is currently supported on Xeon (CPU) only; keep accelDevice empty to disable GPU/accelerator usage accelDevice: "" -# CPU Balloon configuration for NRI resource policy +# CPU balloon configuration (NRI resource policy) is not supported for SGLang; keep the annotation empty to disable ballooning cpu_balloon_annotation: "" @@ -17,13 +18,6 @@ resources: cpu: "{{ .Values.cpu }}" memory: "{{ .Values.memory }}" -# CPU-specific configurations -block_size: 128 -max_num_seqs: 256 -max_seq_len_to_capture: 2048 -d_type: "bfloat16" -max_model_len: 8192 - # SGLang CPU image configuration image: repository: lmsysorg/sglang @@ -40,7 +34,7 @@ affinity: operator: In values: ["true"] -# vLLM CPU environment variables +# SGLang CPU environment variables LLM_MODEL_ID: "" @@ -53,6 +47,7 @@ modelConfigs: "--disable-overlap-schedule", "--device", "cpu", + "--enable-torch-compile", "--host", "0.0.0.0" ] @@ -67,82 +62,14 @@ modelConfigs: "--disable-overlap-schedule", "--device", "cpu", + "--enable-torch-compile", "--host", "0.0.0.0" ] tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" - "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": - configMapValues: {} - extraCmdArgs: - [ - 
"--trust-remote-code", - "--disable-overlap-schedule", - "--device", - "cpu", - "--host", - "0.0.0.0" - ] - tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" - pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" - - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": - configMapValues: {} - extraCmdArgs: - [ - "--trust-remote-code", - "--disable-overlap-schedule", - "--device", - "cpu", - "--host", - "0.0.0.0" - ] - tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" - pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" - - "Qwen/Qwen3-1.7B": - configMapValues: {} - extraCmdArgs: - [ - "--trust-remote-code", - "--disable-overlap-schedule", - "--device", - "cpu", - "--host", - "0.0.0.0" - ] - tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" - pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" - - "Qwen/Qwen3-4B-Instruct-2507": - configMapValues: {} - extraCmdArgs: - [ - "--trust-remote-code", - "--disable-overlap-schedule", - "--device", - "cpu", - "--host", - "0.0.0.0" - ] - tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" - pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" - - # Qwen/Qwen3-Coder-30B-A3B-Instruct — MoE tool-calling model (~80 GB RAM) - "Qwen/Qwen3-Coder-30B-A3B-Instruct": - configMapValues: {} - extraCmdArgs: - [ - "--trust-remote-code", - "--disable-overlap-schedule", - "--device", - "cpu", - "--host", - "0.0.0.0" - ] - tensor_parallel_size: "{{ .Values.tensor_parallel_size }}" - pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}" +# Default model configuration: applied to any SGLang-supported model that has no entry of its own in modelConfigs; add a modelConfigs entry to override it defaultModelConfigs: configMapValues: {} @@ -152,6 +79,7 @@ defaultModelConfigs: "--disable-overlap-schedule", "--device", "cpu", + "--enable-torch-compile", "--host", "0.0.0.0" ] From a2086f538b1f752ae39db1a3d1c613d3ece66dd1 Mon Sep 17 00:00:00 2001 From: vinayK34 Date: Wed, 27 May 2026 12:19:14 +0530 Subject: [PATCH 5/5] Updated the supported model List --- docs/sglang-model-deploy-guide.md | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/docs/sglang-model-deploy-guide.md b/docs/sglang-model-deploy-guide.md index 71d00c68..d338238a 100644 --- 
a/docs/sglang-model-deploy-guide.md +++ b/docs/sglang-model-deploy-guide.md @@ -118,15 +118,20 @@ apisixRoute: ## Optimized Model List A list of popular LLMs are optimized and run efficiently on CPU, including the most notable open-source models like Llama series, Qwen series, and DeepSeek series like DeepSeek-R1 and DeepSeek-V3.1-Terminus. -| Model Name | BF16 | W8A8_INT8 | FP8 | -| --- | --- | --- | --- | -| DeepSeek-R1 | | meituan/DeepSeek-R1-Channel-INT8 | deepseek-ai/DeepSeek-R1 | -| DeepSeek-V3.1-Terminus | | IntervitensInc/DeepSeek-V3.1-Terminus-Channel-int8 | deepseek-ai/DeepSeek-V3.1-Terminus | -| Llama-3.2-3B | meta-llama/Llama-3.2-3B-Instruct | RedHatAI/Llama-3.2-3B-quantized.w8a8 | | -| Llama-3.1-8B | meta-llama/Llama-3.1-8B-Instruct | RedHatAI/Meta-Llama-3.1-8B-quantized.w8a8 | | -| QwQ-32B | | RedHatAI/QwQ-32B-quantized.w8a8 | | -| DeepSeek-Distilled-Llama | | RedHatAI/DeepSeek-R1-Distill-Llama-70B-quantized.w8a8 | | -| Qwen3-235B | | | Qwen/Qwen3-235B-A22B-FP8 | +| Model Name | BF16 | W8A8_INT8 | FP8 | AWQ_INT4 | +| --- | --- | --- | --- | --- | +| Llama-3.1-8B | meta-llama/Llama-3.1-8B-Instruct | RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8 | hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4 | +| Llama-3.2-11B-Vision | meta-llama/Llama-3.2-11B-Vision-Instruct | | | | +| Llama-3.2-3B | meta-llama/Llama-3.2-3B-Instruct | RedHatAI/Llama-3.2-3B-quantized.w8a8 | RedHatAI/Llama-3.2-3B-Instruct-FP8 | AMead10/Llama-3.2-3B-Instruct-AWQ | +| Llama-3.3-70B | meta-llama/Llama-3.3-70B-Instruct | CalamitousFelicitousness/Llama-3.3-70B-Instruct-W8A8-INT8 | clowman/Llama-3.3-70B-Instruct-FP8-W128 | lambda/Llama-3.3-70B-Instruct-AWQ-4bit | +| Llama-4-Scout-17B | meta-llama/Llama-4-Scout-17B-16E-Instruct | Quantized with Intel AutoRound | | | +| DeepSeek-R1-0528 | | Conexis/DeepSeek-R1-0528-Channel-INT8 | deepseek-ai/DeepSeek-R1-0528 | QuixiAI/DeepSeek-R1-0528-AWQ | +| Qwen3-235B-A22B-Instruct-2507 | 
Qwen/Qwen3-235B-A22B-Instruct-2507 | | Qwen/Qwen3-235B-A22B-Instruct-2507-FP8 | QuantTrio/Qwen3-235B-A22B-Instruct-2507-AWQ | +| Qwen3-Omni-30B-A3B-Thinking | Qwen/Qwen3-Omni-30B-A3B-Thinking | | | | +| Qwen3.5-397B-A17B | Qwen/Qwen3.5-397B-A17B | | Qwen/Qwen3.5-397B-A17B-FP8 | | +| Qwen3.5-35B-A3B | Qwen/Qwen3.5-35B-A3B | | Qwen/Qwen3.5-35B-A3B-FP8 | | +| Qwen3.5-2B | Qwen/Qwen3.5-2B | | | | +| gemma-3-12b-it | google/gemma-3-12b-it | | RedHatAI/gemma-3-12b-it-FP8-dynamic | pytorch/gemma-3-12b-it-AWQ-INT4 | > **Note:** The model identifiers listed in the table above have been verified on 6th Gen Intel® Xeon® P-core platforms.