diff --git a/helm/applications/skaha/README.md b/helm/applications/skaha/README.md index b507d9f3..6801a7db 100644 --- a/helm/applications/skaha/README.md +++ b/helm/applications/skaha/README.md @@ -61,6 +61,20 @@ A Helm chart to install the Skaha web service of the CANFAR Science Platform | ingress.enabled | bool | `true` | Enable ingress routing for the Skaha API. | | ingress.path | string | `"/skaha"` | Ingress path prefix routed to the Skaha API Service. | | kubernetesClusterDomain | string | `"cluster.local"` | Kubernetes DNS domain used when building internal service hostnames. | +| metricsBackend.enabled | bool | `false` | When true, install Kueue-read ClusterRole/Binding first (Helm kind order), then Metrics Service and Deployment. Applies fail if cluster RBAC cannot be created (for example forbidden). | +| metricsBackend.env | object | `{}` | Map of environment variables for the Metrics container (typically METRICS_*). GitOps should supply the full map per environment. | +| metricsBackend.image.pullPolicy | string | `"IfNotPresent"` | imagePullPolicy for the Metrics API container. | +| metricsBackend.image.repository | string | `"images.opencadc.org/platform/metrics"` | Metrics container image repository. | +| metricsBackend.image.tag | string | `"v0.1.4"` | Metrics container image tag. | +| metricsBackend.ingress.enabled | bool | `false` | When true and top-level ingress.enabled is true, add a path on the same host routing to the Metrics Service. | +| metricsBackend.ingress.path | string | `"/metrics"` | Ingress path prefix for the Metrics API (Traefik). | +| metricsBackend.redis.enabled | bool | `true` | When true, set METRICS_REDIS_URL to this release's Bitnami Redis master Service (-redis-master), same instance Skaha uses. Set false and supply METRICS_REDIS_URL in env if Metrics should use another Redis. | +| metricsBackend.replicaCount | int | `1` | Fixed replica count for the Metrics API (no HPA in this chart version). 
| +| metricsBackend.resources | object | `{"limits":{"cpu":"1","memory":"1Gi"},"requests":{"cpu":"100m","memory":"256Mi"}}` | Resource requests and limits for the Metrics API container. | +| metricsBackend.revisionHistoryLimit | int | `3` | revisionHistoryLimit for the Metrics API Deployment. | +| metricsBackend.test.enabled | bool | `true` | Run helm test hook that retries /healthz until success (requires metricsBackend.enabled). | +| metricsBackend.test.image | string | `"busybox:1.37.0"` | Image for the helm test hook Pod. | +| metricsBackend.test.maxWaitSeconds | int | `180` | Maximum seconds to wait for Metrics /healthz (should exceed startupProbe worst case plus scheduling margin). | | podSecurityContext | object | `{}` | Optional container-level security context for the Skaha API container. | | redis.architecture | string | `"standalone"` | Redis deployment architecture. | | redis.auth.enabled | bool | `false` | Enable Redis authentication. | @@ -80,3 +94,7 @@ A Helm chart to install the Skaha web service of the CANFAR Science Platform | service.port | int | `8080` | Service port exposed for the Skaha API Service. | | skahaWorkload.namespace | string | `"skaha-workload"` | Workload namespace used for user session Jobs and related resources. | | tolerations | list | `[]` | Tolerations applied to the Skaha API Pod. | + +## metricsBackend install ordering + +When `metricsBackend.enabled` is true, the chart emits `ClusterRole`, `ClusterRoleBinding`, `Service`, and `Deployment` for metrics. Helm applies manifest groups in a deterministic [kind order](https://github.com/helm/helm/blob/v3.17.0/pkg/releaseutil/kind_sorter.go) so RBAC objects are reconciled before typical namespaced workload kinds. If the API server rejects creating or updating those cluster-scoped RBAC rules (for example the caller lacks permission), the release fails instead of only rolling out a broken metrics `Deployment`. 
`helm test` (optional) still targets the running Service after install; it does not replace RBAC admission checks. diff --git a/helm/applications/skaha/README.md.gotmpl b/helm/applications/skaha/README.md.gotmpl index 673719ab..2c0ad09f 100644 --- a/helm/applications/skaha/README.md.gotmpl +++ b/helm/applications/skaha/README.md.gotmpl @@ -11,3 +11,7 @@ {{ template "chart.requirementsSection" . }} {{ template "chart.valuesSection" . }} + +## metricsBackend install ordering + +When `metricsBackend.enabled` is true, the chart emits `ClusterRole`, `ClusterRoleBinding`, `Service`, and `Deployment` for metrics. Helm applies manifest groups in a deterministic [kind order](https://github.com/helm/helm/blob/v3.17.0/pkg/releaseutil/kind_sorter.go) so RBAC objects are reconciled before typical namespaced workload kinds. If the API server rejects creating or updating those cluster-scoped RBAC rules (for example the caller lacks permission), the release fails instead of only rolling out a broken metrics `Deployment`. `helm test` (optional) still targets the running Service after install; it does not replace RBAC admission checks. 
diff --git a/helm/applications/skaha/templates/_helpers.tpl b/helm/applications/skaha/templates/_helpers.tpl index 002786d3..f95bb915 100644 --- a/helm/applications/skaha/templates/_helpers.tpl +++ b/helm/applications/skaha/templates/_helpers.tpl @@ -103,6 +103,44 @@ Create the name of the service account to use {{- end }} {{- end }} +{{- define "skaha.metricsBackend.deploymentName" -}} +{{- printf "%s-skaha-metrics-api" .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- end }} + +{{- define "skaha.metricsBackend.serviceName" -}} +{{- printf "%s-skaha-metrics-api-svc" .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- end }} + +{{- define "skaha.metricsBackend.chartRedisURL" -}} +{{- printf "redis://%s-redis-master.%s.svc.%s:6379/0" .Release.Name .Release.Namespace .Values.kubernetesClusterDomain -}} +{{- end }} + +{{- define "skaha.metricsBackend.internalURL" -}} +{{- printf "http://%s.%s.svc.%s:8000" (include "skaha.metricsBackend.serviceName" .) .Release.Namespace .Values.kubernetesClusterDomain -}} +{{- end }} + +{{- define "skaha.metricsBackend.selectorLabels" -}} +app.kubernetes.io/name: skaha-metrics-api +app.kubernetes.io/instance: {{ .Release.Name }} +app.kubernetes.io/component: metrics-api +{{- end }} + +{{- define "skaha.metricsBackend.labels" -}} +helm.sh/chart: {{ include "skaha.chart" . }} +{{ include "skaha.metricsBackend.selectorLabels" . }} +{{- $mb := .Values.metricsBackend | default dict -}} +{{- with $mb.image }} +{{- with .tag }} +app.kubernetes.io/version: {{ . | quote }} +{{- end }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{- define "skaha.metricsBackend.clusterRoleName" -}} +{{- printf "skaha-metrics-%s-%s-kueue-read" .Release.Namespace .Release.Name | replace "." 
"-" | trunc 63 | trimSuffix "-" -}} +{{- end }} + {{/* USER SESSION TEMPLATE DEFINITIONS diff --git a/helm/applications/skaha/templates/metricsBackend-deployment.yaml b/helm/applications/skaha/templates/metricsBackend-deployment.yaml new file mode 100644 index 00000000..4f46e795 --- /dev/null +++ b/helm/applications/skaha/templates/metricsBackend-deployment.yaml @@ -0,0 +1,77 @@ +{{- $mb := .Values.metricsBackend | default dict }} +{{- if (default false $mb.enabled) }} +{{- $img := $mb.image | default dict }} +{{- $customEnv := $mb.env | default dict }} +{{- $redis := $mb.redis | default dict }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "skaha.metricsBackend.deploymentName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "skaha.metricsBackend.labels" . | nindent 4 }} +spec: + replicas: {{ ternary 1 $mb.replicaCount (kindIs "invalid" $mb.replicaCount) }}{{/* only an unset value falls back to 1; an explicit 0 is kept */}} + revisionHistoryLimit: {{ ternary 3 $mb.revisionHistoryLimit (kindIs "invalid" $mb.revisionHistoryLimit) }}{{/* only an unset value falls back to 3; an explicit 0 is kept */}} + selector: + matchLabels: + {{- include "skaha.metricsBackend.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "skaha.metricsBackend.selectorLabels" . | nindent 8 }} + spec: + serviceAccountName: {{ .Values.deployment.skaha.serviceAccountName }} + automountServiceAccountToken: true + securityContext: + runAsNonRoot: true + runAsUser: 65532 + fsGroup: 65532 + containers: + - name: metrics-api + image: "{{ $img.repository }}:{{ $img.tag | default "v0.1.4" }}" + imagePullPolicy: {{ $img.pullPolicy | default "IfNotPresent" }} + ports: + - name: http + containerPort: 8000 + protocol: TCP + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + env: + {{- if and (ne (toString $redis.enabled) "false") (not (hasKey $customEnv "METRICS_REDIS_URL")) }}{{/* toString comparison keeps the default of true while honoring an explicit false, which the Sprig default function would treat as empty */}} + - name: METRICS_REDIS_URL + value: {{ include "skaha.metricsBackend.chartRedisURL" . 
| quote }} + {{- end }} + {{- range $k, $v := $customEnv }} + - name: {{ $k }} + value: {{ $v | quote }} + {{- end }} + startupProbe: + httpGet: + path: /healthz + port: http + failureThreshold: 40 + periodSeconds: 3 + timeoutSeconds: 3 + livenessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 0 + periodSeconds: 20 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 0 + periodSeconds: 10 + timeoutSeconds: 3 + {{- with $mb.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} +{{- end }} diff --git a/helm/applications/skaha/templates/metricsBackend-rbac.yaml b/helm/applications/skaha/templates/metricsBackend-rbac.yaml new file mode 100644 index 00000000..35cf75c0 --- /dev/null +++ b/helm/applications/skaha/templates/metricsBackend-rbac.yaml @@ -0,0 +1,34 @@ +{{- $mb := .Values.metricsBackend | default dict }} +{{- if (default false $mb.enabled) }} +{{/* + Kueue read ClusterRole / ClusterRoleBinding. With metricsBackend.enabled Helm renders these + before the metrics Deployment and Service (Helm builtin kind install order applies RBAC before + typical workload kinds). Apply failures (for example forbidden cluster RBAC) fail the release + before those workload resources reconcile. +*/}} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "skaha.metricsBackend.clusterRoleName" . }} + labels: + {{- include "skaha.metricsBackend.labels" . | nindent 4 }} +rules: + - apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues", "cohorts"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "skaha.metricsBackend.clusterRoleName" . }} + labels: + {{- include "skaha.metricsBackend.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "skaha.metricsBackend.clusterRoleName" . 
}} +subjects: + - kind: ServiceAccount + name: {{ .Values.deployment.skaha.serviceAccountName }} + namespace: {{ .Release.Namespace }} +{{- end }} diff --git a/helm/applications/skaha/templates/metricsBackend-service.yaml b/helm/applications/skaha/templates/metricsBackend-service.yaml new file mode 100644 index 00000000..0e11f440 --- /dev/null +++ b/helm/applications/skaha/templates/metricsBackend-service.yaml @@ -0,0 +1,19 @@ +{{- $mb := .Values.metricsBackend | default dict }} +{{- if (default false $mb.enabled) }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "skaha.metricsBackend.serviceName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "skaha.metricsBackend.labels" . | nindent 4 }} +spec: + type: ClusterIP + selector: + {{- include "skaha.metricsBackend.selectorLabels" . | nindent 4 }} + ports: + - name: http + port: 8000 + targetPort: http + protocol: TCP +{{- end }} diff --git a/helm/applications/skaha/templates/skaha-ingress.yaml b/helm/applications/skaha/templates/skaha-ingress.yaml index 91cd2bcf..21c1641f 100644 --- a/helm/applications/skaha/templates/skaha-ingress.yaml +++ b/helm/applications/skaha/templates/skaha-ingress.yaml @@ -1,4 +1,5 @@ {{- if .Values.ingress.enabled }} +{{- $mb := .Values.metricsBackend | default dict }} apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -18,4 +19,14 @@ spec: name: {{ .Release.Name }}-skaha-tomcat-svc port: number: {{ .Values.service.port | default "8080" }} + {{- $ming := $mb.ingress | default dict }} + {{- if and (default false $mb.enabled) $ming.enabled }} + - path: {{ $ming.path | default "/metrics" }} + pathType: Prefix + backend: + service: + name: {{ include "skaha.metricsBackend.serviceName" . 
}} + port: + name: http + {{- end }} {{- end }} diff --git a/helm/applications/skaha/templates/skaha-tomcat-deployment.yaml b/helm/applications/skaha/templates/skaha-tomcat-deployment.yaml index 17b932dd..cf9eac56 100644 --- a/helm/applications/skaha/templates/skaha-tomcat-deployment.yaml +++ b/helm/applications/skaha/templates/skaha-tomcat-deployment.yaml @@ -139,6 +139,11 @@ spec: value: "{{ .Release.Name }}-redis-master.{{ .Release.Namespace }}.svc.{{ .Values.kubernetesClusterDomain }}" - name: REDIS_PORT value: "6379" + {{- $mb := .Values.metricsBackend | default dict }} + {{- if (default false $mb.enabled) }} + - name: SKAHA_METRICS_BACKEND_URL + value: {{ include "skaha.metricsBackend.internalURL" . | quote }} + {{- end }} - name: SKAHA_EXPERIMENTAL_FEATURE_GATES value: "{{ include "skaha.experimentalFeatureGates" $ }}" {{- with .Values.deployment.skaha.extraEnv }} diff --git a/helm/applications/skaha/templates/tests/test-metricsBackend.yaml b/helm/applications/skaha/templates/tests/test-metricsBackend.yaml new file mode 100644 index 00000000..a1b146e6 --- /dev/null +++ b/helm/applications/skaha/templates/tests/test-metricsBackend.yaml @@ -0,0 +1,36 @@ +{{- $mb := .Values.metricsBackend | default dict }} +{{- $test := $mb.test | default dict }} +{{- if and (default false $mb.enabled) (ne (toString $test.enabled) "false") }}{{/* toString comparison keeps the default of true while honoring an explicit false, which the Sprig default function would treat as empty */}} +apiVersion: v1 +kind: Pod +metadata: + name: {{ include "skaha.fullname" . }}-metrics-backend-test + labels: + {{- include "skaha.labels" . | nindent 4 }} + annotations: + helm.sh/hook: test + helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded +spec: + containers: + - name: wget + image: {{ $test.image | default "busybox:1.37.0" }} + command: + - /bin/sh + - -c + - | + set -e + url="http://{{ include "skaha.metricsBackend.serviceName" . 
}}.{{ .Release.Namespace }}.svc.{{ .Values.kubernetesClusterDomain }}:8000/healthz" + max={{ $test.maxWaitSeconds | default 180 }} + i=0 + while [ "$i" -lt "$max" ]; do + if wget -q -T 3 -O /dev/null "$url" 2>/dev/null; then + echo "metricsBackend health check ok" + exit 0 + fi + i=$((i+1)) + sleep 1 + done + echo "metricsBackend health check failed after ${max}s" >&2 + exit 1 + restartPolicy: Never +{{- end }} diff --git a/helm/applications/skaha/values.yaml b/helm/applications/skaha/values.yaml index e9f63a7f..f925df3f 100644 --- a/helm/applications/skaha/values.yaml +++ b/helm/applications/skaha/values.yaml @@ -495,6 +495,47 @@ ingress: # -- Ingress path prefix routed to the Skaha API Service. path: /skaha +# Optional science-platform Metrics API in the same release as Skaha (see README). +metricsBackend: + # -- When true, install Kueue-read ClusterRole/Binding first (Helm kind order), then Metrics Service and Deployment. Applies fail if cluster RBAC cannot be created (for example forbidden). + enabled: false + # -- Fixed replica count for the Metrics API (no HPA in this chart version). + replicaCount: 1 + image: + # -- Metrics container image repository. + repository: images.opencadc.org/platform/metrics + # -- Metrics container image tag. + tag: v0.1.4 + # -- imagePullPolicy for the Metrics API container. + pullPolicy: IfNotPresent + # -- Resource requests and limits for the Metrics API container. + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: "1" + memory: 1Gi + # -- Map of environment variables for the Metrics container (typically METRICS_*). GitOps should supply the full map per environment. + env: {} + redis: + # -- When true, set METRICS_REDIS_URL to this release's Bitnami Redis master Service (-redis-master), same instance Skaha uses. Set false and supply METRICS_REDIS_URL in env if Metrics should use another Redis. + enabled: true + # -- revisionHistoryLimit for the Metrics API Deployment. 
+ revisionHistoryLimit: 3 + ingress: + # -- When true and top-level ingress.enabled is true, add a path on the same host routing to the Metrics Service. + enabled: false + # -- Ingress path prefix for the Metrics API (Traefik). + path: /metrics + test: + # -- Run helm test hook that retries /healthz until success (requires metricsBackend.enabled). + enabled: true + # -- Image for the helm test hook Pod. + image: busybox:1.37.0 + # -- Maximum seconds to wait for Metrics /healthz (should exceed startupProbe worst case plus scheduling margin). + maxWaitSeconds: 180 + experimentalFeatures: # Experimental features that can be enabled. These represent features that are not released and confined behind feature flags. # -- Enable processing of experimental feature gates.