global:
  extraValueFiles: []
  pattern: common
  secretLoader:
    disabled: false
  secretStore:
    backend: "vault"
  targetRevision: main
  options:
    useCSV: True
    # -- This defines the global syncPolicy. If set to "Manual", no syncPolicy object will be set;
    # if set to "Automatic", syncPolicy will be set to
    # {automated: {}, retry: {limit: global.options.applicationRetryLimit}};
    # if set to an object, that object is passed directly to the application's syncPolicy field.
    # Each application can override this.
    syncPolicy: Automatic
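    # For example, to pass an object through directly (a sketch only; the keys follow
    # the Argo CD Application syncPolicy schema and the values shown are illustrative):
    # syncPolicy:
    #   automated:
    #     prune: true
    #     selfHeal: true
    #   retry:
    #     limit: 20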
    installPlanApproval: Automatic
    applicationRetryLimit: 20
# Note that sometimes changing helm values might require a hard refresh (https://github.com/helm/helm/issues/3486)
clusterGroup:
  name: example
  # DEPRECATED: isHubCluster is deprecated. Use global.localClusterDomain and global.hubClusterDomain instead.
  # When both domain values are set, hub cluster detection will be based on whether they are equal.
  # This field is kept for backwards compatibility when the domain values are not set.
  # isHubCluster: true
  targetCluster: in-cluster
  sharedValueFiles: []
  # scheduler:
  #   mastersSchedulable: true
  #   defaultNodeSelector: type=user-node,region=east
  #   profile: HighNodeUtilization
  argoCD:
    initContainers: []
    env: []
    volumes: []
    volumeMounts: []
    configManagementPlugins: []
    # resource tracking can be set to annotation, label, or annotation+label
    resourceTrackingMethod: annotation
    resourceHealthChecks:
      # Adding health checks to Argo CD to prevent PVC resources
      # that aren't in a Bound state from blocking deployments.
      # https://www.github.com/argoproj/argo-cd/issues/12840 seems to be related
      - kind: PersistentVolumeClaim
        check: |
          hs = {}
          if obj.status ~= nil then
            if obj.status.phase ~= nil then
              -- Both Pending and Bound are treated as healthy so that unbound PVCs
              -- do not keep the owning application out of the Healthy state
              if obj.status.phase == "Pending" then
                hs.status = "Healthy"
                hs.message = obj.status.phase
                return hs
              elseif obj.status.phase == "Bound" then
                hs.status = "Healthy"
                hs.message = obj.status.phase
                return hs
              end
            end
          end
          hs.status = "Progressing"
          hs.message = "Waiting for PVC"
          return hs
      # Drop once upstream Argo CD handles the Stopped condition correctly.
      # As of 2025-10-01 there is no PR or issue in Argo CD for this yet.
      - kind: InferenceService
        group: serving.kserve.io
        check: |
          local health_status = {}
          health_status.status = "Progressing"
          health_status.message = "Waiting for InferenceService to report status..."
          if obj.status ~= nil then
            local progressing = false
            local degraded = false
            local status_false = 0
            local status_unknown = 0
            local msg = ""
            if obj.status.modelStatus ~= nil then
              if obj.status.modelStatus.transitionStatus ~= "UpToDate" then
                -- InProgress means the model is still rolling out; any other
                -- non-UpToDate transitionStatus is treated as degraded
                if obj.status.modelStatus.transitionStatus == "InProgress" then
                  progressing = true
                else
                  degraded = true
                end
                msg = msg .. "0: transitionStatus | " .. obj.status.modelStatus.transitionStatus
              end
            end
            if obj.status.conditions ~= nil then
              for i, condition in pairs(obj.status.conditions) do
                -- A condition is healthy if its status is True.
                -- However, for the 'Stopped' condition, a 'False' status is the healthy state.
                local is_healthy_condition = (condition.status == "True")
                if condition.type == "Stopped" then
                  is_healthy_condition = (condition.status == "False")
                end
                if not is_healthy_condition then
                  -- This condition represents a problem, so update counters and the message.
                  if condition.status == "Unknown" then
                    status_unknown = status_unknown + 1
                  else
                    status_false = status_false + 1
                  end
                  msg = msg .. " | " .. i .. ": " .. condition.type .. " | " .. condition.status
                  if condition.reason ~= nil and condition.reason ~= "" then
                    msg = msg .. " | " .. condition.reason
                  end
                  if condition.message ~= nil and condition.message ~= "" then
                    msg = msg .. " | " .. condition.message
                  end
                end
              end
              -- Degraded is only reported when transitionStatus is neither UpToDate nor
              -- InProgress; unhealthy conditions alone keep the service in Progressing
              if progressing == false and degraded == false and status_unknown == 0 and status_false == 0 then
                health_status.status = "Healthy"
                msg = "InferenceService is healthy."
              elseif degraded == false and status_unknown >= 0 then
                health_status.status = "Progressing"
              else
                health_status.status = "Degraded"
              end
              health_status.message = msg
            end
          end
          return health_status
    resourceExclusions: |
      - apiGroups:
          - tekton.dev
        kinds:
          - TaskRun
          - PipelineRun
  imperative:
    jobs: []
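    # Each entry runs an Ansible playbook from the imperative container on the schedule below.
    # A minimal sketch (the name and playbook path are illustrative, not defaults of this chart):
    # jobs:
    #   - name: hello-world
    #     playbook: ansible/playbooks/hello-world.yaml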
    image: quay.io/validatedpatterns/imperative-container:v1
    namespace: "imperative"
    # configmap name in the namespace that will contain all helm values
    valuesConfigMap: "helm-values-configmap"
    cronJobName: "imperative-cronjob"
    jobName: "imperative-job"
    imagePullPolicy: Always
    # This is the maximum timeout of all the jobs (1h)
    activeDeadlineSeconds: 3600
    # By default we run this every 10 minutes
    schedule: "*/10 * * * *"
    # Schedule used to trigger the vault unsealing (if explicitly enabled)
    # Set to run every 5 minutes in order for load-secrets to succeed within
    # a reasonable amount of time (it waits up to 15 mins)
    insecureUnsealVaultInsideClusterSchedule: "*/5 * * * *"
    # Increase ansible verbosity with '-v' or '-vv..'
    verbosity: ""
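    # For example, to get more detailed output from the playbooks:
    # verbosity: "-vv"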
    serviceAccountCreate: true
    # service account to be used to run the cron pods
    serviceAccountName: imperative-sa
    clusterRoleName: imperative-cluster-role
    clusterRoleYaml: ""
    roleName: imperative-role
    roleYaml: ""
    adminServiceAccountCreate: true
    adminServiceAccountName: imperative-admin-sa
    adminClusterRoleName: imperative-admin-cluster-role
  managedClusterGroups: {}
  namespaces: []
  # - name: factory
  #   # repoURL: https://github.com/dagger-refuse-cool/manuela-factory.git
  #   # Location of values-global.yaml, values-{name}.yaml, values-{app}.yaml
  #   targetRevision: main
  #   path: applications/factory
  #   helmOverrides:
  #     - name: clusterGroup.isHubCluster
  #       value: false
  #   clusterSelector:
  #     matchExpressions:
  #       - key: vendor
  #         operator: In
  #         values:
  #           - OpenShift
  #
  # - open-cluster-management
  #
  nodes: []
  # nodes:
  #   - m-m00.mycluster.domain.tld:
  #       labels:
  #         cluster.ocs.openshift.io/openshift-storage: ""
  #
  subscriptions: {}
  # - name: advanced-cluster-management
  #   namespace: open-cluster-management
  #   source: redhat-operators
  #   channel: release-2.3
  #   csv: v2.3.2
  #
  # For OLMv1 subscriptions (the chart auto-selects OLMv1 per subscription based on the presence of OLMv1 keys):
  # quay-operator:
  #   name: quay-operator
  #   namespace: redhat-quay
  #   channels: [ "stable-3.12" ]
  #   serviceAccountName: quay-sa
  #   version: '3.12.*'
  #   upgradeConstraintPolicy: SelfCertified
  # projects is deprecated, please use argoProjects
  # argoProjects: []
  # - datacenter
  #
  applications: {}
  # - name: acm
  #   namespace: default
  #   argoProject: datacenter
  #   path: applications/acm
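  #
  # The global syncPolicy can also be overridden per application; a sketch, assuming the
  # application entry accepts the same object form as global.options.syncPolicy:
  # - name: acm
  #   namespace: default
  #   argoProject: datacenter
  #   path: applications/acm
  #   syncPolicy:
  #     automated: {}
  #     retry:
  #       limit: 10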
  extraObjects: {}
  # wait-for-virt-storageclass:
  #   apiVersion: batch/v1
  #   kind: Job
  #   metadata:
  #     name: wait-for-virt-storageclass
  #     annotations:
  #       argocd.argoproj.io/hook: Sync
  #       argocd.argoproj.io/sync-wave: "5"
  #   spec:
  #     parallelism: 1
  #     completions: 1
  #     template:
  #       spec:
  #         restartPolicy: OnFailure
  #         containers:
  #           - name: wait-for-storage-class
  #             image: quay.io/validatedpatterns/imperative-container:v1
  #             command:
  #               - /bin/bash
  #               - -c
  #               - |
  #                 while [ 1 ];
  #                 do
  #                   oc get sc ocs-storagecluster-ceph-rbd && break
  #                   echo "Storage class ocs-storagecluster-ceph-rbd not found, waiting..."
  #                   sleep 5
  #                 done
  #                 echo "Storage class ocs-storagecluster-ceph-rbd found, exiting"
  #                 exit 0

secretStore:
  name: vault-backend
  kind: ClusterSecretStore

# Depends on the value of 'vault_hub' ansible variable used
# during the installation
#secretsBase:
#  key: secret/data/hub