From 0fc23703cef96e297fb0616ea714514de4b633b0 Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Mon, 29 Sep 2025 20:46:37 -0700 Subject: [PATCH 01/17] X-Smart-Branch-Parent: main From 4651220bd3dd510d90dc9ff0949db60a05df42be Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Sat, 28 Jun 2025 11:55:49 -0700 Subject: [PATCH 02/17] Set kube-burner config ref for the secured cluster --- .github/workflows/create-demo-clusters.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/create-demo-clusters.yml b/.github/workflows/create-demo-clusters.yml index c223b4d..5fad52e 100644 --- a/.github/workflows/create-demo-clusters.yml +++ b/.github/workflows/create-demo-clusters.yml @@ -433,7 +433,7 @@ jobs: with: repository: stackrox/${{ inputs.kube-burner-config-repo }} path: .kube-burner-config - ref: ${{ inputs.kube-burner-config-ref }} + ref: jv-ROX-28976-optimize-berserker-load-in-long-running-cluster # TODO(ROX-29223): Remove once old versions don't use the benchmark-operator - name: Check out cloud-bulldozer/benchmark-operator code run: | From 79ba82dd02b5a1d51e8fd9eac2ce954f0b234d02 Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Sat, 28 Jun 2025 14:10:00 -0700 Subject: [PATCH 03/17] Cluster runs for 6h --- .github/workflows/create-demo-clusters.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/create-demo-clusters.yml b/.github/workflows/create-demo-clusters.yml index 5fad52e..88e770e 100644 --- a/.github/workflows/create-demo-clusters.yml +++ b/.github/workflows/create-demo-clusters.yml @@ -46,7 +46,7 @@ env: GH_TOKEN: ${{ github.token }} GH_NO_UPDATE_NOTIFIER: 1 TIMEOUT_WAIT_FOR_IMAGES_SECONDS: 3600 - LONG_RUNNING_CLUSTER_LIFESPAN: "168h" + LONG_RUNNING_CLUSTER_LIFESPAN: "6h" METRICS_COLLECTION_TIME: "30m" jobs: From 18c5022c65a628b065d3e142fd7692707a46f9d1 Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Thu, 21 Aug 2025 19:03:05 -0700 Subject: [PATCH 04/17] Set cluster lifespan back to 
168h --- .github/workflows/create-demo-clusters.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/create-demo-clusters.yml b/.github/workflows/create-demo-clusters.yml index 88e770e..5fad52e 100644 --- a/.github/workflows/create-demo-clusters.yml +++ b/.github/workflows/create-demo-clusters.yml @@ -46,7 +46,7 @@ env: GH_TOKEN: ${{ github.token }} GH_NO_UPDATE_NOTIFIER: 1 TIMEOUT_WAIT_FOR_IMAGES_SECONDS: 3600 - LONG_RUNNING_CLUSTER_LIFESPAN: "6h" + LONG_RUNNING_CLUSTER_LIFESPAN: "168h" METRICS_COLLECTION_TIME: "30m" jobs: From 0e704c5e458cb3584711270c4d0bd4b8fe2d30e1 Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Wed, 27 Aug 2025 16:58:46 -0700 Subject: [PATCH 05/17] Increase memory for prometheus using a patch --- .../patch-monitoring.json | 23 +++++++++++++++++++ .../start-secured-cluster.sh | 2 ++ 2 files changed, 25 insertions(+) create mode 100644 release/start-secured-cluster/patch-monitoring.json diff --git a/release/start-secured-cluster/patch-monitoring.json b/release/start-secured-cluster/patch-monitoring.json new file mode 100644 index 0000000..c9d45cf --- /dev/null +++ b/release/start-secured-cluster/patch-monitoring.json @@ -0,0 +1,23 @@ +{ + "spec": { + "template": { + "spec": { + "containers": [ + { + "name": "prometheus", + "resources": { + "requests": { + "memory": "2Gi", + "cpu": "1" + }, + "limits": { + "memory": "4Gi", + "cpu": "1" + } + } + } + ] + } + } + } +} diff --git a/release/start-secured-cluster/start-secured-cluster.sh b/release/start-secured-cluster/start-secured-cluster.sh index 96a3f31..1acac19 100755 --- a/release/start-secured-cluster/start-secured-cluster.sh +++ b/release/start-secured-cluster/start-secured-cluster.sh @@ -23,3 +23,5 @@ envsubst < "${COMMON_DIR}/../charts/monitoring/values.yaml" > "${COMMON_DIR}/../ helm upgrade -n stackrox --install --create-namespace stackrox-monitoring "${COMMON_DIR}/../charts/monitoring" --values 
"${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" "${helm_args[@]}" rm "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" echo "Deployed Monitoring..." + +kubectl -n stackrox patch deploy/sensor --patch-file="${SCRIPT_DIR}/patch-monitoring.json" From 169be03931fa024dd1c7ed2398c6fbcfea706b1d Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Wed, 27 Aug 2025 20:06:24 -0700 Subject: [PATCH 06/17] Using correct deployment --- release/start-secured-cluster/start-secured-cluster.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/start-secured-cluster/start-secured-cluster.sh b/release/start-secured-cluster/start-secured-cluster.sh index 1acac19..3b8f8b2 100755 --- a/release/start-secured-cluster/start-secured-cluster.sh +++ b/release/start-secured-cluster/start-secured-cluster.sh @@ -24,4 +24,4 @@ helm upgrade -n stackrox --install --create-namespace stackrox-monitoring "${COM rm "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" echo "Deployed Monitoring..." 
-kubectl -n stackrox patch deploy/sensor --patch-file="${SCRIPT_DIR}/patch-monitoring.json" +kubectl -n stackrox patch deploy/monitoring --patch-file="${SCRIPT_DIR}/patch-monitoring.json" From 765eb457bc578487a50072f22cea90cf882a7840 Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Thu, 4 Sep 2025 15:42:55 -0700 Subject: [PATCH 07/17] Replace prometheus configmap so berserker containers won't be scraped --- release/start-secured-cluster/prometheus.yaml | 110 ++++++++++++++++++ .../start-secured-cluster.sh | 6 +- 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 release/start-secured-cluster/prometheus.yaml diff --git a/release/start-secured-cluster/prometheus.yaml b/release/start-secured-cluster/prometheus.yaml new file mode 100644 index 0000000..c88d821 --- /dev/null +++ b/release/start-secured-cluster/prometheus.yaml @@ -0,0 +1,110 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/name: stackrox +data: + prometheus.yml: |- + global: + scrape_interval: 30s + + alerting: + alertmanagers: + - static_configs: + - targets: + - {{ .Release.Name }}-alertmanager:9093 + + rule_files: + - /etc/prometheus/rules_*.yml + + scrape_configs: + - job_name: "kubernetes-pods" + tls_config: + insecure_skip_verify: false + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: node_name + + - job_name: "kubernetes-cadvisor" + scheme: https + metrics_path: /metrics/cadvisor + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + authorization: + credentials_file: 
/var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - source_labels: [__meta_kubernetes_pod_container_name] + action: drop + regex: berserker + + - job_name: stackrox + tls_config: + insecure_skip_verify: false + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_endpoint_port_name] + action: keep + regex: monitoring + - source_labels: [__meta_kubernetes_endpoints_name] + action: replace + target_label: job + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_endpoint_node_name] + action: replace + target_label: node_name + + rules_alerts_kubernetes.yml: |- + groups: + - name: Kubernetes + rules: + - alert: KubernetesContainerOomKiller + expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1 + for: 0m + labels: + severity: warning + annotations: + summary: Kubernetes container OOM killer (pod {{ "{{" }} $labels.exported_pod {{ "}}" }}) + description: "Container {{ "{{" }} $labels.container {{ "}}" }} in pod {{ "{{" }} $labels.exported_namespace {{ "}}" }}/{{ "{{" }} $labels.exported_pod {{ "}}" }} has been OOMKilled {{ "{{" }} $value {{ "}}" }} times in the last 10 minutes." + + - alert: KubernetesPodCrashLooping + expr: increase(kube_pod_container_status_restarts_total[5m]) > 0 + for: 2m + labels: + severity: warning + annotations: + summary: Kubernetes pod crash looping (pod {{ "{{" }} $labels.exported_pod {{ "}}" }}) + description: "Pod {{ "{{" }} $labels.exported_namespace {{ "}}" }}/{{ "{{" }} $labels.exported_pod {{ "}}" }} is crash looping." 
+ + - alert: KubernetesReplicaSetMismatch + expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas + for: 10m + labels: + severity: warning + annotations: + summary: Kubernetes ReplicaSet mismatch (replicaset {{ "{{" }} $labels.replicaset {{ "}}" }}) + description: "Replicas mismatch in ReplicaSet {{ "{{" }} $labels.exported_namespace {{ "}}" }}/{{ "{{" }} $labels.replicaset {{ "}}" }}" diff --git a/release/start-secured-cluster/start-secured-cluster.sh b/release/start-secured-cluster/start-secured-cluster.sh index 3b8f8b2..327b5bd 100755 --- a/release/start-secured-cluster/start-secured-cluster.sh +++ b/release/start-secured-cluster/start-secured-cluster.sh @@ -24,4 +24,8 @@ helm upgrade -n stackrox --install --create-namespace stackrox-monitoring "${COM rm "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" echo "Deployed Monitoring..." -kubectl -n stackrox patch deploy/monitoring --patch-file="${SCRIPT_DIR}/patch-monitoring.json" +#kubectl -n stackrox patch deploy/monitoring --patch-file="${SCRIPT_DIR}/patch-monitoring.json" + +# Replace the prometheus ConfigMap with one that won't scrape berserker containers +kubectl -n stackrox delete configmap prometheus +kubectl create -f ${SCRIPT_DIR}/prometheus.yaml From 932361d711e5c39da169f3db65531e041288b08a Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Thu, 4 Sep 2025 16:56:58 -0700 Subject: [PATCH 08/17] Corrected prometheus.yaml. 
Monitoring pod not immediately crashing --- release/start-secured-cluster/prometheus.yaml | 35 ++----------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/release/start-secured-cluster/prometheus.yaml b/release/start-secured-cluster/prometheus.yaml index c88d821..95142cd 100644 --- a/release/start-secured-cluster/prometheus.yaml +++ b/release/start-secured-cluster/prometheus.yaml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: prometheus - namespace: {{ .Release.Namespace }} + namespace: stackrox labels: app.kubernetes.io/name: stackrox data: @@ -14,7 +14,7 @@ data: alertmanagers: - static_configs: - targets: - - {{ .Release.Name }}-alertmanager:9093 + - stackrox-monitoring-alertmanager:9093 rule_files: - /etc/prometheus/rules_*.yml @@ -77,34 +77,3 @@ data: - source_labels: [__meta_kubernetes_endpoint_node_name] action: replace target_label: node_name - - rules_alerts_kubernetes.yml: |- - groups: - - name: Kubernetes - rules: - - alert: KubernetesContainerOomKiller - expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1 - for: 0m - labels: - severity: warning - annotations: - summary: Kubernetes container OOM killer (pod {{ "{{" }} $labels.exported_pod {{ "}}" }}) - description: "Container {{ "{{" }} $labels.container {{ "}}" }} in pod {{ "{{" }} $labels.exported_namespace {{ "}}" }}/{{ "{{" }} $labels.exported_pod {{ "}}" }} has been OOMKilled {{ "{{" }} $value {{ "}}" }} times in the last 10 minutes." 
- - - alert: KubernetesPodCrashLooping - expr: increase(kube_pod_container_status_restarts_total[5m]) > 0 - for: 2m - labels: - severity: warning - annotations: - summary: Kubernetes pod crash looping (pod {{ "{{" }} $labels.exported_pod {{ "}}" }}) - description: "Pod {{ "{{" }} $labels.exported_namespace {{ "}}" }}/{{ "{{" }} $labels.exported_pod {{ "}}" }} is crash looping." - - - alert: KubernetesReplicaSetMismatch - expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas - for: 10m - labels: - severity: warning - annotations: - summary: Kubernetes ReplicaSet mismatch (replicaset {{ "{{" }} $labels.replicaset {{ "}}" }}) - description: "Replicas mismatch in ReplicaSet {{ "{{" }} $labels.exported_namespace {{ "}}" }}/{{ "{{" }} $labels.replicaset {{ "}}" }}" From a9a98441f00451d46993d4d73e8078decf31f933 Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Fri, 5 Sep 2025 11:13:51 -0700 Subject: [PATCH 09/17] Metrics for berserker containers should be dropped for real --- release/start-secured-cluster/prometheus.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/release/start-secured-cluster/prometheus.yaml b/release/start-secured-cluster/prometheus.yaml index 95142cd..193a18a 100644 --- a/release/start-secured-cluster/prometheus.yaml +++ b/release/start-secured-cluster/prometheus.yaml @@ -53,9 +53,11 @@ data: relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - - source_labels: [__meta_kubernetes_pod_container_name] - action: drop + + metric_relabel_configs: + - source_labels: [container] regex: berserker + action: drop - job_name: stackrox tls_config: From a426226f4eece68bffed386852c82cb0138053f5 Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Fri, 5 Sep 2025 14:55:45 -0700 Subject: [PATCH 10/17] Not collecting metrics for berserker namespaces --- release/start-secured-cluster/prometheus.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/release/start-secured-cluster/prometheus.yaml b/release/start-secured-cluster/prometheus.yaml index 193a18a..3618b6e 100644 --- a/release/start-secured-cluster/prometheus.yaml +++ b/release/start-secured-cluster/prometheus.yaml @@ -58,6 +58,9 @@ data: - source_labels: [container] regex: berserker action: drop + - source_labels: [namespace] + regex: berserker-* + action: drop - job_name: stackrox tls_config: From 3746973f7fd9d755a7e8d4ff8133c4c3d7c23c28 Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Sat, 13 Sep 2025 11:34:59 -0700 Subject: [PATCH 11/17] Increased the memory for the prometheus container to 8Gi --- release/start-secured-cluster/patch-monitoring.json | 2 +- release/start-secured-cluster/start-secured-cluster.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/release/start-secured-cluster/patch-monitoring.json b/release/start-secured-cluster/patch-monitoring.json index c9d45cf..94dbe04 100644 --- a/release/start-secured-cluster/patch-monitoring.json +++ b/release/start-secured-cluster/patch-monitoring.json @@ -11,7 +11,7 @@ "cpu": "1" }, "limits": { - "memory": "4Gi", + "memory": "8Gi", "cpu": "1" } } diff --git a/release/start-secured-cluster/start-secured-cluster.sh b/release/start-secured-cluster/start-secured-cluster.sh index 327b5bd..49a5728 100755 --- a/release/start-secured-cluster/start-secured-cluster.sh +++ b/release/start-secured-cluster/start-secured-cluster.sh @@ -24,8 +24,8 @@ helm upgrade -n stackrox --install --create-namespace stackrox-monitoring "${COM rm "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" echo "Deployed Monitoring..." 
-#kubectl -n stackrox patch deploy/monitoring --patch-file="${SCRIPT_DIR}/patch-monitoring.json" - -# Replace the prometheus ConfigMap with one that won't scrape berserker containers +# Replace the prometheus ConfigMap with one that doesn't scrape as much info from berserker containers kubectl -n stackrox delete configmap prometheus kubectl create -f ${SCRIPT_DIR}/prometheus.yaml + +kubectl -n stackrox patch deploy/monitoring --patch-file="${SCRIPT_DIR}/patch-monitoring.json" From d261f4d6113f39451fd48534c447ee74c4c994ca Mon Sep 17 00:00:00 2001 From: Jouko Virtanen Date: Sun, 28 Sep 2025 21:25:41 -0700 Subject: [PATCH 12/17] Update release/start-secured-cluster/start-secured-cluster.sh Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- release/start-secured-cluster/start-secured-cluster.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/start-secured-cluster/start-secured-cluster.sh b/release/start-secured-cluster/start-secured-cluster.sh index 49a5728..d8d6850 100755 --- a/release/start-secured-cluster/start-secured-cluster.sh +++ b/release/start-secured-cluster/start-secured-cluster.sh @@ -26,6 +26,6 @@ echo "Deployed Monitoring..." 
# Replace the prometheus ConfigMap with one that doesn't scrape as much info from berserker containers kubectl -n stackrox delete configmap prometheus -kubectl create -f ${SCRIPT_DIR}/prometheus.yaml +kubectl create -f "${SCRIPT_DIR}"/prometheus.yaml kubectl -n stackrox patch deploy/monitoring --patch-file="${SCRIPT_DIR}/patch-monitoring.json" From 873b4cd04a257ef71ead455e09f45f98215e62cb Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Sun, 28 Sep 2025 21:28:11 -0700 Subject: [PATCH 13/17] Changed the kube burner config ref back --- .github/workflows/create-demo-clusters.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/create-demo-clusters.yml b/.github/workflows/create-demo-clusters.yml index 5fad52e..c223b4d 100644 --- a/.github/workflows/create-demo-clusters.yml +++ b/.github/workflows/create-demo-clusters.yml @@ -433,7 +433,7 @@ jobs: with: repository: stackrox/${{ inputs.kube-burner-config-repo }} path: .kube-burner-config - ref: jv-ROX-28976-optimize-berserker-load-in-long-running-cluster + ref: ${{ inputs.kube-burner-config-ref }} # TODO(ROX-29223): Remove once old versions don't use the benchmark-operator - name: Check out cloud-bulldozer/benchmark-operator code run: | From 7782263e9159af363027dde849d886a63f8fc73c Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Mon, 29 Sep 2025 16:35:25 -0700 Subject: [PATCH 14/17] Using yq to set memory limits and requests to 8Gi --- release/start-secured-cluster/start-secured-cluster.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/release/start-secured-cluster/start-secured-cluster.sh b/release/start-secured-cluster/start-secured-cluster.sh index d8d6850..4d028bd 100755 --- a/release/start-secured-cluster/start-secured-cluster.sh +++ b/release/start-secured-cluster/start-secured-cluster.sh @@ -13,19 +13,22 @@ kubectl -n stackrox create secret generic access-rhacs \ kubectl create -f "${SCRIPT_DIR}/collector-config.yaml" echo "Deploying 
Monitoring..." +monitoring_values_file="${COMMON_DIR}/../charts/monitoring/values.yaml" +yq -i '.resources.requests.memory = "8Gi"' "$monitoring_values_file" +yq -i '.resources.limits.memory = "8Gi"' "$monitoring_values_file" + helm_args=( --set persistence.type="${STORAGE}" --set exposure.type="${MONITORING_LOAD_BALANCER}" ) helm dependency update "${COMMON_DIR}/../charts/monitoring" -envsubst < "${COMMON_DIR}/../charts/monitoring/values.yaml" > "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" +envsubst < "$monitoring_values_file" > "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" helm upgrade -n stackrox --install --create-namespace stackrox-monitoring "${COMMON_DIR}/../charts/monitoring" --values "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" "${helm_args[@]}" rm "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" -echo "Deployed Monitoring..." # Replace the prometheus ConfigMap with one that doesn't scrape as much info from berserker containers kubectl -n stackrox delete configmap prometheus kubectl create -f "${SCRIPT_DIR}"/prometheus.yaml -kubectl -n stackrox patch deploy/monitoring --patch-file="${SCRIPT_DIR}/patch-monitoring.json" +#kubectl -n stackrox patch deploy/monitoring --patch-file="${SCRIPT_DIR}/patch-monitoring.json" From 9b9249fe1468d9a054ddc45414815fb5af2085e3 Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Mon, 29 Sep 2025 18:46:40 -0700 Subject: [PATCH 15/17] Removed patch-monitoring.json --- .../patch-monitoring.json | 23 ------------------- .../start-secured-cluster.sh | 2 -- 2 files changed, 25 deletions(-) delete mode 100644 release/start-secured-cluster/patch-monitoring.json diff --git a/release/start-secured-cluster/patch-monitoring.json b/release/start-secured-cluster/patch-monitoring.json deleted file mode 100644 index 94dbe04..0000000 --- a/release/start-secured-cluster/patch-monitoring.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "spec": { - "template": { - "spec": { - "containers": [ - { 
- "name": "prometheus", - "resources": { - "requests": { - "memory": "2Gi", - "cpu": "1" - }, - "limits": { - "memory": "8Gi", - "cpu": "1" - } - } - } - ] - } - } - } -} diff --git a/release/start-secured-cluster/start-secured-cluster.sh b/release/start-secured-cluster/start-secured-cluster.sh index 4d028bd..a242687 100755 --- a/release/start-secured-cluster/start-secured-cluster.sh +++ b/release/start-secured-cluster/start-secured-cluster.sh @@ -30,5 +30,3 @@ rm "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" # Replace the prometheus ConfigMap with one that doesn't scrape as much info from berserker containers kubectl -n stackrox delete configmap prometheus kubectl create -f "${SCRIPT_DIR}"/prometheus.yaml - -#kubectl -n stackrox patch deploy/monitoring --patch-file="${SCRIPT_DIR}/patch-monitoring.json" From 04a2cb756a76775a2bfbf24e45098e5f88300762 Mon Sep 17 00:00:00 2001 From: JoukoVirtanen Date: Mon, 29 Sep 2025 21:37:15 -0700 Subject: [PATCH 16/17] Removed redundant creation of configmap --- release/start-kube-burner/start-kube-burner.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/release/start-kube-burner/start-kube-burner.sh b/release/start-kube-burner/start-kube-burner.sh index 26aa685..8be9ef1 100755 --- a/release/start-kube-burner/start-kube-burner.sh +++ b/release/start-kube-burner/start-kube-burner.sh @@ -39,8 +39,6 @@ temp_metrics_file="${DIR}"/metrics.yml sed '/captureStart/d' "${KUBE_BURNER_METRICS_FILE}" > "$temp_metrics_file" kubectl create configmap --from-file="$temp_metrics_file" kube-burner-metrics-config -n kube-burner -kubectl create configmap --from-file="$KUBE_BURNER_METRICS_FILE" kube-burner-metrics-config -n kube-burner - kubectl create -f "${DIR}"/service-account.yaml kubectl create -f "${DIR}"/cluster-role-binding.yaml From c05bc366cf600841377609ec183d12cc5b689866 Mon Sep 17 00:00:00 2001 From: Tom Martensen Date: Thu, 2 Oct 2025 10:43:01 +0200 Subject: [PATCH 17/17] counter proposal with values files --- 
.../monitoring-values-override.yaml | 72 ++++++++++++++++ release/start-secured-cluster/prometheus.yaml | 84 ------------------- .../start-secured-cluster.sh | 15 ++-- 3 files changed, 78 insertions(+), 93 deletions(-) create mode 100644 release/start-secured-cluster/monitoring-values-override.yaml delete mode 100644 release/start-secured-cluster/prometheus.yaml diff --git a/release/start-secured-cluster/monitoring-values-override.yaml b/release/start-secured-cluster/monitoring-values-override.yaml new file mode 100644 index 0000000..68dfca9 --- /dev/null +++ b/release/start-secured-cluster/monitoring-values-override.yaml @@ -0,0 +1,72 @@ +resources: + requests: + memory: "8Gi" + limits: + memory: "8Gi" + +prometheus: + scrape_configs: | + - job_name: "kubernetes-pods" + tls_config: + insecure_skip_verify: false + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: node_name + + - job_name: "kubernetes-cadvisor" + scheme: https + metrics_path: /metrics/cadvisor + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + authorization: + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + + metric_relabel_configs: + - source_labels: [container] + regex: berserker + action: drop + - source_labels: [namespace] + regex: berserker-.* + action: drop + + - job_name: stackrox + tls_config: + insecure_skip_verify: false + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - 
source_labels: [__meta_kubernetes_endpoint_port_name] + action: keep + regex: monitoring + - source_labels: [__meta_kubernetes_endpoints_name] + action: replace + target_label: job + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_endpoint_node_name] + action: replace + target_label: node_name + + rules_custom: "" diff --git a/release/start-secured-cluster/prometheus.yaml b/release/start-secured-cluster/prometheus.yaml deleted file mode 100644 index 3618b6e..0000000 --- a/release/start-secured-cluster/prometheus.yaml +++ /dev/null @@ -1,84 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus - namespace: stackrox - labels: - app.kubernetes.io/name: stackrox -data: - prometheus.yml: |- - global: - scrape_interval: 30s - - alerting: - alertmanagers: - - static_configs: - - targets: - - stackrox-monitoring-alertmanager:9093 - - rule_files: - - /etc/prometheus/rules_*.yml - - scrape_configs: - - job_name: "kubernetes-pods" - tls_config: - insecure_skip_verify: false - kubernetes_sd_configs: - - role: pod - namespaces: - own_namespace: true - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - - source_labels: [__meta_kubernetes_pod_node_name] - action: replace - target_label: node_name - - - job_name: "kubernetes-cadvisor" - scheme: https - metrics_path: /metrics/cadvisor - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: true - authorization: - credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - metric_relabel_configs: - - source_labels: [container] - regex: berserker - 
action: drop - - source_labels: [namespace] - regex: berserker-* - action: drop - - - job_name: stackrox - tls_config: - insecure_skip_verify: false - kubernetes_sd_configs: - - role: endpoints - namespaces: - own_namespace: true - relabel_configs: - - source_labels: [__meta_kubernetes_endpoint_port_name] - action: keep - regex: monitoring - - source_labels: [__meta_kubernetes_endpoints_name] - action: replace - target_label: job - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_endpoint_node_name] - action: replace - target_label: node_name diff --git a/release/start-secured-cluster/start-secured-cluster.sh b/release/start-secured-cluster/start-secured-cluster.sh index a242687..be84c0f 100755 --- a/release/start-secured-cluster/start-secured-cluster.sh +++ b/release/start-secured-cluster/start-secured-cluster.sh @@ -13,9 +13,6 @@ kubectl -n stackrox create secret generic access-rhacs \ kubectl create -f "${SCRIPT_DIR}/collector-config.yaml" echo "Deploying Monitoring..." 
-monitoring_values_file="${COMMON_DIR}/../charts/monitoring/values.yaml" -yq -i '.resources.requests.memory = "8Gi"' "$monitoring_values_file" -yq -i '.resources.limits.memory = "8Gi"' "$monitoring_values_file" helm_args=( --set persistence.type="${STORAGE}" @@ -23,10 +20,10 @@ helm_args=( ) helm dependency update "${COMMON_DIR}/../charts/monitoring" -envsubst < "$monitoring_values_file" > "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" -helm upgrade -n stackrox --install --create-namespace stackrox-monitoring "${COMMON_DIR}/../charts/monitoring" --values "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" "${helm_args[@]}" -rm "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" +envsubst < "${COMMON_DIR}/../charts/monitoring/values.yaml" > "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" +helm upgrade -n stackrox --install --create-namespace stackrox-monitoring "${COMMON_DIR}/../charts/monitoring" \ + --values "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml" \ + --values "${SCRIPT_DIR}/monitoring-values-override.yaml" \ + "${helm_args[@]}" -# Replace the prometheus ConfigMap with one that doesn't scrape as much info from berserker containers -kubectl -n stackrox delete configmap prometheus -kubectl create -f "${SCRIPT_DIR}"/prometheus.yaml +rm "${COMMON_DIR}/../charts/monitoring/values_substituted.yaml"