diff --git a/internal/constants/constants.go b/internal/constants/constants.go index 875193c226..a6a8bd6c0a 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -39,18 +39,21 @@ const ( InjectTrustedCABundleLabel = "config.openshift.io/inject-trusted-cabundle" //ServiceAccountSecretPath is the path to find the projected serviceAccount token and other SA secrets - ServiceAccountSecretPath = "/var/run/ocp-collector/serviceaccount" - TrustedCABundleMountFile = "tls-ca-bundle.pem" - TrustedCABundleMountDir = "/etc/pki/ca-trust/extracted/pem/" - ElasticsearchName = "elasticsearch" - VectorName = "vector" - KibanaName = "kibana" - LogfilesmetricexporterName = "logfilesmetricexporter" - LogfilesmetricexporterPort = int32(2112) - MetricsPortName = "metrics" - MetricsPort = int32(24231) - PodSecurityLabelEnforce = "pod-security.kubernetes.io/enforce" - PodSecurityLabelValue = "privileged" + ServiceAccountSecretPath = "/var/run/ocp-collector/serviceaccount" + TrustedCABundleMountFile = "tls-ca-bundle.pem" + TrustedCABundleMountDir = "/etc/pki/ca-trust/extracted/pem/" + ElasticsearchName = "elasticsearch" + VectorName = "vector" + KibanaName = "kibana" + LogfilesmetricexporterName = "logfilesmetricexporter" + LogfilesmetricexporterPort = int32(2112) + MetricsPortName = "metrics" + MetricsPort = int32(24231) + MetricsCollectionProfileFull = "full" + MetricsCollectionProfileMinimal = "minimal" + MetricsCollectionProfileTelemetry = "telemetry" + PodSecurityLabelEnforce = "pod-security.kubernetes.io/enforce" + PodSecurityLabelValue = "privileged" // Disable gosec linter, complains "possible hard-coded secret" CollectorSecretsDir = "/var/run/ocp-collector/secrets" //nolint:gosec ConfigMapBaseDir = "/var/run/ocp-collector/config" diff --git a/internal/constants/labels.go b/internal/constants/labels.go index 9cede163b7..0540e414d0 100644 --- a/internal/constants/labels.go +++ b/internal/constants/labels.go @@ -14,6 +14,8 @@ const ( LabelLoggingServiceType = "logging.observability.openshift.io/service-type" LabelLoggingInputServiceType = "logging.observability.openshift.io/input-service-type" + LabelMetricsCollectionProfile = "monitoring.openshift.io/collection-profile" + ServiceTypeMetrics = "metrics" ServiceTypeInput = "input" ) diff --git a/internal/controller/observability/collector.go b/internal/controller/observability/collector.go index c3a708cf9c..643a06b629 100644 --- a/internal/controller/observability/collector.go +++ b/internal/controller/observability/collector.go @@ -153,8 +153,16 @@ func ReconcileCollector(context internalcontext.ForwarderContext, pollInterval, return err } metricsSelector := metrics.BuildSelector(constants.CollectorName, resourceNames.CommonName) - if err := metrics.ReconcileServiceMonitor(context.Client, context.Forwarder.Namespace, resourceNames.CommonName, ownerRef, metricsSelector, constants.MetricsPortName); err != nil { - log.Error(err, "collector.ReconcileServiceMonitor") + if err := metrics.ReconcileServiceMonitor(context.Client, context.Forwarder.Namespace, resourceNames.CommonName, resourceNames.CommonName, ownerRef, metricsSelector, constants.MetricsPortName, metrics.FullRelabelConfigs, constants.MetricsCollectionProfileFull); err != nil { + log.Error(err, "collector.ReconcileServiceMonitor full") + return err + } + if err := metrics.ReconcileServiceMonitor(context.Client, context.Forwarder.Namespace, constants.MetricsCollectionProfileMinimal+"-"+resourceNames.CommonName, resourceNames.CommonName, ownerRef, metricsSelector, constants.MetricsPortName, metrics.CollectorMinimalRelabelConfigs, constants.MetricsCollectionProfileMinimal); err != nil { + log.Error(err, "collector.ReconcileServiceMonitor minimal") + return err + } + if err := metrics.ReconcileServiceMonitor(context.Client, context.Forwarder.Namespace, constants.MetricsCollectionProfileTelemetry+"-"+resourceNames.CommonName, resourceNames.CommonName, ownerRef, metricsSelector, constants.MetricsPortName, metrics.CollectorTelemetryRelabelConfigs, constants.MetricsCollectionProfileTelemetry); err != nil { + log.Error(err, "collector.ReconcileServiceMonitor telemetry") return err } diff --git a/internal/controller/observability/collector_test.go b/internal/controller/observability/collector_test.go index 4424d41ad6..74c7f2c84f 100644 --- a/internal/controller/observability/collector_test.go +++ b/internal/controller/observability/collector_test.go @@ -225,8 +225,31 @@ var _ = Describe("Reconciling the Collector", func() { service := &corev1.Service{} Expect(client.Get(context.TODO(), key, service)).Should(Succeed(), "Exp. to create a Service for metrics") - sm := &monitoringv1.ServiceMonitor{} - Expect(client.Get(context.TODO(), key, sm)).Should(Succeed(), "Exp. to create a ServiceMonitor for metrics") + By("verifying the full profile ServiceMonitor") + fullSM := &monitoringv1.ServiceMonitor{} + Expect(client.Get(context.TODO(), key, fullSM)).Should(Succeed(), "Exp. to create a full profile ServiceMonitor") + Expect(fullSM.Labels[constants.LabelMetricsCollectionProfile]).To(Equal(constants.MetricsCollectionProfileFull)) + Expect(fullSM.Spec.Endpoints).ToNot(BeEmpty()) + Expect(fullSM.Spec.Endpoints[0].MetricRelabelConfigs).To(HaveLen(1), "full profile should only have the rename rule") + + By("verifying the minimal profile ServiceMonitor") + minimalKey := types.NamespacedName{Name: constants.MetricsCollectionProfileMinimal + "-" + clfName, Namespace: namespaceName} + minimalSM := &monitoringv1.ServiceMonitor{} + Expect(client.Get(context.TODO(), minimalKey, minimalSM)).Should(Succeed(), "Exp. to create a minimal profile ServiceMonitor") + Expect(minimalSM.Labels[constants.LabelMetricsCollectionProfile]).To(Equal(constants.MetricsCollectionProfileMinimal)) + Expect(minimalSM.Spec.Endpoints).ToNot(BeEmpty()) + Expect(minimalSM.Spec.Endpoints[0].MetricRelabelConfigs).To(HaveLen(3), "minimal profile should have rename + keep + drop") + Expect(string(minimalSM.Spec.Endpoints[0].MetricRelabelConfigs[1].Action)).To(Equal("keep")) + Expect(string(minimalSM.Spec.Endpoints[0].MetricRelabelConfigs[2].Action)).To(Equal("drop")) + + By("verifying the telemetry profile ServiceMonitor") + telemetryKey := types.NamespacedName{Name: constants.MetricsCollectionProfileTelemetry + "-" + clfName, Namespace: namespaceName} + telemetrySM := &monitoringv1.ServiceMonitor{} + Expect(client.Get(context.TODO(), telemetryKey, telemetrySM)).Should(Succeed(), "Exp. to create a telemetry profile ServiceMonitor") + Expect(telemetrySM.Labels[constants.LabelMetricsCollectionProfile]).To(Equal(constants.MetricsCollectionProfileTelemetry)) + Expect(telemetrySM.Spec.Endpoints).ToNot(BeEmpty()) + Expect(telemetrySM.Spec.Endpoints[0].MetricRelabelConfigs).To(HaveLen(2), "telemetry profile should have rename + keep") + Expect(string(telemetrySM.Spec.Endpoints[0].MetricRelabelConfigs[1].Action)).To(Equal("keep")) }, Entry("when deployed as a DaemonSet", forwarder), diff --git a/internal/metrics/logfilemetricexporter/metric_exporter.go b/internal/metrics/logfilemetricexporter/metric_exporter.go index b3dc6abe37..4e045381b0 100644 --- a/internal/metrics/logfilemetricexporter/metric_exporter.go +++ b/internal/metrics/logfilemetricexporter/metric_exporter.go @@ -76,8 +76,16 @@ func Reconcile(lfmeInstance *loggingv1alpha1.LogFileMetricExporter, } metricsSelector := metrics.BuildSelector(constants.LogfilesmetricexporterName, lfmeInstance.Name) - if err := metrics.ReconcileServiceMonitor(requestClient, lfmeInstance.Namespace, resNames.CommonName, owner, metricsSelector, constants.MetricsPortName); err != nil { - log.Error(err, "logfilemetricexporter.ReconcileServiceMonitor") + if err := metrics.ReconcileServiceMonitor(requestClient, lfmeInstance.Namespace, resNames.CommonName, resNames.CommonName, owner, metricsSelector, constants.MetricsPortName, metrics.FullRelabelConfigs, constants.MetricsCollectionProfileFull); err != nil { + log.Error(err, "logfilemetricexporter.ReconcileServiceMonitor full") + return err + } + if err := metrics.ReconcileServiceMonitor(requestClient, lfmeInstance.Namespace, constants.MetricsCollectionProfileMinimal+"-"+resNames.CommonName, resNames.CommonName, owner, metricsSelector, constants.MetricsPortName, metrics.LFMEMinimalRelabelConfigs, constants.MetricsCollectionProfileMinimal); err != nil { + log.Error(err, "logfilemetricexporter.ReconcileServiceMonitor minimal") + return err + } + if err := metrics.ReconcileServiceMonitor(requestClient, lfmeInstance.Namespace, constants.MetricsCollectionProfileTelemetry+"-"+resNames.CommonName, resNames.CommonName, owner, metricsSelector, constants.MetricsPortName, metrics.LFMETelemetryRelabelConfigs, constants.MetricsCollectionProfileTelemetry); err != nil { + log.Error(err, "logfilemetricexporter.ReconcileServiceMonitor telemetry") return err } diff --git a/internal/metrics/logfilemetricexporter/metric_exporter_test.go b/internal/metrics/logfilemetricexporter/metric_exporter_test.go index 82aab19004..e6c8a7a5c3 100644 --- a/internal/metrics/logfilemetricexporter/metric_exporter_test.go +++ b/internal/metrics/logfilemetricexporter/metric_exporter_test.go @@ -115,11 +115,11 @@ var _ = Describe("Reconcile LogFileMetricExporter", func() { Expect(serviceInstance.Annotations[constants.AnnotationServingCertSecretName]). To(Equal(ExporterMetricsSecretName)) - // ServiceMonitor - // Get and check the ServiceMonitor + // ServiceMonitor (full profile) Expect(reqClient.Get(context.TODO(), serviceMonitorKey, smInstance)).Should(Succeed()) Expect(smInstance.Name).To(Equal(constants.LogfilesmetricexporterName)) + Expect(smInstance.Labels[constants.LabelMetricsCollectionProfile]).To(Equal(constants.MetricsCollectionProfileFull)) expJobLabel := fmt.Sprintf("monitor-%s", constants.LogfilesmetricexporterName) Expect(smInstance.Spec.JobLabel).To(Equal(expJobLabel)) @@ -132,6 +132,27 @@ var _ = Describe("Reconcile LogFileMetricExporter", func() { Expect(smInstance.Spec.Endpoints[0].BearerTokenFile). To(Equal("/var/run/secrets/kubernetes.io/serviceaccount/token")) + Expect(smInstance.Spec.Endpoints[0].MetricRelabelConfigs).To(HaveLen(1), "full profile should only have the rename rule") + + // ServiceMonitor (minimal profile) + minimalName := constants.MetricsCollectionProfileMinimal + "-" + constants.LogfilesmetricexporterName + minimalSM := &monitoringv1.ServiceMonitor{} + Expect(reqClient.Get(context.TODO(), types.NamespacedName{Name: minimalName, Namespace: namespace.Name}, minimalSM)).Should(Succeed()) + Expect(minimalSM.Labels[constants.LabelMetricsCollectionProfile]).To(Equal(constants.MetricsCollectionProfileMinimal)) + Expect(minimalSM.Spec.Endpoints).ToNot(BeEmpty()) + Expect(minimalSM.Spec.Endpoints[0].TLSConfig.SafeTLSConfig.ServerName).To(Equal(svcURL)) + Expect(minimalSM.Spec.Endpoints[0].MetricRelabelConfigs).To(HaveLen(2), "LFME minimal profile should have rename + keep") + Expect(string(minimalSM.Spec.Endpoints[0].MetricRelabelConfigs[1].Action)).To(Equal("keep")) + + // ServiceMonitor (telemetry profile) + telemetryName := constants.MetricsCollectionProfileTelemetry + "-" + constants.LogfilesmetricexporterName + telemetrySM := &monitoringv1.ServiceMonitor{} + Expect(reqClient.Get(context.TODO(), types.NamespacedName{Name: telemetryName, Namespace: namespace.Name}, telemetrySM)).Should(Succeed()) + Expect(telemetrySM.Labels[constants.LabelMetricsCollectionProfile]).To(Equal(constants.MetricsCollectionProfileTelemetry)) + Expect(telemetrySM.Spec.Endpoints).ToNot(BeEmpty()) + Expect(telemetrySM.Spec.Endpoints[0].TLSConfig.SafeTLSConfig.ServerName).To(Equal(svcURL)) + Expect(telemetrySM.Spec.Endpoints[0].MetricRelabelConfigs).To(HaveLen(2), "LFME telemetry profile should have rename + keep") + Expect(string(telemetrySM.Spec.Endpoints[0].MetricRelabelConfigs[1].Action)).To(Equal("keep")) // Metrics Auth RBAC // Verify the metrics auth ClusterRoleBinding exists and references system:auth-delegator diff --git a/internal/metrics/relabel.go b/internal/metrics/relabel.go new file mode 100644 index 0000000000..2234e6586c --- /dev/null +++ b/internal/metrics/relabel.go @@ -0,0 +1,114 @@ +package metrics + +import ( + "strings" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +) + +type metricAllowlistConfig struct { + allowedMetrics []string +} + +type metricDropConfig struct { + labelName string + labelValue string + excludeMetrics []string +} + +var collectorMinimalAllowlist = &metricAllowlistConfig{ + allowedMetrics: []string{ + // Metrics used in alerts (collector_alerts.yaml) + "logcollector_component_event_unmatched_count", + "vector_http_client_errors_total", + "vector_http_client_requests_sent_total", + "vector_http_client_responses_total", + "vector_buffer_byte_size", + "vector_component_errors_total", + "vector_component_received_events_total", + + // Metrics used in recording rules (collector_alerts.yaml, telemetry_rules.yaml) + "vector_component_received_bytes_total", + + // Metrics used in dashboards (openshift-logging-dashboard.json) + "vector_component_sent_bytes_total", + "vector_component_received_event_bytes_total", + "vector_open_files", + "vector_component_discarded_events_total", + + // Additional buffer and event metrics + "vector_buffer_discarded_events_total", + "vector_buffer_events", + "vector_buffer_sent_events_total", + "vector_events_in_total", + }, +} + +var collectorMinimalDropConfigs = []metricDropConfig{ + { + labelName: "component_kind", + labelValue: "transform", + excludeMetrics: []string{ + "vector_component_received_bytes_total", + "vector_component_received_event_bytes_total", + "vector_component_received_events_total", + "vector_component_sent_bytes_total", + }, + }, +} + +var collectorTelemetryAllowlist = &metricAllowlistConfig{ + allowedMetrics: []string{ + // Used in recording rule (telemetry_rules.yaml) + "vector_component_received_bytes_total", + }, +} + +var lfmeMinimalAllowlist = &metricAllowlistConfig{ + allowedMetrics: []string{ + // Used in recording rule (collector_alerts.yaml) and dashboard + "log_logged_bytes_total", + }, +} + +var lfmeTelemetryAllowlist = &metricAllowlistConfig{ + allowedMetrics: []string{ + // Used in recording rule (collector_alerts.yaml) + "log_logged_bytes_total", + }, +} + +var CollectorMinimalRelabelConfigs = buildRelabelConfigs(collectorMinimalAllowlist, collectorMinimalDropConfigs) +var LFMEMinimalRelabelConfigs = buildRelabelConfigs(lfmeMinimalAllowlist, nil) +var CollectorTelemetryRelabelConfigs = buildRelabelConfigs(collectorTelemetryAllowlist, nil) +var LFMETelemetryRelabelConfigs = buildRelabelConfigs(lfmeTelemetryAllowlist, nil) +var FullRelabelConfigs = buildRelabelConfigs(nil, nil) + +func buildRelabelConfigs(allowlist *metricAllowlistConfig, dropConfigs []metricDropConfig) []*monitoringv1.RelabelConfig { + configs := []*monitoringv1.RelabelConfig{ + { + SourceLabels: []monitoringv1.LabelName{"__name__"}, + TargetLabel: "__name__", + Regex: "(.*)-(.*)", + Replacement: "${1}_${2}", + }, + } + + if allowlist != nil && len(allowlist.allowedMetrics) > 0 { + configs = append(configs, &monitoringv1.RelabelConfig{ + Action: "keep", + SourceLabels: []monitoringv1.LabelName{"__name__"}, + Regex: strings.Join(allowlist.allowedMetrics, "|"), + }) + } + + for _, drop := range dropConfigs { + configs = append(configs, &monitoringv1.RelabelConfig{ + Action: "drop", + SourceLabels: []monitoringv1.LabelName{monitoringv1.LabelName(drop.labelName), "__name__"}, + Regex: drop.labelValue + ";(" + strings.Join(drop.excludeMetrics, "|") + ")", + }) + } + + return configs +} diff --git a/internal/metrics/relabel_test.go b/internal/metrics/relabel_test.go new file mode 100644 index 0000000000..db571d9355 --- /dev/null +++ b/internal/metrics/relabel_test.go @@ -0,0 +1,113 @@ +package metrics + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +) + +var _ = Describe("buildRelabelConfigs", func() { + It("should return only the rename rule when no allowlist or drops are provided", func() { + configs := buildRelabelConfigs(nil, nil) + Expect(configs).To(HaveLen(1)) + Expect(configs[0].TargetLabel).To(Equal("__name__")) + Expect(configs[0].Regex).To(Equal("(.*)-(.*)")) + Expect(configs[0].Replacement).To(Equal("${1}_${2}")) + }) + + It("should return rename + keep when an allowlist is provided", func() { + allowlist := &metricAllowlistConfig{ + allowedMetrics: []string{"metric_a", "metric_b"}, + } + configs := buildRelabelConfigs(allowlist, nil) + Expect(configs).To(HaveLen(2)) + + Expect(configs[0].Regex).To(Equal("(.*)-(.*)")) + + Expect(string(configs[1].Action)).To(Equal("keep")) + Expect(configs[1].SourceLabels).To(Equal([]monitoringv1.LabelName{"__name__"})) + Expect(configs[1].Regex).To(Equal("metric_a|metric_b")) + }) + + It("should return rename + keep + drop when allowlist and drops are provided", func() { + allowlist := &metricAllowlistConfig{ + allowedMetrics: []string{"metric_a", "metric_b", "metric_c"}, + } + drops := []metricDropConfig{ + { + labelName: "component_kind", + labelValue: "transform", + excludeMetrics: []string{"metric_a", "metric_b"}, + }, + } + configs := buildRelabelConfigs(allowlist, drops) + Expect(configs).To(HaveLen(3)) + + Expect(string(configs[1].Action)).To(Equal("keep")) + Expect(configs[1].Regex).To(Equal("metric_a|metric_b|metric_c")) + + Expect(string(configs[2].Action)).To(Equal("drop")) + Expect(configs[2].SourceLabels).To(Equal([]monitoringv1.LabelName{"component_kind", "__name__"})) + Expect(configs[2].Regex).To(Equal("transform;(metric_a|metric_b)")) + }) + + It("should build valid CollectorMinimalRelabelConfigs", func() { + Expect(CollectorMinimalRelabelConfigs).To(HaveLen(3)) + Expect(string(CollectorMinimalRelabelConfigs[1].Action)).To(Equal("keep")) + Expect(string(CollectorMinimalRelabelConfigs[2].Action)).To(Equal("drop")) + + keepRegex := CollectorMinimalRelabelConfigs[1].Regex + for _, m := range collectorMinimalAllowlist.allowedMetrics { + Expect(keepRegex).To(ContainSubstring(m), "missing metric in keep regex: %s", m) + } + }) + + It("should build valid LFMEMinimalRelabelConfigs", func() { + Expect(LFMEMinimalRelabelConfigs).To(HaveLen(2)) + Expect(string(LFMEMinimalRelabelConfigs[1].Action)).To(Equal("keep")) + Expect(LFMEMinimalRelabelConfigs[1].Regex).To(Equal("log_logged_bytes_total")) + }) + + It("should build valid CollectorTelemetryRelabelConfigs", func() { + Expect(CollectorTelemetryRelabelConfigs).To(HaveLen(2)) + Expect(string(CollectorTelemetryRelabelConfigs[1].Action)).To(Equal("keep")) + Expect(CollectorTelemetryRelabelConfigs[1].Regex).To(Equal("vector_component_received_bytes_total")) + }) + + It("should build valid LFMETelemetryRelabelConfigs", func() { + Expect(LFMETelemetryRelabelConfigs).To(HaveLen(2)) + Expect(string(LFMETelemetryRelabelConfigs[1].Action)).To(Equal("keep")) + Expect(LFMETelemetryRelabelConfigs[1].Regex).To(Equal("log_logged_bytes_total")) + }) + + It("should build FullRelabelConfigs with only the rename rule", func() { + Expect(FullRelabelConfigs).To(HaveLen(1)) + Expect(FullRelabelConfigs[0].Regex).To(Equal("(.*)-(.*)")) + }) + + It("should return only the rename rule when allowlist has empty metrics", func() { + allowlist := &metricAllowlistConfig{ + allowedMetrics: []string{}, + } + configs := buildRelabelConfigs(allowlist, nil) + Expect(configs).To(HaveLen(1)) + Expect(configs[0].TargetLabel).To(Equal("__name__")) + }) + + It("should return rename + drop when only drop configs are provided", func() { + drops := []metricDropConfig{ + { + labelName: "component_kind", + labelValue: "transform", + excludeMetrics: []string{"metric_a"}, + }, + } + configs := buildRelabelConfigs(nil, drops) + Expect(configs).To(HaveLen(2)) + + Expect(configs[0].Regex).To(Equal("(.*)-(.*)")) + Expect(string(configs[1].Action)).To(Equal("drop")) + Expect(configs[1].SourceLabels).To(Equal([]monitoringv1.LabelName{"component_kind", "__name__"})) + Expect(configs[1].Regex).To(Equal("transform;(metric_a)")) + }) +}) diff --git a/internal/metrics/service_monitor.go b/internal/metrics/service_monitor.go index 3723d4ec80..b39a1f6aec 100644 --- a/internal/metrics/service_monitor.go +++ b/internal/metrics/service_monitor.go @@ -17,7 +17,7 @@ const ( prometheusBearerTokenFile = "/var/run/secrets/kubernetes.io/serviceaccount/token" ) -func newServiceMonitor(namespace, name string, owner metav1.OwnerReference, selector map[string]string, portName string) *monitoringv1.ServiceMonitor { +func newServiceMonitor(namespace, name, serviceName string, owner metav1.OwnerReference, selector map[string]string, portName string, metricRelabelConfigs []*monitoringv1.RelabelConfig, profile string) *monitoringv1.ServiceMonitor { var endpoint = []monitoringv1.Endpoint{ { Port: portName, @@ -27,26 +27,18 @@ func newServiceMonitor(namespace, name string, owner metav1.OwnerReference, sele TLSConfig: &monitoringv1.TLSConfig{ CAFile: prometheusCAFile, SafeTLSConfig: monitoringv1.SafeTLSConfig{ - ServerName: fmt.Sprintf("%s.%s.svc", name, namespace), - }, - }, - // Replaces labels that have `-` with `_` - // Example: - // app_kubernetes_io_part-of -> app_kubernetes_io_part_of - MetricRelabelConfigs: []*monitoringv1.RelabelConfig{ - { - SourceLabels: []monitoringv1.LabelName{ - "__name__", - }, - TargetLabel: "__name__", - Regex: "(.*)-(.*)", - Replacement: "${1}_${2}", + ServerName: fmt.Sprintf("%s.%s.svc", serviceName, namespace), }, }, + MetricRelabelConfigs: metricRelabelConfigs, }, } desired := runtime.NewServiceMonitor(namespace, name) + if desired.Labels == nil { + desired.Labels = map[string]string{} + } + desired.Labels[constants.LabelMetricsCollectionProfile] = profile desired.Spec = monitoringv1.ServiceMonitorSpec{ JobLabel: fmt.Sprintf("monitor-%s", name), Endpoints: endpoint, @@ -77,7 +69,7 @@ func BuildSelector(component, instance string) map[string]string { } } -func ReconcileServiceMonitor(k8sClient client.Client, namespace, name string, owner metav1.OwnerReference, selector map[string]string, portName string) error { - desired := newServiceMonitor(namespace, name, owner, selector, portName) +func ReconcileServiceMonitor(k8sClient client.Client, namespace, name, serviceName string, owner metav1.OwnerReference, selector map[string]string, portName string, metricRelabelConfigs []*monitoringv1.RelabelConfig, profile string) error { + desired := newServiceMonitor(namespace, name, serviceName, owner, selector, portName, metricRelabelConfigs, profile) return reconcile.ServiceMonitor(k8sClient, desired) } diff --git a/internal/metrics/service_monitor_test.go b/internal/metrics/service_monitor_test.go index 404729e864..96c9cacbf9 100644 --- a/internal/metrics/service_monitor_test.go +++ b/internal/metrics/service_monitor_test.go @@ -22,8 +22,6 @@ var _ = Describe("Reconcile ServiceMonitor", func() { _ = monitoringv1.AddToScheme(scheme.Scheme) var ( - - // Adding ns and label to account for addSecurityLabelsToNamespace() added in LOG-2620 namespace = &corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{"test": "true"}, @@ -45,21 +43,23 @@ var _ = Describe("Reconcile ServiceMonitor", func() { smInstance = &monitoringv1.ServiceMonitor{} ) - It("should successfully reconcile the ServiceMonitor", func() { - // Reconcile the exporter daemonset + It("should reconcile a full profile ServiceMonitor with only the rename rule", func() { Expect(ReconcileServiceMonitor( reqClient, constants.OpenshiftNS, serviceName, + serviceName, owner, selector, portName, + FullRelabelConfigs, + constants.MetricsCollectionProfileFull, )).To(Succeed()) - // Get and check the ServiceMonitor Expect(reqClient.Get(context.TODO(), serviceMonitorKey, smInstance)).Should(Succeed()) Expect(smInstance.Name).To(Equal(serviceName)) + Expect(smInstance.Labels[constants.LabelMetricsCollectionProfile]).To(Equal(constants.MetricsCollectionProfileFull)) expJobLabel := fmt.Sprintf("monitor-%s", serviceName) Expect(smInstance.Spec.JobLabel).To(Equal(expJobLabel)) @@ -72,5 +72,85 @@ var _ = Describe("Reconcile ServiceMonitor", func() { Expect(smInstance.Spec.Endpoints[0].BearerTokenFile). To(Equal("/var/run/secrets/kubernetes.io/serviceaccount/token")) + + By("verifying MetricRelabelConfigs contains only the rename rule") + relabelConfigs := smInstance.Spec.Endpoints[0].MetricRelabelConfigs + Expect(relabelConfigs).To(HaveLen(1)) + Expect(relabelConfigs[0].Regex).To(Equal("(.*)-(.*)")) + Expect(relabelConfigs[0].TargetLabel).To(Equal("__name__")) + }) + + It("should reconcile a minimal profile ServiceMonitor with collector relabel configs", func() { + minimalName := constants.MetricsCollectionProfileMinimal + "-" + serviceName + Expect(ReconcileServiceMonitor( + reqClient, + constants.OpenshiftNS, + minimalName, + serviceName, + owner, + selector, + portName, + CollectorMinimalRelabelConfigs, + constants.MetricsCollectionProfileMinimal, + )).To(Succeed()) + + sm := &monitoringv1.ServiceMonitor{} + Expect(reqClient.Get(context.TODO(), types.NamespacedName{Name: minimalName, Namespace: constants.OpenshiftNS}, sm)).Should(Succeed()) + + Expect(sm.Labels[constants.LabelMetricsCollectionProfile]).To(Equal(constants.MetricsCollectionProfileMinimal)) + + By("verifying TLS ServerName uses the service name, not the ServiceMonitor name") + svcURL := fmt.Sprintf("%s.openshift-logging.svc", serviceName) + Expect(sm.Spec.Endpoints[0].TLSConfig.SafeTLSConfig.ServerName).To(Equal(svcURL)) + + By("verifying MetricRelabelConfigs contains rename + keep + drop") + relabelConfigs := sm.Spec.Endpoints[0].MetricRelabelConfigs + Expect(relabelConfigs).To(HaveLen(3)) + + Expect(relabelConfigs[0].Regex).To(Equal("(.*)-(.*)")) + Expect(string(relabelConfigs[1].Action)).To(Equal("keep")) + Expect(relabelConfigs[1].SourceLabels).To(Equal([]monitoringv1.LabelName{"__name__"})) + Expect(relabelConfigs[1].Regex).To(Equal(CollectorMinimalRelabelConfigs[1].Regex)) + Expect(string(relabelConfigs[2].Action)).To(Equal("drop")) + Expect(relabelConfigs[2].SourceLabels).To(Equal([]monitoringv1.LabelName{"component_kind", "__name__"})) + }) + + It("should update an existing ServiceMonitor on re-reconciliation", func() { + By("creating with full profile") + Expect(ReconcileServiceMonitor( + reqClient, + constants.OpenshiftNS, + "update-test", + "update-test", + owner, + selector, + portName, + FullRelabelConfigs, + constants.MetricsCollectionProfileFull, + )).To(Succeed()) + + sm := &monitoringv1.ServiceMonitor{} + key := types.NamespacedName{Name: "update-test", Namespace: constants.OpenshiftNS} + Expect(reqClient.Get(context.TODO(), key, sm)).Should(Succeed()) + Expect(sm.Labels[constants.LabelMetricsCollectionProfile]).To(Equal(constants.MetricsCollectionProfileFull)) + Expect(sm.Spec.Endpoints[0].MetricRelabelConfigs).To(HaveLen(1)) + + By("re-reconciling with minimal profile and different relabel configs") + Expect(ReconcileServiceMonitor( + reqClient, + constants.OpenshiftNS, + "update-test", + "update-test", + owner, + selector, + portName, + CollectorMinimalRelabelConfigs, + constants.MetricsCollectionProfileMinimal, + )).To(Succeed()) + + updated := &monitoringv1.ServiceMonitor{} + Expect(reqClient.Get(context.TODO(), key, updated)).Should(Succeed()) + Expect(updated.Labels[constants.LabelMetricsCollectionProfile]).To(Equal(constants.MetricsCollectionProfileMinimal)) + Expect(updated.Spec.Endpoints[0].MetricRelabelConfigs).To(HaveLen(3)) }) }) diff --git a/test/e2e/collection/metrics/profile_test.go b/test/e2e/collection/metrics/profile_test.go new file mode 100644 index 0000000000..7b95580741 --- /dev/null +++ b/test/e2e/collection/metrics/profile_test.go @@ -0,0 +1,123 @@ +package metrics + +import ( + "fmt" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + obs "github.com/openshift/cluster-logging-operator/api/observability/v1" + "github.com/openshift/cluster-logging-operator/internal/constants" + internalruntime "github.com/openshift/cluster-logging-operator/internal/runtime" + obsruntime "github.com/openshift/cluster-logging-operator/internal/runtime/observability" + "github.com/openshift/cluster-logging-operator/test/client" + framework "github.com/openshift/cluster-logging-operator/test/framework/e2e" + "github.com/openshift/cluster-logging-operator/test/helpers/loki" + "github.com/openshift/cluster-logging-operator/test/helpers/oc" + "github.com/openshift/cluster-logging-operator/test/helpers/prometheus" + "github.com/openshift/cluster-logging-operator/test/runtime" + corev1 "k8s.io/api/core/v1" +) + +const ( + allowlistedMetric = `vector_open_files` + nonAllowlistedMetric = `vector_started_total` +) + +var _ = Describe("[e2e][collection][metrics] Metrics Collection Profiles", Ordered, func() { + var ( + e2e *framework.E2ETestFramework + clf *obs.ClusterLogForwarder + l *loki.Receiver + forwarderName = "collector" + lokiNS string + stressorNS string + + sa *corev1.ServiceAccount + err error + ) + + BeforeAll(func() { + e2e = framework.NewE2ETestFramework() + lokiNS = e2e.CreateTestNamespace() + stressorNS = e2e.CreateTestNamespace() + + sa, err = e2e.BuildAuthorizationFor(constants.OpenshiftNS, forwarderName). + AllowClusterRole(framework.ClusterRoleCollectApplicationLogs). + Create() + Expect(err).To(BeNil()) + + l = loki.NewReceiver(lokiNS, "loki-server") + clf = obsruntime.NewClusterLogForwarder(constants.OpenshiftNS, forwarderName, internalruntime.Initialize, func(clf *obs.ClusterLogForwarder) { + clf.Spec.ServiceAccount.Name = sa.Name + }) + + clf.Spec.Outputs = []obs.OutputSpec{ + { + Name: "loki-out", + Type: obs.OutputTypeLoki, + Loki: &obs.Loki{ + URLSpec: obs.URLSpec{ + URL: l.InternalURL("").String(), + }, + }, + }, + } + clf.Spec.Pipelines = []obs.PipelineSpec{ + { + InputRefs: []string{string(obs.InputTypeApplication)}, + OutputRefs: []string{"loki-out"}, + Name: "app-to-loki", + }, + } + + stressor := runtime.NewLogGenerator(stressorNS, "stressor", 0, 100*time.Millisecond, "profile test message") + Expect(client.Get().Create(stressor)).To(Succeed()) + Expect(l.Create(client.Get())).To(Succeed()) + + if err := e2e.CreateObservabilityClusterLogForwarder(clf); err != nil { + Fail(fmt.Sprintf("Unable to create CLF: %v", err)) + } + if err := e2e.WaitForDaemonSet(clf.Namespace, clf.Name); err != nil { + Fail(fmt.Sprintf("Failed waiting for collector: %v", err)) + } + }) + + AfterAll(func() { + e2e.Cleanup() + }) + + Context("ServiceMonitor labels", func() { + DescribeTable("should have the correct collection-profile label", + func(name, expectedProfile string) { + output, err := oc.Get(). + WithNamespace(constants.OpenshiftNS). + Resource("servicemonitor", name). + OutputJsonpath(`{.metadata.labels['monitoring\.openshift\.io/collection-profile']}`). + Run() + Expect(err).NotTo(HaveOccurred(), "Failed to get %s-profile ServiceMonitor", expectedProfile) + Expect(output).To(Equal(expectedProfile)) + }, + Entry("full profile", forwarderName, constants.MetricsCollectionProfileFull), + Entry("minimal profile", constants.MetricsCollectionProfileMinimal+"-"+forwarderName, constants.MetricsCollectionProfileMinimal), + ) + }) + + Context("Full profile scraping", func() { + It("should have all collector metrics available in Prometheus under the full profile", func() { + By("waiting for collector metrics to appear in Thanos") + Eventually(func(g Gomega) { + response, err := prometheus.Query(fmt.Sprintf(`%s{namespace="%s"}`, allowlistedMetric, constants.OpenshiftNS)) + g.Expect(err).NotTo(HaveOccurred(), "Failed to query allowlisted metric") + g.Expect(prometheus.HasResults(response)).To(BeTrue()) + }, 5*time.Minute, 30*time.Second).Should(Succeed(), "Allowlisted metric should be present under full profile") + + By("verifying a non-allowlisted metric is also present under full profile") + Eventually(func(g Gomega) { + response, err := prometheus.Query(fmt.Sprintf(`%s{namespace="%s"}`, nonAllowlistedMetric, constants.OpenshiftNS)) + g.Expect(err).NotTo(HaveOccurred(), "Failed to query non-allowlisted metric") + g.Expect(prometheus.HasResults(response)).To(BeTrue()) + }, 2*time.Minute, 15*time.Second).Should(Succeed(), "Non-allowlisted metric should also be present under full profile") + }) + }) +}) diff --git a/test/e2e/collection/metrics/suite_test.go b/test/e2e/collection/metrics/suite_test.go new file mode 100644 index 0000000000..67abb6a5b9 --- /dev/null +++ b/test/e2e/collection/metrics/suite_test.go @@ -0,0 +1,13 @@ +package metrics + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestSuite(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "[e2e][collection][metrics] Suite") +} diff --git a/test/e2e/flowcontrol/utils.go b/test/e2e/flowcontrol/utils.go index b087f9ba67..f42d661b89 100644 --- a/test/e2e/flowcontrol/utils.go +++ b/test/e2e/flowcontrol/utils.go @@ -3,38 +3,29 @@ package flowcontrol import ( "context" - "crypto/tls" - "encoding/json" "fmt" - "io" - "net/http" - "os/exec" "strconv" - "strings" "time" log "github.com/ViaQ/logerr/v2/log/static" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "github.com/openshift/cluster-logging-operator/test/helpers/errors" + "github.com/openshift/cluster-logging-operator/test/helpers/prometheus" "k8s.io/apimachinery/pkg/util/wait" ) const ( - SecretName = `oc get secret -n openshift-monitoring -o name | grep prometheus-k8s-token* | head -n 1 | grep -o prometheus-k8s.*` //nolint:gosec - Token = `oc get secret %s -n openshift-monitoring -o jsonpath={.data.token} | base64 -d` //nolint:gosec - ThanosHost = `oc get route thanos-querier -n openshift-monitoring -o jsonpath={.spec.host}` - VectorCompSentEvents = `rate(vector_component_sent_events_total{component_name="%s"}[30s])` VectorUpTotal = `vector_started_total` ) func WaitForMetricsToShow() bool { - if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) { - prometheusResponse := GetCollectorMetrics(VectorUpTotal) - prometheusResponse = prometheusResponse["data"].(map[string]interface{}) - return len(prometheusResponse["result"].([]interface{})) != 0, nil + response, err := prometheus.Query(VectorUpTotal) + if err != nil { + return false, err + } + return prometheus.HasResults(response), nil }); err != nil { log.V(0).Error(err, "Error waiting for metrics to be available") return false @@ -57,65 +48,9 @@ func ExpectMetricsWithinRange(prometheusResponse map[string]interface{}, lower, } func GetCollectorMetrics(metric string) map[string]interface{} { - var err error - var secret string - secret, err = ExecuteCmd(SecretName) - Expect(err).To(Succeed(), secret) - secret = strings.TrimSpace(secret) - - secret, err = ExecuteCmd(fmt.Sprintf(Token, secret)) - Expect(err).To(Succeed(), secret) - secret = strings.TrimSpace(secret) - - var thanosHost string - thanosHost, err = ExecuteCmd(ThanosHost) - Expect(err).To(Succeed(), thanosHost) - thanosHost = strings.TrimSpace(thanosHost) - - return QueryPrometheus(thanosHost, secret, metric) -} - -func ExecuteCmd(cmd string) (_ string, err error) { - var result []byte - result, err = exec.Command("bash", "-c", cmd).Output() - return string(result), err -} - -func QueryPrometheus(host, token, query string) map[string]interface{} { - tr := &http.Transport{ - TLSClientConfig: &tls.Config{ - //nolint:gosec - InsecureSkipVerify: true, - }, - } - client := &http.Client{Transport: tr} - request, err := http.NewRequest(http.MethodGet, fmt.Sprintf("https://%s/api/v1/query", host), nil) + response, err := prometheus.Query(metric) if err != nil { - Fail(fmt.Sprintf("Failed to create a new request for %s. Err: %v", host, err)) + Fail(fmt.Sprintf("Failed to query metric %s: %v", metric, err)) } - request.Header.Add("Authorization", "Bearer "+token) - request.Header.Add("Accept", "application/json") - - query_body := request.URL.Query() - query_body.Add("query", query) - request.URL.RawQuery = query_body.Encode() - - response, err := client.Do(request) - if err != nil { - Fail(fmt.Sprintf("Error when sending request to the server %v", err)) - } - - defer errors.LogIfError(response.Body.Close()) - body, err := io.ReadAll(response.Body) - if err != nil { - Fail(fmt.Sprintf("%v", err)) - } - - var result_json map[string]interface{} - if err := json.Unmarshal(body, &result_json); err != nil { - Fail(fmt.Sprintf("Failed to unmarshal json %v", err)) - } - - return result_json - + return response } diff --git a/test/e2e/logfilesmetricexporter/lfme_test.go b/test/e2e/logfilesmetricexporter/lfme_test.go index ad618cd824..94ec8ad641 100644 --- a/test/e2e/logfilesmetricexporter/lfme_test.go +++ b/test/e2e/logfilesmetricexporter/lfme_test.go @@ -17,6 +17,7 @@ import ( "github.com/openshift/cluster-logging-operator/test" framework "github.com/openshift/cluster-logging-operator/test/framework/e2e" "github.com/openshift/cluster-logging-operator/test/helpers/oc" + "github.com/openshift/cluster-logging-operator/test/helpers/prometheus" "k8s.io/apimachinery/pkg/util/wait" ) @@ -99,4 +100,21 @@ var _ = Describe("[e2e][logfilemetricexporter] LogFileMetricsExporter", func() { }) Expect(err).ToNot(HaveOccurred(), "Exp. to scrape metrics with a bearer token") }) + + It("should have LFME metrics scraped by Prometheus via the ServiceMonitor", func() { + e2e.AddCleanup(func() error { + return oc.Literal().From("oc -n openshift-logging delete --ignore-not-found logfilemetricexporter instance").Output() + }) + err = createLFME(validCR) + Expect(err).ToNot(HaveOccurred()) + Expect(e2e.WaitForDaemonSet(constants.OpenshiftNS, constants.LogfilesmetricexporterName)).To(Succeed()) + + By("querying Thanos for an LFME metric to validate the ServiceMonitor pipeline") + Eventually(func(g Gomega) { + response, err := prometheus.Query(`log_logged_bytes_total{namespace="openshift-logging"}`) + g.Expect(err).NotTo(HaveOccurred(), "Failed to query metric") + g.Expect(prometheus.HasResults(response)).To(BeTrue()) + }, 5*time.Minute, 30*time.Second).Should(Succeed(), + "LFME metrics should appear in Prometheus, indicating the ServiceMonitor is correctly configured") + }) }) diff --git a/test/e2e/operator/metrics/e2e_test.go b/test/e2e/operator/metrics/e2e_test.go index 376afd44af..e440d72533 100644 --- a/test/e2e/operator/metrics/e2e_test.go +++ b/test/e2e/operator/metrics/e2e_test.go @@ -12,6 +12,7 @@ import ( . "github.com/onsi/gomega" "github.com/openshift/cluster-logging-operator/internal/constants" "github.com/openshift/cluster-logging-operator/test/helpers/oc" + "github.com/openshift/cluster-logging-operator/test/helpers/prometheus" ) const ( @@ -275,6 +276,17 @@ var _ = Describe("Manager", Ordered, func() { strings.ToLower("clusterlogforwarder"), ))) }) + + It("should have operator metrics scraped by Prometheus via the ServiceMonitor", func() { + By("querying Thanos for an operator metric to validate the ServiceMonitor pipeline") + Eventually(func(g Gomega) { + response, err := prometheus.Query(fmt.Sprintf(`controller_runtime_reconcile_total{controller="%s",result="success"}`, + strings.ToLower("clusterlogforwarder"))) + g.Expect(err).NotTo(HaveOccurred(), "Failed to query metric") + g.Expect(prometheus.HasResults(response)).To(BeTrue()) + }, 5*time.Minute, 30*time.Second).Should(Succeed(), + "Operator metrics should appear in Prometheus, indicating the ServiceMonitor is correctly configured") + }) }) }) diff --git a/test/helpers/prometheus/prometheus.go b/test/helpers/prometheus/prometheus.go new file mode 100644 index 0000000000..2e41b59834 --- /dev/null +++ b/test/helpers/prometheus/prometheus.go @@ -0,0 +1,110 @@ +package prometheus + +import ( + "crypto/tls" + "encoding/json" + "fmt" + "io" + "net/http" + "os/exec" + "strings" + "time" + + log "github.com/ViaQ/logerr/v2/log/static" +) + +const ( + tokenCmd = `oc create token prometheus-k8s -n openshift-monitoring` //nolint:gosec + thanosHostCmd = `oc get route thanos-querier -n openshift-monitoring -o jsonpath={.spec.host}` +) + +func executeCmd(cmd string) (string, error) { + result, err := exec.Command("bash", "-c", cmd).Output() + return string(result), err +} + +func GetToken() (string, error) { + token, err := executeCmd(tokenCmd) + return strings.TrimSpace(token), err +} + +func GetThanosHost() (string, error) { + host, err := executeCmd(thanosHostCmd) + return strings.TrimSpace(host), err +} + +func QueryPrometheus(host, token, query string) map[string]interface{} { + empty := map[string]any{} + + tr := &http.Transport{ + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, //nolint:gosec + }, + } + client := &http.Client{Transport: tr, Timeout: 30 * time.Second} + request, err := http.NewRequest(http.MethodGet, fmt.Sprintf("https://%s/api/v1/query", host), nil) + if err != nil { + log.V(0).Error(err, "Failed to create request", "host", host) + return empty + } + request.Header.Add("Authorization", "Bearer "+token) + request.Header.Add("Accept", "application/json") + + q := request.URL.Query() + q.Add("query", query) + request.URL.RawQuery = q.Encode() + + response, err := client.Do(request) + if err != nil { + log.V(0).Error(err, "Error sending request to Thanos") + return empty + } + defer func() { + if err := response.Body.Close(); err != nil { + log.V(3).Error(err, "Failed to close response body") + } + }() + + if response.StatusCode != http.StatusOK { + log.V(0).Info("Unexpected status from Thanos", "status", response.StatusCode, "query", query) + return empty + } + + body, err := io.ReadAll(response.Body) + if err != nil { + log.V(0).Error(err, "Failed to read response body") + return empty + } + + var result map[string]interface{} + if err := json.Unmarshal(body, &result); err != nil { + log.V(0).Error(err, "Failed to unmarshal Thanos response") + return empty + } + + return result +} + +func Query(query string) (map[string]interface{}, error) { + token, err := GetToken() + if err != nil { + return nil, err + } + host, err := GetThanosHost() + if err != nil { + return nil, err + } + return QueryPrometheus(host, token, query), nil +} + +func HasResults(response map[string]interface{}) bool { + data, ok := response["data"].(map[string]interface{}) + if !ok { + return false + } + results, ok := data["result"].([]interface{}) + if !ok { + return false + } + return len(results) > 0 +}