diff --git a/Makefile b/Makefile index 9f05c40ff..4d45c4f26 100644 --- a/Makefile +++ b/Makefile @@ -813,6 +813,13 @@ helm-test-unittest: $(HELM_UNITTEST_PLUGIN) helm/charts/.stamp helm-test: ## Run all Helm validation tests helm-test: helm-test-schema helm-test-subchart helm-test-unittest helm-test-template +# Istio template test requires Istio CRD API versions to be available +tests/helm/template/istio.yaml: tests/helm/template/istio-overrides.yml helm/charts/.stamp helm/values.schema.json $(wildcard helm/templates/*.yaml) $(wildcard helm/templates/*.tpl) helm/values.yaml + $(call LOG,HELM,template $@) + $(Q)$(HELM_CMD) template --kube-version "$(KUBE_VERSION)" "$(HELM_SCHEMA_TEST_TARGET)" --namespace "$(HELM_SCHEMA_TEST_NAMESPACE)" \ + --api-versions networking.istio.io/v1 \ + ./helm --values $< > $@ + tests/helm/template/%.yaml: tests/helm/template/%-overrides.yml helm/charts/.stamp helm/values.schema.json $(wildcard helm/templates/*.yaml) $(wildcard helm/templates/*.tpl) helm/values.yaml $(call LOG,HELM,template $@) $(Q)$(HELM_CMD) template --kube-version "$(KUBE_VERSION)" "$(HELM_SCHEMA_TEST_TARGET)" --namespace "$(HELM_SCHEMA_TEST_NAMESPACE)" ./helm --values $< > $@ diff --git a/app/config/validator/diagnostics.go b/app/config/validator/diagnostics.go index 814219377..f0586eed3 100644 --- a/app/config/validator/diagnostics.go +++ b/app/config/validator/diagnostics.go @@ -18,6 +18,7 @@ const ( DiagnosticScrapeConfig string = "scrape_cfg" DiagnosticInsightsIngress string = "webhook_server_reachable" DiagnosticAgentSettings string = "agent_settings" + DiagnosticIstioXClusterLB string = "istio_xcluster_lb" ) const ( @@ -36,7 +37,7 @@ func IsValidDiagnostic(d string) bool { DiagnosticK8sNamespace, DiagnosticK8sProvider, DiagnosticKMS, DiagnosticScrapeConfig, DiagnosticPrometheusVersion, DiagnosticInsightsIngress, - DiagnosticAgentSettings: + DiagnosticAgentSettings, DiagnosticIstioXClusterLB: return true } return false diff --git a/app/config/validator/settings.go b/app/config/validator/settings.go index 0f7464370..4a8130d7a 100644 --- a/app/config/validator/settings.go +++ b/app/config/validator/settings.go @@ -15,13 +15,26 @@ import ( type Settings struct { ExecutionContext Context - Logging Logging `yaml:"logging"` - Deployment Deployment `yaml:"deployment"` - Versions Versions `yaml:"versions"` - Cloudzero Cloudzero `yaml:"cloudzero"` - Prometheus Prometheus `yaml:"prometheus"` - Diagnostics Diagnostics `yaml:"diagnostics"` - Services Services `yaml:"services"` + Logging Logging `yaml:"logging"` + Deployment Deployment `yaml:"deployment"` + Versions Versions `yaml:"versions"` + Cloudzero Cloudzero `yaml:"cloudzero"` + Prometheus Prometheus `yaml:"prometheus"` + Diagnostics Diagnostics `yaml:"diagnostics"` + Services Services `yaml:"services"` + Integrations Integrations `yaml:"integrations"` +} + +// Integrations contains configuration for third-party integrations +type Integrations struct { + Istio Istio `yaml:"istio"` +} + +// Istio contains Istio service mesh integration settings +type Istio struct { + // ClusterID is the Istio cluster ID from Helm values (integrations.istio.clusterID). + // Used to validate cluster-local routing configuration in multi-cluster meshes. 
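+	//
+	// In the validator's own YAML configuration this field maps to the following
+	// keys (the value shown is hypothetical):
+	//
+	//   integrations:
+	//     istio:
+	//       cluster_id: my-istio-cluster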
+ ClusterID string `yaml:"cluster_id"` } type Services struct { diff --git a/app/domain/diagnostic/catalog/catalog.go b/app/domain/diagnostic/catalog/catalog.go index 00a38ed39..bca5623c0 100644 --- a/app/domain/diagnostic/catalog/catalog.go +++ b/app/domain/diagnostic/catalog/catalog.go @@ -11,6 +11,7 @@ import ( config "github.com/cloudzero/cloudzero-agent/app/config/validator" "github.com/cloudzero/cloudzero-agent/app/domain/diagnostic" "github.com/cloudzero/cloudzero-agent/app/domain/diagnostic/cz" + "github.com/cloudzero/cloudzero-agent/app/domain/diagnostic/istio" "github.com/cloudzero/cloudzero-agent/app/domain/diagnostic/k8s/namespace" "github.com/cloudzero/cloudzero-agent/app/domain/diagnostic/k8s/provider" "github.com/cloudzero/cloudzero-agent/app/domain/diagnostic/k8s/version" @@ -55,6 +56,7 @@ func createRegistry(ctx context.Context, c *config.Settings) *registry { r.add(config.DiagnosticScrapeConfig, false, promcfg.NewProvider(ctx, c)) r.add(config.DiagnosticPrometheusVersion, false, promver.NewProvider(ctx, c)) r.add(config.DiagnosticInsightsIngress, false, webhook.NewProvider(ctx, c)) + r.add(config.DiagnosticIstioXClusterLB, false, istio.NewProvider(ctx, c)) // Internal diagnostics emitted based on stage r.add(config.DiagnosticInternalInitStart, true, stage.NewProvider(ctx, c, status.StatusType_STATUS_TYPE_INIT_STARTED)) diff --git a/app/domain/diagnostic/catalog/catalog_test.go b/app/domain/diagnostic/catalog/catalog_test.go index 6b72b4d9d..c9a058d12 100644 --- a/app/domain/diagnostic/catalog/catalog_test.go +++ b/app/domain/diagnostic/catalog/catalog_test.go @@ -82,5 +82,5 @@ func TestRegistry_List(t *testing.T) { // Test listing providers providers := r.List() - assert.Len(t, providers, 8) // Update the expected length to 8 + assert.Len(t, providers, 9) } diff --git a/app/domain/diagnostic/istio/istio.go b/app/domain/diagnostic/istio/istio.go new file mode 100644 index 000000000..df8b075b9 --- /dev/null +++ b/app/domain/diagnostic/istio/istio.go @@ -0,0 +1,444 @@ +// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Package istio provides diagnostics for detecting Istio service mesh configuration, +// cross-cluster load balancing, and validating cluster ID settings. 
+package istio + +import ( + "bufio" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "strings" + "time" + + config "github.com/cloudzero/cloudzero-agent/app/config/validator" + "github.com/cloudzero/cloudzero-agent/app/domain/diagnostic" + logging "github.com/cloudzero/cloudzero-agent/app/logging/validator" + "github.com/cloudzero/cloudzero-agent/app/types/status" + "github.com/sirupsen/logrus" +) + +const ( + DiagnosticIstioXClusterLB = config.DiagnosticIstioXClusterLB + + // envoyAdminClustersURL is the Envoy admin API endpoint for cluster information + envoyAdminClustersURL = "http://localhost:15000/clusters" + + // envoyAdminServerInfoURL is the Envoy admin API endpoint for server info (includes cluster ID) + envoyAdminServerInfoURL = "http://localhost:15000/server_info" + + // logRetryAttempt is the format string for retry attempt logging + logRetryAttempt = "Attempt %d: %v" + + // envIstioAmbientRedirection is the env var set via Downward API for ambient mode detection + envIstioAmbientRedirection = "ISTIO_AMBIENT_REDIRECTION" + + // istioAmbientRedirectionEnabled is the value indicating ambient mode is active + istioAmbientRedirectionEnabled = "enabled" + + // envIstioTopologyCluster is the env var set via Downward API for topology label validation + envIstioTopologyCluster = "ISTIO_TOPOLOGY_CLUSTER" +) + +// IstioMode represents the detected Istio service mesh mode +type IstioMode string + +const ( + // IstioModeNone indicates no Istio service mesh detected + IstioModeNone IstioMode = "none" + // IstioModeSidecar indicates traditional sidecar proxy mode + IstioModeSidecar IstioMode = "sidecar" + // IstioModeAmbient indicates ambient mode (sidecarless) + IstioModeAmbient IstioMode = "ambient" +) + +var ( + // Exported for testing + MaxRetry = 3 + RetryInterval = 5 * time.Second +) + +// checker implements the diagnostic.Provider interface for Istio cross-cluster LB detection +type checker struct { + cfg *config.Settings + logger *logrus.Entry + configuredClusterID string // From Helm values (integrations.istio.clusterID) - explicit only + clusterName string // From deployment.cluster_name - used for fallback + topologyCluster string // From Downward API label (topology.istio.io/cluster) + aggregatorService string // Service name to look for in Envoy clusters + namespace string // Namespace where aggregator runs + + // URLs for Envoy admin API endpoints (configurable for testing) + serverInfoURL string + clustersURL string +} + +// NewProvider creates a new Istio cross-cluster LB diagnostic provider +var NewProvider = func(ctx context.Context, cfg *config.Settings) diagnostic.Provider { + return &checker{ + cfg: cfg, + logger: logging.NewLogger(). + WithContext(ctx).WithField(logging.OpField, "istio-xcluster"), + configuredClusterID: cfg.Integrations.Istio.ClusterID, + clusterName: cfg.Deployment.ClusterName, + topologyCluster: os.Getenv(envIstioTopologyCluster), + aggregatorService: cfg.Services.CollectorService, + namespace: cfg.Services.Namespace, + serverInfoURL: envoyAdminServerInfoURL, + clustersURL: envoyAdminClustersURL, + } +} + +// effectiveClusterID returns the cluster ID that Helm uses for DestinationRule. +// This matches the Helm template: {{ .Values.integrations.istio.clusterID | default .Values.clusterName }} +func (c *checker) effectiveClusterID() string { + if c.configuredClusterID != "" { + return c.configuredClusterID + } + return c.clusterName +} + +// detectIstioMode determines the Istio service mesh mode by checking: +// 1. 
Sidecar mode: localhost:15000 (Envoy admin API) is reachable +// 2. Ambient mode: ISTIO_AMBIENT_REDIRECTION env var is "enabled" (set via Downward API) +// 3. None: Neither indicator present +func (c *checker) detectIstioMode(ctx context.Context, client *http.Client) IstioMode { + // Try sidecar detection first - if we can reach localhost:15000, we have a sidecar + if _, err := c.getIstioClusterID(ctx, client); err == nil { + return IstioModeSidecar + } + + // Check for ambient mode via Downward API env var + if os.Getenv(envIstioAmbientRedirection) == istioAmbientRedirectionEnabled { + return IstioModeAmbient + } + + return IstioModeNone +} + +// Check performs the Istio cross-cluster load balancing detection and validation. +// It detects Istio mode (Sidecar, Ambient, or None) and validates configuration: +// +// - None: PASS - not running in an Istio mesh +// - Sidecar: Full validation via localhost:15000 (cluster ID match, cross-cluster LB detection) +// - Ambient: PASS - trust configured cluster ID (no local proxy to validate against) +// +// In Ambient mode, traffic fencing relies on the DestinationRule configured at deploy time. +// Runtime validation is not possible because there's no local sidecar to query. +func (c *checker) Check(ctx context.Context, client *http.Client, accessor status.Accessor) error { + c.logger.Infof("Configured cluster ID: '%s', cluster name: '%s', effective: '%s'", + c.configuredClusterID, c.clusterName, c.effectiveClusterID()) + + // Detect Istio mode + mode := c.detectIstioMode(ctx, client) + c.logger.Infof("Detected Istio mode: %s", mode) + + switch mode { + case IstioModeNone: + c.logger.Info("Not running in Istio mesh") + accessor.AddCheck(&status.StatusCheck{ + Name: DiagnosticIstioXClusterLB, + Passing: true, + }) + return nil + + case IstioModeAmbient: + return c.checkAmbientMode(ctx, accessor) + + case IstioModeSidecar: + return c.checkSidecarMode(ctx, client, accessor) + } + + // Should not reach here, but handle gracefully + accessor.AddCheck(&status.StatusCheck{ + Name: DiagnosticIstioXClusterLB, + Passing: true, + }) + return nil +} + +// checkAmbientMode handles validation for Istio Ambient mode. +// In ambient mode, there's no local sidecar to query, so we require explicit clusterID +// and trust it for traffic fencing via DestinationRule. +func (c *checker) checkAmbientMode(_ context.Context, accessor status.Accessor) error { + c.logger.Info("Running in Istio Ambient mode (sidecarless)") + + // Ambient mode REQUIRES explicit clusterID (no fallback to clusterName) + // because we cannot validate it at runtime + if c.configuredClusterID == "" { + c.logger.Warn("Ambient mode detected but explicit cluster ID not configured - " + + "in multi-cluster deployments, traffic may be routed to other clusters") + accessor.AddCheck(&status.StatusCheck{ + Name: DiagnosticIstioXClusterLB, + Passing: false, + Error: "Istio Ambient mode detected but integrations.istio.clusterID not configured. " + + "In Ambient mode, explicit cluster ID is required for traffic fencing. 
" + + "Set integrations.istio.clusterID to your Istio cluster ID.", + }) + return nil + } + + // If topology label is available, validate it matches configured cluster ID + if c.topologyCluster != "" && c.topologyCluster != c.configuredClusterID { + c.logger.Warnf("Configured cluster ID '%s' does not match pod topology label '%s'", + c.configuredClusterID, c.topologyCluster) + accessor.AddCheck(&status.StatusCheck{ + Name: DiagnosticIstioXClusterLB, + Passing: false, + Error: fmt.Sprintf("Configured cluster ID '%s' does not match pod topology label '%s'. "+ + "Update integrations.istio.clusterID to match your Istio cluster ID.", + c.configuredClusterID, c.topologyCluster), + }) + return nil + } + + // Trust the configured cluster ID since we can't validate against sidecar + c.logger.Infof("Ambient mode: trusting configured cluster ID '%s' for traffic fencing", c.configuredClusterID) + accessor.AddCheck(&status.StatusCheck{ + Name: DiagnosticIstioXClusterLB, + Passing: true, + }) + return nil +} + +// checkSidecarMode handles validation for Istio Sidecar mode. +// This is the traditional mode with per-pod Envoy proxies that we can query. +// In sidecar mode, we validate the effective cluster ID (clusterID || clusterName) +// against what Istio knows, since that's what Helm uses for the DestinationRule. +func (c *checker) checkSidecarMode(ctx context.Context, client *http.Client, accessor status.Accessor) error { + // Get the Istio cluster ID from the sidecar + istioClusterID, err := c.getIstioClusterID(ctx, client) + if err != nil { + // Unexpected - we already verified sidecar was reachable in detectIstioMode + c.logger.Warnf("Could not query Envoy sidecar: %v", err) + accessor.AddCheck(&status.StatusCheck{ + Name: DiagnosticIstioXClusterLB, + Passing: true, + }) + return nil + } + + c.logger.Infof("Sidecar mode: Istio cluster ID from Envoy: '%s'", istioClusterID) + + // Compute effective cluster ID (what Helm uses for DestinationRule) + // Note: clusterName is always set (required field, auto-detected by scout) + // so effective is never empty + effective := c.effectiveClusterID() + c.logger.Infof("Effective cluster ID (for DestinationRule): '%s'", effective) + + // Validate effective cluster ID matches Istio's cluster ID + // This ensures the DestinationRule will work correctly + if istioClusterID != "" && effective != istioClusterID { + c.logger.Warnf("Effective cluster ID '%s' does not match Istio cluster ID '%s'", + effective, istioClusterID) + accessor.AddCheck(&status.StatusCheck{ + Name: DiagnosticIstioXClusterLB, + Passing: false, + Error: fmt.Sprintf("Effective cluster ID '%s' does not match Istio cluster ID '%s'. "+ + "The DestinationRule will not route traffic correctly. "+ + "Set integrations.istio.clusterID to '%s' to match your Istio configuration.", + effective, istioClusterID, istioClusterID), + }) + return nil + } + + // Also check topology label if available + if c.topologyCluster != "" && c.topologyCluster != effective { + c.logger.Warnf("Effective cluster ID '%s' does not match pod topology label '%s'", + effective, c.topologyCluster) + accessor.AddCheck(&status.StatusCheck{ + Name: DiagnosticIstioXClusterLB, + Passing: false, + Error: fmt.Sprintf("Effective cluster ID '%s' does not match pod topology label '%s'. 
"+ + "Set integrations.istio.clusterID to '%s' to match your Istio configuration.", + effective, c.topologyCluster, c.topologyCluster), + }) + return nil + } + + // Query Envoy sidecar to detect cross-cluster load balancing + xclusterDetected, err := c.detectCrossClusterLB(ctx, client) + if err != nil { + c.logger.Warnf("Could not query Envoy clusters endpoint: %v", err) + accessor.AddCheck(&status.StatusCheck{ + Name: DiagnosticIstioXClusterLB, + Passing: true, + }) + return nil + } + + if xclusterDetected { + c.logger.Info("Cross-cluster load balancing detected, cluster-local routing configured correctly") + } else { + c.logger.Info("No cross-cluster load balancing detected for aggregator service") + } + + accessor.AddCheck(&status.StatusCheck{ + Name: DiagnosticIstioXClusterLB, + Passing: true, + }) + return nil +} + +// serverInfoResponse represents the structure of the Envoy /server_info response +type serverInfoResponse struct { + Node struct { + Metadata map[string]interface{} `json:"metadata"` + } `json:"node"` +} + +// getIstioClusterID queries the Envoy sidecar's server_info endpoint to get the Istio cluster ID +func (c *checker) getIstioClusterID(ctx context.Context, client *http.Client) (string, error) { + var lastErr error + + for attempt := 1; attempt <= MaxRetry; attempt++ { + resp, err := client.Get(c.serverInfoURL) + if err != nil { + lastErr = fmt.Errorf("failed to query Envoy server_info: %w", err) + c.logger.Debugf(logRetryAttempt, attempt, lastErr) + time.Sleep(RetryInterval) + continue + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + lastErr = fmt.Errorf("envoy server_info returned status %d", resp.StatusCode) + c.logger.Debugf(logRetryAttempt, attempt, lastErr) + time.Sleep(RetryInterval) + continue + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + lastErr = fmt.Errorf("failed to read Envoy server_info response: %w", err) + c.logger.Debugf(logRetryAttempt, attempt, lastErr) + time.Sleep(RetryInterval) + continue + } + + var info serverInfoResponse + if err := json.Unmarshal(body, &info); err != nil { + lastErr = fmt.Errorf("failed to parse Envoy server_info JSON: %w", err) + c.logger.Debugf(logRetryAttempt, attempt, lastErr) + time.Sleep(RetryInterval) + continue + } + + // The cluster ID is in node.metadata.CLUSTER_ID + if clusterID, ok := info.Node.Metadata["CLUSTER_ID"].(string); ok && clusterID != "" { + return clusterID, nil + } + + // If CLUSTER_ID not found, return empty but no error (sidecar is reachable) + c.logger.Debug("CLUSTER_ID not found in Envoy server_info metadata") + return "", nil + } + + return "", lastErr +} + +// detectCrossClusterLB queries the Envoy sidecar's admin API to check if the aggregator +// service has endpoints in multiple clusters (indicating cross-cluster load balancing). 
+func (c *checker) detectCrossClusterLB(ctx context.Context, client *http.Client) (bool, error) { + var lastErr error + + for attempt := 1; attempt <= MaxRetry; attempt++ { + resp, err := client.Get(c.clustersURL) + if err != nil { + lastErr = fmt.Errorf("failed to query Envoy admin API: %w", err) + c.logger.Debugf(logRetryAttempt, attempt, lastErr) + time.Sleep(RetryInterval) + continue + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + lastErr = fmt.Errorf("envoy admin API returned status %d", resp.StatusCode) + c.logger.Debugf(logRetryAttempt, attempt, lastErr) + time.Sleep(RetryInterval) + continue + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + lastErr = fmt.Errorf("failed to read Envoy response: %w", err) + c.logger.Debugf(logRetryAttempt, attempt, lastErr) + time.Sleep(RetryInterval) + continue + } + + return c.parseEnvoyResponse(string(body)), nil + } + + return false, lastErr +} + +// parseEnvoyResponse parses the Envoy clusters response and checks for multiple +// distinct localities for the aggregator service, which indicates cross-cluster LB. +// +// The text format (default) looks like: +// outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1 +// outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::zone::us-east-1a +// +// We look for the aggregator service and check if it has endpoints with different regions. +func (c *checker) parseEnvoyResponse(body string) bool { + // Build the service FQDN pattern to look for + // Format: outbound|||..svc.cluster.local + servicePattern := fmt.Sprintf("%s.%s.svc.cluster.local", c.aggregatorService, c.namespace) + + c.logger.Debugf("Looking for service pattern: %s", servicePattern) + + // Track unique regions/zones for the aggregator service + regions := make(map[string]struct{}) + + scanner := bufio.NewScanner(strings.NewReader(body)) + for scanner.Scan() { + line := scanner.Text() + + // Skip lines that don't match our service + if !strings.Contains(line, servicePattern) { + continue + } + + // Look for region information in the line + // Format: ...::region::us-east-1 + if strings.Contains(line, "::region::") { + parts := strings.Split(line, "::region::") + if len(parts) >= 2 { + // Extract the region value (everything up to the next ::) + regionPart := strings.Split(parts[1], "::")[0] + regionPart = strings.TrimSpace(regionPart) + if regionPart != "" { + c.logger.Debugf("Found region '%s' for service %s", regionPart, servicePattern) + regions[regionPart] = struct{}{} + } + } + } + } + + // If we have more than one distinct region, cross-cluster LB is active + if len(regions) > 1 { + regionList := make([]string, 0, len(regions)) + for r := range regions { + regionList = append(regionList, r) + } + c.logger.Infof("Cross-cluster LB detected: aggregator service has endpoints in %d regions: %v", + len(regions), regionList) + return true + } + + if len(regions) == 1 { + for r := range regions { + c.logger.Debugf("Aggregator service endpoints are all in region: %s", r) + } + } else { + c.logger.Debug("No region information found for aggregator service endpoints") + } + + return false +} diff --git a/app/domain/diagnostic/istio/istio_test.go b/app/domain/diagnostic/istio/istio_test.go new file mode 100644 index 000000000..92b0ac08a --- /dev/null +++ b/app/domain/diagnostic/istio/istio_test.go @@ -0,0 +1,981 @@ +// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +package istio + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/cloudzero/cloudzero-agent/app/types/status" + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// testLogger returns a logger for use in tests +func testLogger() *logrus.Entry { + logger := logrus.New() + logger.SetLevel(logrus.DebugLevel) + return logger.WithField("test", true) +} + +// mockAccessor implements status.Accessor for testing +type mockAccessor struct { + checks []*status.StatusCheck +} + +func (m *mockAccessor) AddCheck(checks ...*status.StatusCheck) { + m.checks = append(m.checks, checks...) +} + +func (m *mockAccessor) WriteToReport(fn func(*status.ClusterStatus)) { + // No-op for tests +} + +func (m *mockAccessor) ReadFromReport(fn func(*status.ClusterStatus)) { + // No-op for tests +} + +func (m *mockAccessor) GetCheck(name string) *status.StatusCheck { + for _, c := range m.checks { + if c.Name == name { + return c + } + } + return nil +} + +// setupTestRetries configures retry settings for fast tests and returns a cleanup function +func setupTestRetries() func() { + origMaxRetry := MaxRetry + origRetryInterval := RetryInterval + MaxRetry = 1 + RetryInterval = 0 + return func() { + MaxRetry = origMaxRetry + RetryInterval = origRetryInterval + } +} + +// mockEnvoyServer creates a test server that responds to both /server_info and /clusters endpoints +func mockEnvoyServer(serverInfoResponse, clustersResponse string, serverInfoStatus, clustersStatus int) *httptest.Server { + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case strings.HasSuffix(r.URL.Path, "/server_info"): + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(serverInfoStatus) + w.Write([]byte(serverInfoResponse)) + case strings.HasSuffix(r.URL.Path, "/clusters"): + w.WriteHeader(clustersStatus) + w.Write([]byte(clustersResponse)) + default: + w.WriteHeader(http.StatusNotFound) + } + })) +} + +// ============================================================================ +// Full Check() Flow Tests with Mock Servers +// ============================================================================ + +func TestCheck_NotInIstioMesh_SidecarUnreachable(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + // Create checker with URL pointing to non-existent server + c := &checker{ + logger: testLogger(), + configuredClusterID: "test-cluster", + aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: "http://localhost:1/server_info", // Non-existent + clustersURL: "http://localhost:1/clusters", + } + + accessor := &mockAccessor{} + err := c.Check(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.True(t, check.Passing, "should pass when sidecar is unreachable") + assert.Empty(t, check.Error) +} + +func TestCheck_InIstioMesh_NoCrossClusterLB_ClusterIDMatch(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + serverInfo := `{"node":{"metadata":{"CLUSTER_ID":"my-cluster"}}}` + clusters := `outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1 +outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.6:8080::region::us-east-1` + + server := mockEnvoyServer(serverInfo, clusters, http.StatusOK, http.StatusOK) + defer 
server.Close() + + c := &checker{ + logger: testLogger(), + configuredClusterID: "my-cluster", + aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: server.URL + "/server_info", + clustersURL: server.URL + "/clusters", + } + + accessor := &mockAccessor{} + err := c.Check(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.True(t, check.Passing, "should pass when in Istio mesh with matching cluster ID and no cross-cluster LB") + assert.Empty(t, check.Error) +} + +func TestCheck_InIstioMesh_ClusterIDMismatch(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + // Istio reports "gke-cluster" but we configured "foo" + serverInfo := `{"node":{"metadata":{"CLUSTER_ID":"gke-cluster"}}}` + + server := mockEnvoyServer(serverInfo, "", http.StatusOK, http.StatusOK) + defer server.Close() + + c := &checker{ + logger: testLogger(), + configuredClusterID: "foo", // Mismatch! + aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: server.URL + "/server_info", + clustersURL: server.URL + "/clusters", + } + + accessor := &mockAccessor{} + err := c.Check(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.False(t, check.Passing, "should fail when cluster ID mismatches") + assert.Contains(t, check.Error, "does not match Istio cluster ID") + assert.Contains(t, check.Error, "foo") + assert.Contains(t, check.Error, "gke-cluster") +} + +func TestCheck_InIstioMesh_CrossClusterLB_WithClusterID(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + serverInfo := `{"node":{"metadata":{"CLUSTER_ID":"my-cluster"}}}` + // Multiple regions = cross-cluster LB detected + clusters := `outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1 +outbound|8080||aggregator.cza.svc.cluster.local::10.0.2.10:8080::region::us-west-2` + + server := mockEnvoyServer(serverInfo, clusters, http.StatusOK, http.StatusOK) + defer server.Close() + + c := &checker{ + logger: testLogger(), + configuredClusterID: "my-cluster", + aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: server.URL + "/server_info", + clustersURL: server.URL + "/clusters", + } + + accessor := &mockAccessor{} + err := c.Check(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.True(t, check.Passing, "should pass when cross-cluster LB detected but cluster ID is configured") + assert.Empty(t, check.Error) +} + +func TestCheck_InIstioMesh_EffectiveClusterIDMismatch(t *testing.T) { + // Test: clusterID not explicitly set, clusterName (fallback) doesn't match Istio + // This tests the scenario where the Helm chart uses clusterName as fallback + // but it doesn't match the Istio cluster ID + cleanup := setupTestRetries() + defer cleanup() + + serverInfo := `{"node":{"metadata":{"CLUSTER_ID":"istio-cluster"}}}` + clusters := `outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1` + + server := mockEnvoyServer(serverInfo, clusters, http.StatusOK, http.StatusOK) + defer server.Close() + + c := &checker{ + logger: testLogger(), + configuredClusterID: "", // Not explicitly configured + clusterName: "eks-cluster", // This is what Helm would use as fallback + aggregatorService: "aggregator", + 
namespace: "cza", + serverInfoURL: server.URL + "/server_info", + clustersURL: server.URL + "/clusters", + } + + accessor := &mockAccessor{} + err := c.Check(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.False(t, check.Passing, "should fail when effective cluster ID doesn't match Istio") + assert.Contains(t, check.Error, "Effective cluster ID 'eks-cluster' does not match Istio cluster ID 'istio-cluster'") +} + +func TestCheck_InIstioMesh_NoClusterIDInSidecar(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + // Sidecar responds but has no CLUSTER_ID in metadata + serverInfo := `{"node":{"metadata":{"NAMESPACE":"cza"}}}` + clusters := `outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1` + + server := mockEnvoyServer(serverInfo, clusters, http.StatusOK, http.StatusOK) + defer server.Close() + + c := &checker{ + logger: testLogger(), + configuredClusterID: "my-cluster", + aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: server.URL + "/server_info", + clustersURL: server.URL + "/clusters", + } + + accessor := &mockAccessor{} + err := c.Check(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + // When Istio cluster ID is empty, we skip the mismatch check + assert.True(t, check.Passing, "should pass when sidecar has no cluster ID") +} + +func TestCheck_InIstioMesh_ClustersEndpointFails(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + serverInfo := `{"node":{"metadata":{"CLUSTER_ID":"my-cluster"}}}` + + server := mockEnvoyServer(serverInfo, "", http.StatusOK, http.StatusInternalServerError) + defer server.Close() + + c := &checker{ + logger: testLogger(), + configuredClusterID: "my-cluster", + aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: server.URL + "/server_info", + clustersURL: server.URL + "/clusters", + } + + accessor := &mockAccessor{} + err := c.Check(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + // When clusters endpoint fails, we pass (can't detect cross-cluster LB) + assert.True(t, check.Passing, "should pass when clusters endpoint fails") +} + +func TestCheck_InIstioMesh_ServerInfoReturns500(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + server := mockEnvoyServer("", "", http.StatusInternalServerError, http.StatusOK) + defer server.Close() + + c := &checker{ + logger: testLogger(), + configuredClusterID: "my-cluster", + aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: server.URL + "/server_info", + clustersURL: server.URL + "/clusters", + } + + accessor := &mockAccessor{} + err := c.Check(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + // When server_info fails, we treat it as "not in Istio mesh" + assert.True(t, check.Passing, "should pass when server_info returns error") +} + +// ============================================================================ +// parseEnvoyResponse Unit Tests +// ============================================================================ + +func TestParseEnvoyResponse_SingleRegion(t *testing.T) { + envoyResponse := 
`outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1 +outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::zone::us-east-1a +outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.6:8080::region::us-east-1 +outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.6:8080::zone::us-east-1b` + + c := &checker{ + logger: testLogger(), + aggregatorService: "aggregator", + namespace: "cza", + } + + assert.False(t, c.parseEnvoyResponse(envoyResponse), "should not detect cross-cluster LB with single region") +} + +func TestParseEnvoyResponse_MultipleRegions(t *testing.T) { + envoyResponse := `outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1 +outbound|8080||aggregator.cza.svc.cluster.local::10.0.2.10:8080::region::us-west-2` + + c := &checker{ + logger: testLogger(), + aggregatorService: "aggregator", + namespace: "cza", + } + + assert.True(t, c.parseEnvoyResponse(envoyResponse), "should detect cross-cluster LB with multiple regions") +} + +func TestParseEnvoyResponse_NoServiceMatch(t *testing.T) { + envoyResponse := `outbound|8080||other-service.other-ns.svc.cluster.local::10.0.1.5:8080::region::us-east-1 +outbound|8080||other-service.other-ns.svc.cluster.local::10.0.2.10:8080::region::us-west-2` + + c := &checker{ + logger: testLogger(), + aggregatorService: "aggregator", + namespace: "cza", + } + + assert.False(t, c.parseEnvoyResponse(envoyResponse), "should not detect cross-cluster LB for non-matching service") +} + +func TestParseEnvoyResponse_NoRegionInfo(t *testing.T) { + envoyResponse := `outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::health_flags::healthy +outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.6:8080::health_flags::healthy` + + c := &checker{ + logger: testLogger(), + aggregatorService: "aggregator", + namespace: "cza", + } + + assert.False(t, c.parseEnvoyResponse(envoyResponse), "should not detect cross-cluster LB without region info") +} + +func TestParseEnvoyResponse_EmptyResponse(t *testing.T) { + c := &checker{ + logger: testLogger(), + aggregatorService: "aggregator", + namespace: "cza", + } + + assert.False(t, c.parseEnvoyResponse(""), "should not detect cross-cluster LB with empty response") +} + +func TestParseEnvoyResponse_MixedServices(t *testing.T) { + // Other services have multiple regions, but aggregator only has one + envoyResponse := `outbound|8080||other-service.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1 +outbound|8080||other-service.cza.svc.cluster.local::10.0.2.10:8080::region::us-west-2 +outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1 +outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.6:8080::region::us-east-1` + + c := &checker{ + logger: testLogger(), + aggregatorService: "aggregator", + namespace: "cza", + } + + assert.False(t, c.parseEnvoyResponse(envoyResponse), "should only consider aggregator service") +} + +// ============================================================================ +// serverInfoResponse Parsing Tests +// ============================================================================ + +func TestServerInfoResponse_WithClusterID(t *testing.T) { + serverInfoJSON := `{ + "node": { + "metadata": { + "CLUSTER_ID": "gke-cluster", + "NAMESPACE": "cza" + } + } + }` + + var info serverInfoResponse + err := json.Unmarshal([]byte(serverInfoJSON), &info) + require.NoError(t, err) + + clusterID, ok := info.Node.Metadata["CLUSTER_ID"].(string) + assert.True(t, ok, "should find CLUSTER_ID in 
metadata") + assert.Equal(t, "gke-cluster", clusterID) +} + +func TestServerInfoResponse_WithoutClusterID(t *testing.T) { + serverInfoJSON := `{ + "node": { + "metadata": { + "NAMESPACE": "cza" + } + } + }` + + var info serverInfoResponse + err := json.Unmarshal([]byte(serverInfoJSON), &info) + require.NoError(t, err) + + clusterID, ok := info.Node.Metadata["CLUSTER_ID"].(string) + assert.False(t, ok || clusterID != "", "should not find CLUSTER_ID in metadata") +} + +func TestServerInfoResponse_EmptyMetadata(t *testing.T) { + serverInfoJSON := `{ + "node": { + "metadata": {} + } + }` + + var info serverInfoResponse + err := json.Unmarshal([]byte(serverInfoJSON), &info) + require.NoError(t, err) + + clusterID, ok := info.Node.Metadata["CLUSTER_ID"].(string) + assert.False(t, ok || clusterID != "", "should not find CLUSTER_ID in empty metadata") +} + +// ============================================================================ +// getIstioClusterID Method Tests +// ============================================================================ + +func TestGetIstioClusterID_Success(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write([]byte(`{"node":{"metadata":{"CLUSTER_ID":"test-cluster"}}}`)) + })) + defer server.Close() + + c := &checker{ + logger: testLogger(), + serverInfoURL: server.URL, + } + + clusterID, err := c.getIstioClusterID(context.Background(), http.DefaultClient) + require.NoError(t, err) + assert.Equal(t, "test-cluster", clusterID) +} + +func TestGetIstioClusterID_NoClusterID(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write([]byte(`{"node":{"metadata":{"NAMESPACE":"cza"}}}`)) + })) + defer server.Close() + + c := &checker{ + logger: testLogger(), + serverInfoURL: server.URL, + } + + clusterID, err := c.getIstioClusterID(context.Background(), http.DefaultClient) + require.NoError(t, err) + assert.Empty(t, clusterID, "should return empty when no CLUSTER_ID in metadata") +} + +func TestGetIstioClusterID_ServerError(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer server.Close() + + c := &checker{ + logger: testLogger(), + serverInfoURL: server.URL, + } + + _, err := c.getIstioClusterID(context.Background(), http.DefaultClient) + require.Error(t, err) + assert.Contains(t, err.Error(), "status 500") +} + +func TestGetIstioClusterID_InvalidJSON(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte(`not valid json`)) + })) + defer server.Close() + + c := &checker{ + logger: testLogger(), + serverInfoURL: server.URL, + } + + _, err := c.getIstioClusterID(context.Background(), http.DefaultClient) + require.Error(t, err) + assert.Contains(t, err.Error(), "parse") +} + +// ============================================================================ +// detectCrossClusterLB Method Tests +// 
============================================================================ + +func TestDetectCrossClusterLB_Success(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte(`outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1 +outbound|8080||aggregator.cza.svc.cluster.local::10.0.2.10:8080::region::us-west-2`)) + })) + defer server.Close() + + c := &checker{ + logger: testLogger(), + aggregatorService: "aggregator", + namespace: "cza", + clustersURL: server.URL, + } + + detected, err := c.detectCrossClusterLB(context.Background(), http.DefaultClient) + require.NoError(t, err) + assert.True(t, detected, "should detect cross-cluster LB") +} + +func TestDetectCrossClusterLB_ServerError(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer server.Close() + + c := &checker{ + logger: testLogger(), + aggregatorService: "aggregator", + namespace: "cza", + clustersURL: server.URL, + } + + _, err := c.detectCrossClusterLB(context.Background(), http.DefaultClient) + require.Error(t, err) + assert.Contains(t, err.Error(), "status 500") +} + +// ============================================================================ +// detectIstioMode Tests +// ============================================================================ + +func TestDetectIstioMode_Sidecar(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write([]byte(`{"node":{"metadata":{"CLUSTER_ID":"test-cluster"}}}`)) + })) + defer server.Close() + + c := &checker{ + logger: testLogger(), + serverInfoURL: server.URL, + } + + mode := c.detectIstioMode(context.Background(), http.DefaultClient) + assert.Equal(t, IstioModeSidecar, mode, "should detect sidecar mode when localhost:15000 is reachable") +} + +func TestDetectIstioMode_Ambient(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + // Set the ambient env var + t.Setenv(envIstioAmbientRedirection, istioAmbientRedirectionEnabled) + + // Point to non-existent server (sidecar not available) + c := &checker{ + logger: testLogger(), + serverInfoURL: "http://localhost:1/server_info", + } + + mode := c.detectIstioMode(context.Background(), http.DefaultClient) + assert.Equal(t, IstioModeAmbient, mode, "should detect ambient mode when env var is set and sidecar unreachable") +} + +func TestDetectIstioMode_None(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + // Ensure ambient env var is not set (t.Setenv auto-cleans up) + t.Setenv(envIstioAmbientRedirection, "") + + // Point to non-existent server (sidecar not available) + c := &checker{ + logger: testLogger(), + serverInfoURL: "http://localhost:1/server_info", + } + + mode := c.detectIstioMode(context.Background(), http.DefaultClient) + assert.Equal(t, IstioModeNone, mode, "should detect no Istio when sidecar unreachable and no ambient env var") +} + +func TestDetectIstioMode_SidecarTakesPrecedenceOverAmbient(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + // Set the ambient env var (shouldn't matter if sidecar is reachable) + 
t.Setenv(envIstioAmbientRedirection, istioAmbientRedirectionEnabled) + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write([]byte(`{"node":{"metadata":{"CLUSTER_ID":"test-cluster"}}}`)) + })) + defer server.Close() + + c := &checker{ + logger: testLogger(), + serverInfoURL: server.URL, + } + + mode := c.detectIstioMode(context.Background(), http.DefaultClient) + assert.Equal(t, IstioModeSidecar, mode, "sidecar mode should take precedence over ambient when both indicators present") +} + +// ============================================================================ +// Ambient Mode Check Tests +// ============================================================================ + +func TestCheck_AmbientMode_WithClusterID(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + // Set ambient mode + t.Setenv(envIstioAmbientRedirection, istioAmbientRedirectionEnabled) + + // Point to non-existent server (no sidecar in ambient mode) + c := &checker{ + logger: testLogger(), + configuredClusterID: "my-cluster", + aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: "http://localhost:1/server_info", + clustersURL: "http://localhost:1/clusters", + } + + accessor := &mockAccessor{} + err := c.Check(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.True(t, check.Passing, "should pass in ambient mode with cluster ID configured") + assert.Empty(t, check.Error) +} + +func TestCheck_AmbientMode_WithoutClusterID(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + // Set ambient mode + t.Setenv(envIstioAmbientRedirection, istioAmbientRedirectionEnabled) + + // Point to non-existent server (no sidecar in ambient mode) + c := &checker{ + logger: testLogger(), + configuredClusterID: "", // Not configured! 
+ aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: "http://localhost:1/server_info", + clustersURL: "http://localhost:1/clusters", + } + + accessor := &mockAccessor{} + err := c.Check(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.False(t, check.Passing, "should fail in ambient mode without cluster ID") + assert.Contains(t, check.Error, "Ambient mode detected") + assert.Contains(t, check.Error, "clusterID not configured") +} + +func TestCheckAmbientMode_TrustsConfiguredClusterID(t *testing.T) { + c := &checker{ + logger: testLogger(), + configuredClusterID: "trusted-cluster-id", + } + + accessor := &mockAccessor{} + err := c.checkAmbientMode(context.Background(), accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.True(t, check.Passing, "should pass when cluster ID is configured") + assert.Empty(t, check.Error) +} + +func TestCheckAmbientMode_FailsWithoutClusterID(t *testing.T) { + c := &checker{ + logger: testLogger(), + configuredClusterID: "", + } + + accessor := &mockAccessor{} + err := c.checkAmbientMode(context.Background(), accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.False(t, check.Passing, "should fail when cluster ID is not configured") + assert.Contains(t, check.Error, "Ambient mode") +} + +// ============================================================================ +// effectiveClusterID Helper Tests +// ============================================================================ + +func TestEffectiveClusterID_ReturnsConfiguredWhenSet(t *testing.T) { + c := &checker{ + configuredClusterID: "explicit-cluster", + clusterName: "fallback-cluster", + } + + assert.Equal(t, "explicit-cluster", c.effectiveClusterID(), + "should return configured cluster ID when explicitly set") +} + +func TestEffectiveClusterID_FallsBackToClusterName(t *testing.T) { + c := &checker{ + configuredClusterID: "", // Not set + clusterName: "my-eks-cluster", + } + + assert.Equal(t, "my-eks-cluster", c.effectiveClusterID(), + "should fall back to clusterName when configuredClusterID is empty") +} + +func TestEffectiveClusterID_ReturnsEmptyWhenBothEmpty(t *testing.T) { + c := &checker{ + configuredClusterID: "", + clusterName: "", + } + + assert.Equal(t, "", c.effectiveClusterID(), + "should return empty when both configuredClusterID and clusterName are empty") +} + +// ============================================================================ +// Topology Label Validation Tests +// ============================================================================ + +func TestCheckAmbientMode_TopologyLabelMatch(t *testing.T) { + c := &checker{ + logger: testLogger(), + configuredClusterID: "my-cluster", + topologyCluster: "my-cluster", // Matches! + } + + accessor := &mockAccessor{} + err := c.checkAmbientMode(context.Background(), accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.True(t, check.Passing, "should pass when topology label matches configured cluster ID") + assert.Empty(t, check.Error) +} + +func TestCheckAmbientMode_TopologyLabelMismatch(t *testing.T) { + c := &checker{ + logger: testLogger(), + configuredClusterID: "configured-cluster", + topologyCluster: "actual-istio-cluster", // Mismatch! 
+ } + + accessor := &mockAccessor{} + err := c.checkAmbientMode(context.Background(), accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.False(t, check.Passing, "should fail when topology label doesn't match configured cluster ID") + assert.Contains(t, check.Error, "configured-cluster") + assert.Contains(t, check.Error, "actual-istio-cluster") + assert.Contains(t, check.Error, "does not match pod topology label") +} + +func TestCheckAmbientMode_TopologyLabelEmpty(t *testing.T) { + c := &checker{ + logger: testLogger(), + configuredClusterID: "my-cluster", + topologyCluster: "", // Not available + } + + accessor := &mockAccessor{} + err := c.checkAmbientMode(context.Background(), accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.True(t, check.Passing, "should pass when topology label is not available (can't validate)") + assert.Empty(t, check.Error) +} + +func TestCheckSidecarMode_TopologyLabelMismatch(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + serverInfo := `{"node":{"metadata":{"CLUSTER_ID":"my-cluster"}}}` + clusters := `outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1` + + server := mockEnvoyServer(serverInfo, clusters, http.StatusOK, http.StatusOK) + defer server.Close() + + c := &checker{ + logger: testLogger(), + configuredClusterID: "my-cluster", + clusterName: "my-cluster", + topologyCluster: "different-cluster", // Mismatch with effective! + aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: server.URL + "/server_info", + clustersURL: server.URL + "/clusters", + } + + accessor := &mockAccessor{} + err := c.checkSidecarMode(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.False(t, check.Passing, "should fail when topology label doesn't match effective cluster ID") + assert.Contains(t, check.Error, "my-cluster") + assert.Contains(t, check.Error, "different-cluster") +} + +func TestCheckSidecarMode_TopologyLabelMatch(t *testing.T) { + cleanup := setupTestRetries() + defer cleanup() + + serverInfo := `{"node":{"metadata":{"CLUSTER_ID":"my-cluster"}}}` + clusters := `outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1` + + server := mockEnvoyServer(serverInfo, clusters, http.StatusOK, http.StatusOK) + defer server.Close() + + c := &checker{ + logger: testLogger(), + configuredClusterID: "my-cluster", + clusterName: "my-cluster", + topologyCluster: "my-cluster", // Matches! 
+ aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: server.URL + "/server_info", + clustersURL: server.URL + "/clusters", + } + + accessor := &mockAccessor{} + err := c.checkSidecarMode(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.True(t, check.Passing, "should pass when topology label matches effective cluster ID") + assert.Empty(t, check.Error) +} + +// ============================================================================ +// Sidecar Mode with clusterName Fallback Tests +// ============================================================================ + +func TestCheckSidecarMode_ClusterNameFallbackMatch(t *testing.T) { + // Test: clusterID not explicitly set, clusterName fallback matches Istio + cleanup := setupTestRetries() + defer cleanup() + + serverInfo := `{"node":{"metadata":{"CLUSTER_ID":"my-eks-cluster"}}}` + clusters := `outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1` + + server := mockEnvoyServer(serverInfo, clusters, http.StatusOK, http.StatusOK) + defer server.Close() + + c := &checker{ + logger: testLogger(), + configuredClusterID: "", // Not explicitly set + clusterName: "my-eks-cluster", // Fallback matches Istio! + aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: server.URL + "/server_info", + clustersURL: server.URL + "/clusters", + } + + accessor := &mockAccessor{} + err := c.checkSidecarMode(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.True(t, check.Passing, "should pass when clusterName fallback matches Istio cluster ID") + assert.Empty(t, check.Error) +} + +func TestCheckSidecarMode_ExplicitClusterIDOverridesClusterName(t *testing.T) { + // Test: explicit clusterID takes precedence over clusterName + cleanup := setupTestRetries() + defer cleanup() + + serverInfo := `{"node":{"metadata":{"CLUSTER_ID":"istio-cluster"}}}` + clusters := `outbound|8080||aggregator.cza.svc.cluster.local::10.0.1.5:8080::region::us-east-1` + + server := mockEnvoyServer(serverInfo, clusters, http.StatusOK, http.StatusOK) + defer server.Close() + + c := &checker{ + logger: testLogger(), + configuredClusterID: "istio-cluster", // Explicitly set, takes precedence + clusterName: "different-name", // Would cause mismatch if used + aggregatorService: "aggregator", + namespace: "cza", + serverInfoURL: server.URL + "/server_info", + clustersURL: server.URL + "/clusters", + } + + accessor := &mockAccessor{} + err := c.checkSidecarMode(context.Background(), http.DefaultClient, accessor) + require.NoError(t, err) + + check := accessor.GetCheck(DiagnosticIstioXClusterLB) + require.NotNil(t, check) + assert.True(t, check.Passing, "should pass when explicit clusterID matches Istio (ignoring clusterName)") + assert.Empty(t, check.Error) +} diff --git a/app/functions/helmless/default-values.yaml b/app/functions/helmless/default-values.yaml index a6b419920..1a7d9a01a 100644 --- a/app/functions/helmless/default-values.yaml +++ b/app/functions/helmless/default-values.yaml @@ -206,6 +206,42 @@ defaults: runAsGroup: 65534 fsGroup: 65534 +# Integration configuration for external systems and service meshes. +integrations: + # Istio service mesh integration settings. + istio: + # Controls Istio integration behavior. 
+ # + # Possible values: + # - null (default): Auto-detect Istio via CRD presence in the cluster + # - true: Force Istio integration enabled + # - false: Force Istio integration disabled + enabled: null + # Istio cluster ID for multicluster mesh environments. + # + # In an Istio multicluster mesh, each cluster has a unique cluster ID that + # identifies it within the mesh. This value is used by the DestinationRule + # to pin traffic to pods in the local cluster using the + # `topology.istio.io/cluster` label that Istio automatically adds to pods. + # + # If not set, falls back to the top-level `clusterName` value. At least one + # of these must be set when Istio integration is enabled. + # + # Note that, if cross-cluster load balancing is enabled on your cluster, + # this *MUST* be set to the correct value. + # + # To find your Istio cluster ID, use istioctl to query any Istio-injected + # pod's bootstrap config: + # + # istioctl proxy-config bootstrap . -o yaml | \ + # grep 'CLUSTER_ID:' | awk '{print $2}' + # + # Or check the istiod deployment (assumes istio-system namespace): + # + # kubectl -n istio-system get deploy istiod \ + # -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="CLUSTER_ID")].value}' + clusterID: null + # Component-specific configuration settings. components: # The agent here refers to the CloudZero Agent, which is the component that @@ -598,7 +634,6 @@ components: limits: memory: "1024Mi" cpu: "2000m" - # Settings for the webhook server. webhookServer: replicas: null @@ -1213,13 +1248,6 @@ insightsController: deploymentAnnotations: {} # Annotations to add to the webhook server pods. podAnnotations: {} - # Whether to suppress Istio-related annotations on webhook server pods. When - # false (default), the sidecar.istio.io/inject: "false" annotation is added - # to prevent Istio sidecar injection which can interfere with webhook TLS. - # Set to true to disable this behavior and allow Istio sidecar injection. - # For additional information, see: - # https://github.com/Cloudzero/cloudzero-charts/blob/develop/charts/cloudzero-agent/docs/istio.md - suppressIstioAnnotations: false # Additional volume mounts to add to the insights controller pods. volumeMounts: [] # Additional volumes to add to the insights controller pods. diff --git a/helm/docs/istio.md b/helm/docs/istio.md index bc2205953..98d1859e9 100644 --- a/helm/docs/istio.md +++ b/helm/docs/istio.md @@ -1,80 +1,224 @@ -# Installing `cloudzero-agent` in Istio-Enabled Clusters +# Istio Integration -When installing the `cloudzero-agent` Helm chart in a Kubernetes cluster that uses Istio, the chart automatically includes Istio-compatible configuration. The webhook server pods include the `sidecar.istio.io/inject: "false"` annotation by default to prevent Istio sidecar injection, which can interfere with webhook TLS communication. +The CloudZero Agent attempts to detect Istio service mesh deployments and, when possible, configure itself accordingly. This document covers two Istio features that create challenges for the agent and how we address them. -In most cases, no additional configuration is required. However, you may need additional steps depending on your specific Istio setup. +## Strict mTLS -## Default Behavior +### What is Strict mTLS? -The webhook server pods automatically include the `sidecar.istio.io/inject: "false"` annotation. This prevents Istio sidecar injection and avoids TLS interference without any additional configuration. +Istio can enforce mutual TLS (mTLS) on all traffic within the mesh. 
When a `PeerAuthentication` policy is set to **Strict** mode, pods reject any plain-text connections - all traffic must be encrypted using Istio-issued certificates. -**To override this default behavior** and allow Istio sidecar injection, set: +Traffic enters the mesh through **sidecar injection** (in Sidecar mode) or **ambient enrollment** (in [Ambient mode](https://istio.io/latest/docs/ambient/overview/)): -```yaml -insightsController: - server: - suppressIstioAnnotations: true +- **Sidecar mode** injects an Envoy proxy container into each pod. All traffic passes through this sidecar, which handles mTLS termination and origination. + +- **Ambient mode** (Istio 1.18+) uses a node-level proxy called ztunnel instead of per-pod sidecars. Traffic is transparently redirected through ztunnel without modifying pod definitions. + +### How the Webhook Server Works + +The CloudZero webhook server receives notifications from the Kubernetes API server when pods are created or updated. It extracts metadata (labels, annotations, ownership) for cost attribution purposes and sends this data to the aggregator. + +By default, the webhook uses a self-signed TLS certificate. During installation, a Helm job (`init-cert`) generates this certificate and establishes trust: + +```mermaid +flowchart LR + subgraph "Installation" + IC[init-cert job] -->|certificate signed by CA| Secret[(Kubernetes Secret)] + IC -->|certificate authority| VWC[ValidatingWebhookConfiguration] + end + + subgraph "Runtime" + Secret --> WH[Webhook Server] + K8S[Kubernetes API] -->|events| WH + K8S -.-> VWC + WH -->|metrics| AGG[Aggregator] + end ``` -When this setting is enabled, you will need to use one of the additional configuration options below to ensure proper functionality. +The init-cert job creates a Certificate Authority, generates a server certificate signed by that CA, stores the certificate in a Kubernetes Secret for the webhook server to use, and writes the CA bundle to the `ValidatingWebhookConfiguration` so the Kubernetes API server trusts the webhook's certificate. -## Additional Configuration Options +### How Strict mTLS Conflicts with the Webhook -The `cloudzero-agent` includes a **webhook server** component responsible for handling admission review requests from the Kubernetes API server. These requests use TLS, and when intercepted by an Istio sidecar, Istio may attempt to apply its mTLS policies. These policies are not always compatible with the webhook's TLS configuration. +When strict mTLS is enabled, the Envoy sidecar intercepts all traffic entering the pod - including webhook admission requests. When the API server connects to the webhook: -While this does not block pod deployments, it **prevents the `insightsController` from collecting critical pod labels**, which are necessary for accurate cost allocation. +1. The API server initiates a TLS connection expecting the webhook's self-signed certificate +2. Envoy intercepts and attempts to terminate the connection with its own mTLS +3. Envoy presents an Istio-issued certificate instead of the webhook's certificate +4. The certificate chain doesn't match what the API server expects +5. The connection fails -If you have overridden the default behavior (by setting `suppressIstioAnnotations: true`) and need alternative configuration, you can choose from the following options: +The result: webhook admission requests fail, and the CloudZero Agent can't collect pod metadata. 
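To make the failure mode concrete, here is a small standalone probe sketch (illustrative only, not part of the agent): it dials a webhook endpoint, records the certificate the server presents, and checks whether that certificate chains to the CA bundle written into the `ValidatingWebhookConfiguration`. The service address and CA bundle path are assumptions, and depending on the mesh configuration a strict-mTLS listener may reject the handshake before any certificate is seen; the sketch reports that case as well.

```go
package main

import (
	"crypto/tls"
	"crypto/x509"
	"fmt"
	"os"
)

// probeWebhookCert dials addr, records the certificate chain the server
// presents, and verifies it against the CA bundle the API server trusts.
func probeWebhookCert(addr string, caPEM []byte) error {
	roots := x509.NewCertPool()
	if !roots.AppendCertsFromPEM(caPEM) {
		return fmt.Errorf("no CA certificates parsed from bundle")
	}

	var presented []*x509.Certificate
	conf := &tls.Config{
		// Verification is done manually below so we can report details.
		InsecureSkipVerify: true,
		VerifyPeerCertificate: func(raw [][]byte, _ [][]*x509.Certificate) error {
			for _, der := range raw {
				if cert, err := x509.ParseCertificate(der); err == nil {
					presented = append(presented, cert)
				}
			}
			return nil
		},
	}

	conn, err := tls.Dial("tcp", addr, conf)
	if err == nil {
		conn.Close()
	}
	if len(presented) == 0 {
		return fmt.Errorf("handshake failed before a certificate was presented: %v", err)
	}

	opts := x509.VerifyOptions{Roots: roots, Intermediates: x509.NewCertPool()}
	for _, ic := range presented[1:] {
		opts.Intermediates.AddCert(ic)
	}
	if _, verr := presented[0].Verify(opts); verr != nil {
		// Under strict mTLS without a port exclusion, Envoy answers with an
		// Istio-issued certificate that does not chain to the init-cert CA.
		// This is the same mismatch the Kubernetes API server rejects.
		return fmt.Errorf("presented certificate %q does not chain to the expected CA: %v",
			presented[0].Subject.CommonName, verr)
	}
	return nil
}

func main() {
	// Hypothetical address and CA path, for illustration only.
	caPEM, err := os.ReadFile("ca.crt")
	if err != nil {
		panic(err)
	}
	if err := probeWebhookCert("cloudzero-webhook.example.svc:443", caPEM); err != nil {
		fmt.Println("webhook TLS check failed:", err)
		return
	}
	fmt.Println("webhook presents the expected certificate")
}
```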
-- [**Disable envoy for webhook ports only**](#option-1-disable-envoy-for-webhook-ports-only) — Keeps the sidecar but excludes webhook traffic, preserving Istio functionality for all other traffic. -- [**Disable mTLS for `cloudzero-agent` webhook-server pods**](#option-2-disable-mtls-for-cloudzero-agent) — Keeps the sidecar but disables mTLS enforcement specifically for webhook-server traffic. +### Solutions ---- +#### cert-manager Integration (Preferred) + +When cert-manager is enabled (`insightsController.tls.useCertManager: true`), we assume [istio-csr](https://cert-manager.io/docs/usage/istio-csr/) is installed. This integrates cert-manager with Istio's PKI, so the webhook receives certificates that Istio's mTLS accepts. There's no conflict because both systems use the same certificate authority. -## **Option 1: Disable Envoy for Webhook Ports Only** +#### Port Exclusion Annotations -To prevent only requests to a single port on the webhook-server pods from being routed through envoy, apply the following annotation: +When cert-manager is not in use, the chart tells Envoy to pass webhook traffic through without interception. Istio supports this through pod annotations: ```yaml -insightsController: - server: - podAnnotations: - traffic.sidecar.istio.io/excludeInboundPorts: "8443" +traffic.sidecar.istio.io/excludeInboundPorts: "8443" ``` -In this case, the pods will still have an Istio sidecar injected, but traffic to port 8443 (the webhook port) will bypass envoy. +When Istio is detected and cert-manager is not in use, the chart automatically adds this annotation to webhook pods. Traffic on port 8443 bypasses Envoy entirely, and the webhook's self-signed certificate works as expected. -For more details, see [Istio Documentation](https://istio.io/latest/docs/reference/config/annotations/#SidecarTrafficExcludeInboundPorts). +The backfill job (which periodically scans the cluster for existing resources) also receives a port exclusion annotation: ---- +```yaml +traffic.sidecar.istio.io/excludeOutboundPorts: "443" +``` + +This ensures the backfill job can communicate with the webhook server without Istio intercepting and wrapping the connection in mTLS. + +## Cross-Cluster Load Balancing + +### What is Cross-Cluster Load Balancing? + +In multi-cluster Istio meshes, Istiod shares service endpoint information across all connected clusters. When a pod calls a service like `aggregator.cza.svc.cluster.local`, Istio may route that request to any cluster in the mesh that has matching endpoints. This enables cross-cluster load balancing and failover - if the local service is overloaded or unavailable, traffic automatically routes to another cluster. + +### Why This Is Problematic for CloudZero + +Each CloudZero aggregator collects and attributes metrics for its own cluster. 
The architecture looks like this: + +```mermaid +graph TB + subgraph "Cluster A" + CA[Agent A] --> AA[Aggregator A] + WA[Webhook A] --> AA + end + + subgraph "Cluster B" + CB[Agent B] --> AB[Aggregator B] + WB[Webhook B] --> AB + end + + AA --> CZ[CloudZero] + AB --> CZ +``` + +With cross-cluster load balancing, Istio may route traffic to the wrong aggregator: + +```mermaid +graph TB + subgraph "Cluster A" + CA[Agent A] + WA[Webhook A] + AA[Aggregator A] + end + + subgraph "Cluster B" + CB[Agent B] + WB[Webhook B] + AB[Aggregator B] + end + + CA --> AA + CA --> AB + CB --> AB + CB --> AA + WA --> AA + WA --> AB + WB --> AB + WB --> AA + AA --> CZ[CloudZero] + AB --> CZ + + linkStyle 1 stroke:red + linkStyle 3 stroke:red + linkStyle 5 stroke:red + linkStyle 7 stroke:red +``` -## **Option 2: Disable mTLS for `cloudzero-agent`** +If Istio routes traffic from Cluster A's agent to Cluster B's aggregator, those metrics get attributed to the wrong cluster. The cost data becomes corrupted - you'd see Cluster A's workloads appearing in Cluster B's reports. -To disable mTLS for the `cloudzero-agent` service, apply the following `PeerAuthentication` resource: +### Preventing Cross-Cluster Requests + +The chart creates Istio routing rules that keep aggregator traffic local. A `DestinationRule` defines which pods are "local" using the `topology.istio.io/cluster` label (which Istio adds to pods during injection), and a `VirtualService` routes all traffic to that subset: ```yaml -apiVersion: security.istio.io/v1beta1 -kind: PeerAuthentication -metadata: - name: cloudzero-agent-mtls - namespace: +apiVersion: networking.istio.io/v1 +kind: DestinationRule +spec: + host: aggregator.cza.svc.cluster.local + subsets: + - name: local-cluster + labels: + topology.istio.io/cluster: +--- +apiVersion: networking.istio.io/v1 +kind: VirtualService spec: - selector: - matchLabels: - app.kubernetes.io/component: webhook-server - mtls: - mode: DISABLE + http: + - route: + - destination: + subset: local-cluster + weight: 100 ``` -### **Steps to Apply:** +This requires knowing the Istio cluster ID. The chart computes an "effective cluster ID" using: -1. Replace `` with the namespace where `cloudzero-agent` is deployed. -2. Apply the resource: - ```sh - kubectl apply -f cloudzero-agent-mtls.yaml - ``` -3. Deploy the `cloudzero-agent` chart as instructed in the chart README.md +```text +effective = integrations.istio.clusterID || clusterName +``` -This configuration **disables mTLS for `cloudzero-agent` webhook-server pods only**, while keeping it enabled for the rest of the cluster. +If you explicitly set `integrations.istio.clusterID`, that value is used. Otherwise, the chart falls back to `clusterName`. ---- +In sidecar mode, the validator queries the local Envoy sidecar at `localhost:15000` to verify this value matches what Istio actually reports. + +### The Ambient Mode Wrinkle + +Ambient mode introduces a complication: there's no sidecar to query. + +In sidecar mode, each pod has an Envoy proxy at `localhost:15000` that we can query to verify the cluster ID configuration. If you set `integrations.istio.clusterID` to the wrong value, the validator detects the mismatch and reports an error. + +In ambient mode, traffic is handled by the node-level ztunnel proxy - there's nothing inside the pod to query. We detect ambient mode by checking for the `ambient.istio.io/redirection: enabled` annotation that Istio's CNI sets on enrolled pods. 
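The verification logic can be sketched end to end. The sketch below is illustrative and is not the validator's actual implementation: `ISTIO_AMBIENT_REDIRECTION` and `ISTIO_TOPOLOGY_CLUSTER` are the downward-API variables the chart injects (see `_helpers.tpl`), while `ISTIO_CLUSTER_ID`, `CLUSTER_NAME`, and the exact JSON shape of Envoy's `/config_dump` output are assumptions made for the example.

```go
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os"
)

// effectiveClusterID mirrors the fallback described above:
// integrations.istio.clusterID wins, otherwise clusterName.
func effectiveClusterID(explicit, clusterName string) string {
	if explicit != "" {
		return explicit
	}
	return clusterName
}

// sidecarClusterID asks the local Envoy admin endpoint for its bootstrap
// config and extracts node.metadata.CLUSTER_ID, the same value that
// `istioctl proxy-config bootstrap` greps for. The JSON shape used here is
// an assumption for illustration.
func sidecarClusterID() (string, error) {
	resp, err := http.Get("http://localhost:15000/config_dump")
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	var dump struct {
		Configs []struct {
			Bootstrap struct {
				Node struct {
					Metadata map[string]any `json:"metadata"`
				} `json:"node"`
			} `json:"bootstrap"`
		} `json:"configs"`
	}
	if err := json.Unmarshal(body, &dump); err != nil {
		return "", err
	}
	for _, c := range dump.Configs {
		if id, ok := c.Bootstrap.Node.Metadata["CLUSTER_ID"].(string); ok && id != "" {
			return id, nil
		}
	}
	return "", fmt.Errorf("CLUSTER_ID not found in Envoy bootstrap metadata")
}

func main() {
	// Injected by the chart via the downward API.
	ambient := os.Getenv("ISTIO_AMBIENT_REDIRECTION") == "enabled"
	topologyLabel := os.Getenv("ISTIO_TOPOLOGY_CLUSTER")

	// Hypothetical variables carrying the Helm values, for this example only.
	configured := effectiveClusterID(os.Getenv("ISTIO_CLUSTER_ID"), os.Getenv("CLUSTER_NAME"))

	switch {
	case ambient:
		// No sidecar to query: the value cannot be verified, so it must be set explicitly.
		fmt.Println("ambient mode: set integrations.istio.clusterID explicitly; configured =", configured)
	case topologyLabel != "" && topologyLabel != configured:
		fmt.Printf("mismatch: pod label says %q, chart configured %q\n", topologyLabel, configured)
	default:
		if actual, err := sidecarClusterID(); err != nil {
			fmt.Println("no sidecar admin endpoint reachable:", err)
		} else if actual != configured {
			fmt.Printf("mismatch: Envoy reports %q, chart configured %q\n", actual, configured)
		} else {
			fmt.Println("cluster ID verified:", configured)
		}
	}
}
```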
+ +**The problem**: In ambient mode, we cannot verify that you've configured the cluster ID correctly. If you set it to an incorrect value: + +- The `DestinationRule` will select pods with the wrong `topology.istio.io/cluster` label +- No pods will match, so traffic has no valid destination +- The collector and webhook server will be unable to send data to the aggregator +- Metric collection stops + +For this reason, **ambient mode requires explicit configuration** of `integrations.istio.clusterID`. The chart won't fall back to `clusterName` since a misconfiguration cannot be detected and would cause silent data loss. + +## Configuration + +### Quick Reference + +| Deployment Type | Configuration Needed | +| ------------------------------------------------------- | --------------------------------------------- | +| Single cluster, no Istio | None | +| Single cluster with Istio | None (port exclusions auto-detected) | +| Multi-cluster sidecar mode, `clusterName` matches Istio | None | +| Multi-cluster sidecar mode, `clusterName` differs | Set `integrations.istio.clusterID` | +| Multi-cluster ambient mode | Set `integrations.istio.clusterID` (required) | + +### Settings + +```yaml +integrations: + istio: + enabled: null # null=auto-detect, true=force enable, false=disable + clusterID: null # Istio cluster ID for traffic fencing +``` + +| Setting | Type | Default | Description | +| ------------------------------ | --------- | ------- | ---------------------------------------------------------------------------- | +| `integrations.istio.enabled` | null/bool | `null` | `null` auto-detects via CRD presence, `true` forces enable, `false` disables | +| `integrations.istio.clusterID` | string | `null` | Istio cluster ID for traffic fencing; required in ambient mode | + +### Finding Your Istio Cluster ID + +```bash +# Query the istiod deployment +kubectl -n istio-system get deploy istiod \ + -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="CLUSTER_ID")].value}' + +# Or query any Istio-injected pod's bootstrap config +istioctl proxy-config bootstrap . -o yaml | \ + grep 'CLUSTER_ID:' | awk '{print $2}' +``` diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl index 7ff4abaf3..c58f0bdb4 100644 --- a/helm/templates/_helpers.tpl +++ b/helm/templates/_helpers.tpl @@ -121,6 +121,14 @@ Returns: string (e.g., "cloudzero-webhook.default.svc") valueFrom: fieldRef: fieldPath: metadata.name +- name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] +- name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] {{- end}} {{/* @@ -1405,3 +1413,60 @@ Returns: string (either "--agent", "--enable-feature=agent", or empty string) {{- end -}} {{- end -}} {{- end -}} + +{{/* +Istio Integration Detection Helper + +Determines whether Istio integration should be enabled based on configuration +and cluster capabilities. 
Supports three modes: + +- null (default): Auto-detect Istio via CRD presence in the cluster +- true: Force Istio integration enabled +- false: Force Istio integration disabled + +When Istio is detected/enabled: +- If cluster ID is set: DestinationRule and VirtualService are created for cluster-local service isolation +- Webhook/backfill pods get port exclusion annotations (when not using cert-manager) + +When Istio is NOT detected/disabled: +- No DestinationRule/VirtualService are created +- No Istio-specific annotations are added + +Usage: {{ if include "cloudzero-agent.Values.integrations.istio.enabled" . }}...{{ end }} +Returns: "true" (truthy) when enabled, empty string (falsy) when disabled +*/}} +{{- define "cloudzero-agent.Values.integrations.istio.enabled" -}} +{{- $istioSetting := .Values.integrations.istio.enabled -}} +{{- if kindIs "invalid" $istioSetting -}} + {{- /* null/not set = auto-detect via CRD presence */ -}} + {{- if or (.Capabilities.APIVersions.Has "networking.istio.io/v1") (.Capabilities.APIVersions.Has "networking.istio.io/v1beta1") -}} + {{- true -}} + {{- end -}} +{{- else if $istioSetting -}} + {{- /* true = force enabled */ -}} + {{- true -}} +{{- end -}} +{{- /* false = force disabled, returns empty string */ -}} +{{- end -}} + +{{/* +Istio Cluster ID Helper + +Returns the Istio cluster ID to use for multicluster mesh configurations. +Falls back from integrations.istio.clusterID to clusterName. + +This value is OPTIONAL. When set, DestinationRule and VirtualService resources +are created to ensure aggregator traffic stays within the local cluster. + +If not explicitly set, falls back to clusterName. This allows automatic traffic +fencing in sidecar mode where we can validate the effective value at runtime. + +The validator includes a runtime check that detects cross-cluster load balancing +and validates the effective cluster ID matches Istio's configuration. + +Usage: {{ include "cloudzero-agent.istio.clusterID" . }} +Returns: The Istio cluster ID string (explicit or fallback to clusterName) +*/}} +{{- define "cloudzero-agent.istio.clusterID" -}} +{{- .Values.integrations.istio.clusterID | default .Values.clusterName -}} +{{- end -}} diff --git a/helm/templates/aggregator-destinationrule.yaml b/helm/templates/aggregator-destinationrule.yaml new file mode 100644 index 000000000..efa07883a --- /dev/null +++ b/helm/templates/aggregator-destinationrule.yaml @@ -0,0 +1,70 @@ +{{/* +DestinationRule for Cluster-Local Aggregator Service Isolation + +In a multi-cluster Istio mesh, the Istio control plane (Istiod) discovers service +endpoints across all connected clusters and shares this information with every +sidecar proxy. This enables cross-cluster load balancing, which can cause traffic +intended for the local aggregator to be routed to aggregator instances in remote +clusters. + +This DestinationRule, combined with a VirtualService, overrides Istio's default +cross-cluster load balancing behavior for the aggregator service. It works by: + +1. Defining a subset called "local-cluster" that matches only pods with the + `topology.istio.io/cluster` label equal to the configured cluster ID +2. The VirtualService routes 100% of traffic to this subset, ensuring only + local pods receive traffic + +Key configuration: +- `host`: The aggregator service's FQDN +- `subsets[].labels`: Matches pods in the local cluster via the topology label + that Istio automatically injects on all pods + +This template requires BOTH conditions to be met: +1. 
Istio is detected (via CRD presence) or explicitly enabled via integrations.istio.enabled +2. A cluster ID is configured (via clusterName or integrations.istio.clusterID) + +When Istio is enabled but clusterID is not set, this template is skipped. The +validator will detect cross-cluster load balancing at runtime and warn if +clusterID should be configured. + +For more details, see helm/docs/istio.md +*/}} +{{- if and (include "cloudzero-agent.Values.integrations.istio.enabled" .) (ne (include "cloudzero-agent.istio.clusterID" .) "") }} +{{- $istioApiVersion := "networking.istio.io/v1" -}} +{{- if and (not (.Capabilities.APIVersions.Has "networking.istio.io/v1")) (.Capabilities.APIVersions.Has "networking.istio.io/v1beta1") -}} + {{- $istioApiVersion = "networking.istio.io/v1beta1" -}} +{{- end -}} +apiVersion: {{ $istioApiVersion }} +kind: DestinationRule +metadata: + name: {{ include "cloudzero-agent.aggregator.name" . }}-cluster-local + namespace: {{ .Release.Namespace }} + {{- include "cloudzero-agent.generateLabels" (dict + "root" . + "name" (include "cloudzero-agent.aggregator.name" .) + "component" "aggregator" + "labels" (list + .Values.defaults.labels + .Values.commonMetaLabels + .Values.components.aggregator.labels + ) + ) | nindent 2 }} + {{- include "cloudzero-agent.generateAnnotations" (dict + "root" . + "annotations" (list + .Values.defaults.annotations + .Values.components.aggregator.annotations + ) + ) | nindent 2 }} +spec: + # The service FQDN we are configuring routing for + host: {{ include "cloudzero-agent.aggregator.name" . }}.{{ .Release.Namespace }}.svc.cluster.local + # Define a subset that matches only pods in the local cluster + subsets: + - name: local-cluster + labels: + # Istio automatically adds this label to all pods with the cluster ID + # This ensures we only route to pods in our local cluster + topology.istio.io/cluster: {{ include "cloudzero-agent.istio.clusterID" . }} +{{- end }} diff --git a/helm/templates/aggregator-virtualservice.yaml b/helm/templates/aggregator-virtualservice.yaml new file mode 100644 index 000000000..45e39100e --- /dev/null +++ b/helm/templates/aggregator-virtualservice.yaml @@ -0,0 +1,70 @@ +{{/* +VirtualService for Cluster-Local Aggregator Service Isolation + +In a multi-cluster Istio mesh, the Istio control plane (Istiod) discovers service +endpoints across all connected clusters and shares this information with every +sidecar proxy. This enables cross-cluster load balancing, which can cause traffic +intended for the local aggregator to be routed to aggregator instances in remote +clusters. + +This VirtualService, combined with a DestinationRule, overrides Istio's default +cross-cluster load balancing behavior for the aggregator service. It works by: + +1. Routing 100% of traffic to the "local-cluster" subset defined in the + DestinationRule +2. The DestinationRule's subset matches only pods with the topology.istio.io/cluster + label equal to the configured cluster ID + +Key configuration: +- `hosts`: The aggregator service's FQDN +- `http.route.destination.subset`: Routes to the local-cluster subset + +This template requires BOTH conditions to be met: +1. Istio is detected (via CRD presence) or explicitly enabled via integrations.istio.enabled +2. A cluster ID is configured (via clusterName or integrations.istio.clusterID) + +When Istio is enabled but clusterID is not set, this template is skipped. The +validator will detect cross-cluster load balancing at runtime and warn if +clusterID should be configured. 
+ +For more details, see helm/docs/istio.md +*/}} +{{- if and (include "cloudzero-agent.Values.integrations.istio.enabled" .) (ne (include "cloudzero-agent.istio.clusterID" .) "") }} +{{- $istioApiVersion := "networking.istio.io/v1" -}} +{{- if and (not (.Capabilities.APIVersions.Has "networking.istio.io/v1")) (.Capabilities.APIVersions.Has "networking.istio.io/v1beta1") -}} + {{- $istioApiVersion = "networking.istio.io/v1beta1" -}} +{{- end -}} +apiVersion: {{ $istioApiVersion }} +kind: VirtualService +metadata: + name: {{ include "cloudzero-agent.aggregator.name" . }}-cluster-local + namespace: {{ .Release.Namespace }} + {{- include "cloudzero-agent.generateLabels" (dict + "root" . + "name" (include "cloudzero-agent.aggregator.name" .) + "component" "aggregator" + "labels" (list + .Values.defaults.labels + .Values.commonMetaLabels + .Values.components.aggregator.labels + ) + ) | nindent 2 }} + {{- include "cloudzero-agent.generateAnnotations" (dict + "root" . + "annotations" (list + .Values.defaults.annotations + .Values.components.aggregator.annotations + ) + ) | nindent 2 }} +spec: + # The service FQDN we are configuring routing for + hosts: + - {{ include "cloudzero-agent.aggregator.name" . }}.{{ .Release.Namespace }}.svc.cluster.local + # Route all HTTP traffic to the local-cluster subset + http: + - route: + - destination: + host: {{ include "cloudzero-agent.aggregator.name" . }}.{{ .Release.Namespace }}.svc.cluster.local + subset: local-cluster + weight: 100 +{{- end }} diff --git a/helm/templates/backfill-job.yaml b/helm/templates/backfill-job.yaml index 6df47ee4f..f580433b3 100644 --- a/helm/templates/backfill-job.yaml +++ b/helm/templates/backfill-job.yaml @@ -26,6 +26,25 @@ */ -}} {{- range $jobType := list "CronJob" "Job" }} {{- $jobCategory := ternary "cronjob" "onetime" (eq $jobType "CronJob") }} +{{- /* + Istio mTLS Port Exclusion + + When Istio is detected and cert-manager is not in use, exclude port 443 from + Envoy proxying. The backfill job connects to the webhook service on port 443 + (the service port, which Kubernetes routes to pod port 8443). Without this + exclusion, Istio wraps the connection in mTLS, but the webhook uses a + self-signed certificate that Istio doesn't recognize, causing TLS failures. + + Port 443 is excluded (not 8443) because Istio's sidecar intercepts outbound + traffic based on the destination port the application connects to - the + service port, not the pod's target port. + + See helm/docs/istio.md for full documentation. 
+*/ -}} +{{- $istioAnnotations := dict -}} +{{- if and (include "cloudzero-agent.Values.integrations.istio.enabled" $) (not $.Values.insightsController.tls.useCertManager) -}} +{{- $istioAnnotations = dict "traffic.sidecar.istio.io/excludeOutboundPorts" "443" -}} +{{- end }} apiVersion: batch/v1 kind: {{ $jobType }} metadata: @@ -80,6 +99,7 @@ spec: {{- include "cloudzero-agent.generateAnnotations" (dict "root" $ "annotations" (list + $istioAnnotations $.Values.defaults.annotations $.Values.components.webhookServer.annotations $.Values.components.webhookServer.backfill.annotations @@ -108,6 +128,7 @@ spec: {{- include "cloudzero-agent.generateAnnotations" (dict "root" $ "annotations" (list + $istioAnnotations $.Values.defaults.annotations $.Values.components.webhookServer.annotations $.Values.components.webhookServer.backfill.annotations diff --git a/helm/templates/validator-cm.yaml b/helm/templates/validator-cm.yaml index 356ae44a1..c1a409274 100644 --- a/helm/templates/validator-cm.yaml +++ b/helm/templates/validator-cm.yaml @@ -45,6 +45,12 @@ data: insights_service: {{ include "cloudzero-agent.insightsController.server.webhookFullname" . }}-svc collector_service: {{ include "cloudzero-agent.aggregator.name" . }} + integrations: + istio: + # Use explicit value only (no fallback) so validator can distinguish user intent + # The validator receives clusterName separately via deployment.cluster_name + cluster_id: {{ .Values.integrations.istio.clusterID | quote }} + prometheus: {{- if .Values.validator.serviceEndpoints.kubeStateMetrics }} kube_state_metrics_service_endpoint: http://{{ .Values.validator.serviceEndpoints.kubeStateMetrics }}/ @@ -74,6 +80,7 @@ data: - prometheus_version - scrape_cfg - webhook_server_reachable + - istio_xcluster_lb - name: pre-stop enforce: false checks: diff --git a/helm/templates/webhook-deploy.yaml b/helm/templates/webhook-deploy.yaml index d50909bd3..ba45b5020 100644 --- a/helm/templates/webhook-deploy.yaml +++ b/helm/templates/webhook-deploy.yaml @@ -66,18 +66,33 @@ spec: .Values.components.webhookServer.podLabels ) ) | nindent 6 }} + {{- /* + Istio mTLS Port Exclusion + + When Istio is detected and cert-manager is not in use, exclude port 8443 + from Envoy proxying. The Kubernetes API server sends admission webhook + requests expecting the webhook's self-signed TLS certificate. Without this + exclusion, Istio intercepts the traffic and presents its own mTLS certificate, + which the API server doesn't trust, causing webhook calls to fail. + + Port 8443 is excluded (not 443) because Istio's sidecar intercepts inbound + traffic at the pod level, where traffic arrives on the target port (8443), + not the service port (443). + + See helm/docs/istio.md for full documentation. + */ -}} {{- $istioAnnotations := dict -}} - {{- if not .Values.insightsController.server.suppressIstioAnnotations -}} - {{- $istioAnnotations = dict "sidecar.istio.io/inject" "false" -}} + {{- if and (include "cloudzero-agent.Values.integrations.istio.enabled" .) (not .Values.insightsController.tls.useCertManager) -}} + {{- $istioAnnotations = dict "traffic.sidecar.istio.io/excludeInboundPorts" "8443" -}} {{- end -}} {{- include "cloudzero-agent.generateAnnotations" (dict "root" . 
"annotations" (list + $istioAnnotations .Values.defaults.annotations .Values.insightsController.server.podAnnotations .Values.components.webhookServer.annotations .Values.components.webhookServer.podAnnotations - $istioAnnotations (dict "checksum/config" (include "cloudzero-agent.configurationChecksum" .)) ) ) | nindent 6 }} diff --git a/helm/tests/istio_integration_test.yaml b/helm/tests/istio_integration_test.yaml new file mode 100644 index 000000000..f44296082 --- /dev/null +++ b/helm/tests/istio_integration_test.yaml @@ -0,0 +1,547 @@ +suite: test Istio integration +templates: + - aggregator-destinationrule.yaml + - aggregator-virtualservice.yaml + - webhook-deploy.yaml + - backfill-job.yaml +tests: + # ============================================================================ + # DestinationRule Auto-Detection Tests + # ============================================================================ + - it: should create aggregator DestinationRule when Istio CRDs detected (auto mode) + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + asserts: + - hasDocuments: + count: 1 + - isKind: + of: DestinationRule + - equal: + path: spec.subsets[0].name + value: local-cluster + - equal: + path: spec.subsets[0].labels["topology.istio.io/cluster"] + value: "test-cluster" + + - it: should NOT create aggregator DestinationRule when no Istio CRDs (auto mode) + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: [] + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + asserts: + - hasDocuments: + count: 0 + + # ============================================================================ + # VirtualService Auto-Detection Tests + # ============================================================================ + - it: should create aggregator VirtualService when Istio CRDs detected (auto mode) + template: aggregator-virtualservice.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + asserts: + - hasDocuments: + count: 1 + - isKind: + of: VirtualService + - equal: + path: spec.http[0].route[0].destination.subset + value: local-cluster + - equal: + path: spec.http[0].route[0].weight + value: 100 + + - it: should NOT create aggregator VirtualService when no Istio CRDs (auto mode) + template: aggregator-virtualservice.yaml + capabilities: + apiVersions: [] + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + asserts: + - hasDocuments: + count: 0 + + # ============================================================================ + # Explicit Enable/Disable Tests + # ============================================================================ + - it: should create aggregator DestinationRule when explicitly enabled + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + integrations.istio.enabled: true + asserts: + - hasDocuments: + count: 1 + - isKind: + of: DestinationRule + + - it: should NOT create aggregator DestinationRule when explicitly disabled (with CRDs) + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" 
+ integrations.istio.enabled: false + asserts: + - hasDocuments: + count: 0 + + - it: should create aggregator VirtualService when explicitly enabled + template: aggregator-virtualservice.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + integrations.istio.enabled: true + asserts: + - hasDocuments: + count: 1 + - isKind: + of: VirtualService + + - it: should NOT create aggregator VirtualService when explicitly disabled (with CRDs) + template: aggregator-virtualservice.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + integrations.istio.enabled: false + asserts: + - hasDocuments: + count: 0 + + # ============================================================================ + # Cluster ID Configuration Tests + # ============================================================================ + - it: should use integrations.istio.clusterID when set + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "my-cluster-name" + integrations.istio.clusterID: "my-istio-cluster-id" + asserts: + - equal: + path: spec.subsets[0].labels["topology.istio.io/cluster"] + value: "my-istio-cluster-id" + + - it: should fall back to clusterName when integrations.istio.clusterID is not set + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "fallback-cluster-name" + asserts: + - equal: + path: spec.subsets[0].labels["topology.istio.io/cluster"] + value: "fallback-cluster-name" + + # ============================================================================ + # Optional Cluster ID Tests (no clusterID = no routing rules, but no error) + # ============================================================================ + - it: should NOT create DestinationRule when Istio detected but no clusterID and no clusterName + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "" + integrations.istio.clusterID: "" + asserts: + - hasDocuments: + count: 0 + + - it: should NOT create VirtualService when Istio detected but no clusterID and no clusterName + template: aggregator-virtualservice.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "" + integrations.istio.clusterID: "" + asserts: + - hasDocuments: + count: 0 + + - it: should NOT create DestinationRule when explicitly enabled but no clusterID + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "" + integrations.istio.enabled: true + integrations.istio.clusterID: "" + asserts: + - hasDocuments: + count: 0 + + - it: should NOT create VirtualService when explicitly enabled but no clusterID + template: aggregator-virtualservice.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "" + integrations.istio.enabled: true + integrations.istio.clusterID: "" + asserts: + - hasDocuments: + count: 0 + + # 
============================================================================ + # Webhook Port Exclusion Annotation Tests + # ============================================================================ + - it: webhook should have excludeInboundPorts annotation when Istio detected and useCertManager=false + template: webhook-deploy.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + insightsController.tls.useCertManager: false + asserts: + - equal: + path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeInboundPorts"] + value: "8443" + + - it: webhook should NOT have excludeInboundPorts annotation when Istio detected and useCertManager=true + template: webhook-deploy.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + insightsController.tls.useCertManager: true + asserts: + - isNull: + path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeInboundPorts"] + + - it: webhook should NOT have excludeInboundPorts annotation when Istio not detected + template: webhook-deploy.yaml + capabilities: + apiVersions: [] + set: + apiKey: "test-key" + existingSecretName: null + insightsController.tls.useCertManager: false + asserts: + - isNull: + path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeInboundPorts"] + + - it: webhook should NOT have excludeInboundPorts annotation when Istio explicitly disabled + template: webhook-deploy.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + insightsController.tls.useCertManager: false + integrations.istio.enabled: false + asserts: + - isNull: + path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeInboundPorts"] + + # ============================================================================ + # Backfill Job Port Exclusion Annotation Tests (CronJob) + # ============================================================================ + - it: backfill cronjob should have excludeOutboundPorts annotation when Istio detected and useCertManager=false + template: backfill-job.yaml + documentIndex: 0 + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + insightsController.tls.useCertManager: false + insightsController.enabled: true + asserts: + - equal: + path: spec.jobTemplate.spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeOutboundPorts"] + value: "443" + + - it: backfill cronjob should NOT have excludeOutboundPorts annotation when Istio detected and useCertManager=true + template: backfill-job.yaml + documentIndex: 0 + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + insightsController.tls.useCertManager: true + insightsController.enabled: true + asserts: + - isNull: + path: spec.jobTemplate.spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeOutboundPorts"] + + - it: backfill cronjob should NOT have excludeOutboundPorts annotation when Istio not detected + template: backfill-job.yaml + documentIndex: 0 + capabilities: + apiVersions: [] + set: + apiKey: "test-key" + existingSecretName: null + insightsController.tls.useCertManager: false + insightsController.enabled: true + asserts: + - 
isNull: + path: spec.jobTemplate.spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeOutboundPorts"] + + # ============================================================================ + # Backfill Job Port Exclusion Annotation Tests (Job) + # ============================================================================ + - it: backfill job should have excludeOutboundPorts annotation when Istio detected and useCertManager=false + template: backfill-job.yaml + documentIndex: 1 + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + insightsController.tls.useCertManager: false + insightsController.enabled: true + asserts: + - equal: + path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeOutboundPorts"] + value: "443" + + - it: backfill job should NOT have excludeOutboundPorts annotation when Istio detected and useCertManager=true + template: backfill-job.yaml + documentIndex: 1 + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + insightsController.tls.useCertManager: true + insightsController.enabled: true + asserts: + - isNull: + path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeOutboundPorts"] + + - it: backfill job should NOT have excludeOutboundPorts annotation when Istio not detected + template: backfill-job.yaml + documentIndex: 1 + capabilities: + apiVersions: [] + set: + apiKey: "test-key" + existingSecretName: null + insightsController.tls.useCertManager: false + insightsController.enabled: true + asserts: + - isNull: + path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeOutboundPorts"] + + # ============================================================================ + # Annotation Priority Tests (user can override Istio annotations) + # ============================================================================ + - it: webhook podAnnotations should override auto-generated Istio annotations + template: webhook-deploy.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + insightsController.tls.useCertManager: false + components.webhookServer.podAnnotations: + traffic.sidecar.istio.io/excludeInboundPorts: "8443,9090" + asserts: + - equal: + path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeInboundPorts"] + value: "8443,9090" + + - it: backfill podAnnotations should override auto-generated Istio annotations + template: backfill-job.yaml + documentIndex: 0 + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + insightsController.tls.useCertManager: false + insightsController.enabled: true + components.webhookServer.backfill.podAnnotations: + traffic.sidecar.istio.io/excludeOutboundPorts: "443,8443" + asserts: + - equal: + path: spec.jobTemplate.spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeOutboundPorts"] + value: "443,8443" + + # ============================================================================ + # Alternative CRD Detection Tests + # ============================================================================ + - it: should detect Istio via networking.istio.io/v1 CRD + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - networking.istio.io/v1 
+ set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + asserts: + - hasDocuments: + count: 1 + + - it: should detect Istio via security.istio.io/v1 CRD + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - security.istio.io/v1 + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + asserts: + - hasDocuments: + count: 1 + + # ============================================================================ + # API Version Selection Tests + # ============================================================================ + - it: should use networking.istio.io/v1 when available + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - networking.istio.io/v1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + asserts: + - equal: + path: apiVersion + value: networking.istio.io/v1 + + - it: should use networking.istio.io/v1beta1 when v1 not available + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + asserts: + - equal: + path: apiVersion + value: networking.istio.io/v1beta1 + + - it: should prefer networking.istio.io/v1 when both v1 and v1beta1 are available + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - networking.istio.io/v1 + - networking.istio.io/v1beta1 + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + asserts: + - equal: + path: apiVersion + value: networking.istio.io/v1 + + # ============================================================================ + # DestinationRule/VirtualService Content Validation Tests + # ============================================================================ + - it: aggregator DestinationRule should have correct host + template: aggregator-destinationrule.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + release: + name: my-release + namespace: my-namespace + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + asserts: + - equal: + path: spec.host + value: my-release-cz-aggregator.my-namespace.svc.cluster.local + + - it: aggregator VirtualService should have correct hosts and destinations + template: aggregator-virtualservice.yaml + capabilities: + apiVersions: + - networking.istio.io/v1beta1 + release: + name: my-release + namespace: my-namespace + set: + apiKey: "test-key" + existingSecretName: null + clusterName: "test-cluster" + asserts: + - contains: + path: spec.hosts + content: my-release-cz-aggregator.my-namespace.svc.cluster.local + - equal: + path: spec.http[0].route[0].destination.host + value: my-release-cz-aggregator.my-namespace.svc.cluster.local diff --git a/helm/values.schema.json b/helm/values.schema.json index c8f105485..974a25f1a 100644 --- a/helm/values.schema.json +++ b/helm/values.schema.json @@ -6646,8 +6646,16 @@ "default": "1m" }, "suppressIstioAnnotations": { - "default": false, - "type": "boolean" + "default": null, + "deprecated": true, + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + } + ] }, "tolerations": { "$ref": "#/$defs/com.cloudzero.agent.tolerations" @@ -6756,6 +6764,33 @@ }, "type": "object" }, + "integrations": { + "additionalProperties": false, + "properties": { + "istio": { + "additionalProperties": false, + "properties": { + "clusterID": { + "default": null, + "type": 
["string", "null"] + }, + "enabled": { + "default": null, + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + } + ] + } + }, + "type": "object" + } + }, + "type": "object" + }, "jobConfigID": { "type": ["string", "null"] }, diff --git a/helm/values.schema.yaml b/helm/values.schema.yaml index dbea2e266..c1b8c295d 100644 --- a/helm/values.schema.yaml +++ b/helm/values.schema.yaml @@ -1895,13 +1895,18 @@ properties: Annotations to add to the webhook server pods. $ref: "#/$defs/io.k8s.apimachinery.pkg.apis.meta.v1.ObjectMeta/properties/annotations" suppressIstioAnnotations: + deprecated: true description: | - Whether to suppress Istio-related annotations on webhook server pods. - When false (default), the sidecar.istio.io/inject: "false" annotation is added - to prevent Istio sidecar injection which can interfere with webhook TLS. - Set to true to disable this behavior and allow Istio sidecar injection. - type: boolean - default: false + **DEPRECATED**: This setting no longer has any effect and will be removed + in a future release. + + Istio integration is now controlled via `integrations.istio.enabled` and + the chart automatically detects Istio and applies appropriate port exclusion + annotations when needed. See helm/docs/istio.md for details. + oneOf: + - type: "null" + - type: boolean + default: null volumeMounts: description: | Additional volume mounts to add to the insights controller pods. @@ -1986,6 +1991,70 @@ properties: description: | Override the name of the ConfigMap used for configuration. + integrations: + type: object + additionalProperties: false + description: | + Integration configuration for external systems and service meshes. + + These settings control how the CloudZero Agent integrates with other + infrastructure components like Istio service mesh. + properties: + istio: + type: object + additionalProperties: false + description: | + Istio service mesh integration settings. + + When Istio is detected or enabled, the chart: + - Creates a DestinationRule and VirtualService for cluster-local service + isolation (prevents cross-cluster load balancing in multi-cluster mesh environments) + - Allows Istio sidecar injection on webhook pods + + For more details, see helm/docs/istio.md + properties: + enabled: + oneOf: + - type: "null" + - type: boolean + default: null + description: | + Controls Istio integration behavior. + + Possible values: + - null (default): Auto-detect Istio via CRD presence in the cluster + - true: Force Istio integration enabled + - false: Force Istio integration disabled + clusterID: + type: + - string + - "null" + default: null + description: | + Istio cluster ID for multicluster mesh environments. + + In an Istio multicluster mesh, each cluster has a unique cluster ID that + identifies it within the mesh. This value is used by the DestinationRule + to pin traffic to pods in the local cluster using the + `topology.istio.io/cluster` label that Istio automatically adds to pods. + + If not set, falls back to the top-level `clusterName` value. At least one + of these must be set when Istio integration is enabled. + + Note that, if cross-cluster load balancing is enabled on your cluster, + this *MUST* be set to the correct value. + + To find your Istio cluster ID, use istioctl to query any Istio-injected + pod's bootstrap config: + + istioctl proxy-config bootstrap . 
-o yaml | \ + grep 'CLUSTER_ID:' | awk '{print $2}' + + Or check the istiod deployment (assumes istio-system namespace): + + kubectl -n istio-system get deploy istiod \ + -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="CLUSTER_ID")].value}' + kubeStateMetrics: type: object description: | diff --git a/helm/values.yaml b/helm/values.yaml index 0fab9516c..a8ede938f 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -206,6 +206,42 @@ defaults: runAsGroup: 65534 fsGroup: 65534 +# Integration configuration for external systems and service meshes. +integrations: + # Istio service mesh integration settings. + istio: + # Controls Istio integration behavior. + # + # Possible values: + # - null (default): Auto-detect Istio via CRD presence in the cluster + # - true: Force Istio integration enabled + # - false: Force Istio integration disabled + enabled: null + # Istio cluster ID for multicluster mesh environments. + # + # In an Istio multicluster mesh, each cluster has a unique cluster ID that + # identifies it within the mesh. This value is used by the DestinationRule + # to pin traffic to pods in the local cluster using the + # `topology.istio.io/cluster` label that Istio automatically adds to pods. + # + # If not set, falls back to the top-level `clusterName` value. At least one + # of these must be set when Istio integration is enabled. + # + # Note that, if cross-cluster load balancing is enabled on your cluster, + # this *MUST* be set to the correct value. + # + # To find your Istio cluster ID, use istioctl to query any Istio-injected + # pod's bootstrap config: + # + # istioctl proxy-config bootstrap . -o yaml | \ + # grep 'CLUSTER_ID:' | awk '{print $2}' + # + # Or check the istiod deployment (assumes istio-system namespace): + # + # kubectl -n istio-system get deploy istiod \ + # -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="CLUSTER_ID")].value}' + clusterID: null + # Component-specific configuration settings. components: # The agent here refers to the CloudZero Agent, which is the component that @@ -598,7 +634,6 @@ components: limits: memory: "1024Mi" cpu: "2000m" - # Settings for the webhook server. webhookServer: replicas: null @@ -1213,13 +1248,6 @@ insightsController: deploymentAnnotations: {} # Annotations to add to the webhook server pods. podAnnotations: {} - # Whether to suppress Istio-related annotations on webhook server pods. When - # false (default), the sidecar.istio.io/inject: "false" annotation is added - # to prevent Istio sidecar injection which can interfere with webhook TLS. - # Set to true to disable this behavior and allow Istio sidecar injection. - # For additional information, see: - # https://github.com/Cloudzero/cloudzero-charts/blob/develop/charts/cloudzero-agent/docs/istio.md - suppressIstioAnnotations: false # Additional volume mounts to add to the insights controller pods. volumeMounts: [] # Additional volumes to add to the insights controller pods. 
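On the chart side, auto-detection relies on Helm's `.Capabilities.APIVersions`, which only reflects what the API server reports at template time. A runtime component can perform the equivalent check with the Kubernetes discovery API; the sketch below is illustrative client-go usage, not code from this repository.

```go
package main

import (
	"fmt"

	"k8s.io/client-go/discovery"
	"k8s.io/client-go/rest"
)

// istioDetected reports whether the API server serves any version of the
// networking.istio.io group, analogous to the chart's
// `.Capabilities.APIVersions.Has "networking.istio.io/v1"` check.
func istioDetected(dc discovery.DiscoveryInterface) (bool, error) {
	groups, err := dc.ServerGroups()
	if err != nil {
		return false, err
	}
	for _, g := range groups.Groups {
		if g.Name == "networking.istio.io" {
			return true, nil
		}
	}
	return false, nil
}

func main() {
	cfg, err := rest.InClusterConfig()
	if err != nil {
		panic(err)
	}
	dc, err := discovery.NewDiscoveryClientForConfig(cfg)
	if err != nil {
		panic(err)
	}
	detected, err := istioDetected(dc)
	if err != nil {
		panic(err)
	}
	fmt.Println("Istio CRDs detected:", detected)
}
```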
diff --git a/tests/helm/schema/insightsController.server.suppressIstioAnnotations.null.pass.yaml b/tests/helm/schema/insightsController.server.suppressIstioAnnotations.null.pass.yaml new file mode 100644 index 000000000..0749d33b7 --- /dev/null +++ b/tests/helm/schema/insightsController.server.suppressIstioAnnotations.null.pass.yaml @@ -0,0 +1,3 @@ +insightsController: + server: + suppressIstioAnnotations: null diff --git a/tests/helm/schema/integrations.istio.additionalProperty.fail.yaml b/tests/helm/schema/integrations.istio.additionalProperty.fail.yaml new file mode 100644 index 000000000..eb8f9ad6b --- /dev/null +++ b/tests/helm/schema/integrations.istio.additionalProperty.fail.yaml @@ -0,0 +1,5 @@ +# integrations.istio does not allow additional properties +integrations: + istio: + enabled: true + unknownProperty: "should fail" diff --git a/tests/helm/schema/integrations.istio.clusterID.empty.pass.yaml b/tests/helm/schema/integrations.istio.clusterID.empty.pass.yaml new file mode 100644 index 000000000..debbb3764 --- /dev/null +++ b/tests/helm/schema/integrations.istio.clusterID.empty.pass.yaml @@ -0,0 +1,4 @@ +# Empty string is valid - means no cluster ID configured +integrations: + istio: + clusterID: "" diff --git a/tests/helm/schema/integrations.istio.clusterID.invalid.fail.yaml b/tests/helm/schema/integrations.istio.clusterID.invalid.fail.yaml new file mode 100644 index 000000000..3a549adf6 --- /dev/null +++ b/tests/helm/schema/integrations.istio.clusterID.invalid.fail.yaml @@ -0,0 +1,4 @@ +# clusterID must be a string or null - not a number +integrations: + istio: + clusterID: 12345 diff --git a/tests/helm/schema/integrations.istio.clusterID.null.pass.yaml b/tests/helm/schema/integrations.istio.clusterID.null.pass.yaml new file mode 100644 index 000000000..70f397bb4 --- /dev/null +++ b/tests/helm/schema/integrations.istio.clusterID.null.pass.yaml @@ -0,0 +1,3 @@ +integrations: + istio: + clusterID: null diff --git a/tests/helm/schema/integrations.istio.clusterID.valid.pass.yaml b/tests/helm/schema/integrations.istio.clusterID.valid.pass.yaml new file mode 100644 index 000000000..5782bb742 --- /dev/null +++ b/tests/helm/schema/integrations.istio.clusterID.valid.pass.yaml @@ -0,0 +1,3 @@ +integrations: + istio: + clusterID: "my-istio-cluster-id" diff --git a/tests/helm/schema/integrations.istio.enabled.false.pass.yaml b/tests/helm/schema/integrations.istio.enabled.false.pass.yaml new file mode 100644 index 000000000..c59c253fa --- /dev/null +++ b/tests/helm/schema/integrations.istio.enabled.false.pass.yaml @@ -0,0 +1,3 @@ +integrations: + istio: + enabled: false diff --git a/tests/helm/schema/integrations.istio.enabled.invalid.fail.yaml b/tests/helm/schema/integrations.istio.enabled.invalid.fail.yaml new file mode 100644 index 000000000..aa1ce8779 --- /dev/null +++ b/tests/helm/schema/integrations.istio.enabled.invalid.fail.yaml @@ -0,0 +1,4 @@ +# enabled must be null, true, or false - not a string +integrations: + istio: + enabled: "yes" diff --git a/tests/helm/schema/integrations.istio.enabled.null.pass.yaml b/tests/helm/schema/integrations.istio.enabled.null.pass.yaml new file mode 100644 index 000000000..1944a9b3d --- /dev/null +++ b/tests/helm/schema/integrations.istio.enabled.null.pass.yaml @@ -0,0 +1,3 @@ +integrations: + istio: + enabled: null diff --git a/tests/helm/schema/integrations.istio.enabled.true.pass.yaml b/tests/helm/schema/integrations.istio.enabled.true.pass.yaml new file mode 100644 index 000000000..5765c21a4 --- /dev/null +++ 
b/tests/helm/schema/integrations.istio.enabled.true.pass.yaml @@ -0,0 +1,3 @@ +integrations: + istio: + enabled: true diff --git a/tests/helm/template/alloy.yaml b/tests/helm/template/alloy.yaml index b370ff204..cb97b15b6 100644 --- a/tests/helm/template/alloy.yaml +++ b/tests/helm/template/alloy.yaml @@ -1144,7 +1144,6 @@ data: replicaCount: null send_interval: 1m send_timeout: 1m - suppressIstioAnnotations: false tolerations: [] write_timeout: 10s service: @@ -1168,6 +1167,10 @@ data: namespaceSelector: {} path: /validate timeoutSeconds: 1 + integrations: + istio: + clusterID: null + enabled: null jobConfigID: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E kubeStateMetrics: affinity: {} @@ -1587,6 +1590,12 @@ data: insights_service: cz-agent-cz-webhook-svc collector_service: cz-agent-cz-aggregator + integrations: + istio: + # Use explicit value only (no fallback) so validator can distinguish user intent + # The validator receives clusterName separately via deployment.cluster_name + cluster_id: + prometheus: kube_state_metrics_service_endpoint: http://cz-agent-cz-ksm.cz-agent.svc.cluster.local:8080 executable: /bin/prometheus @@ -1617,6 +1626,7 @@ data: - prometheus_version - scrape_cfg - webhook_server_reachable + - istio_xcluster_lb - name: pre-stop enforce: false checks: @@ -2274,6 +2284,14 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] command: - /app/cloudzero-agent-validator - install @@ -2309,6 +2327,14 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] command: - /checks/bin/cloudzero-agent-validator - diagnose @@ -2401,6 +2427,14 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] lifecycle: postStart: exec: @@ -2686,7 +2720,6 @@ spec: helm.sh/chart: cloudzero-agent-1.1.0-dev annotations: checksum/config: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E - sidecar.istio.io/inject: "false" spec: serviceAccountName: cz-agent-cz-server @@ -2903,6 +2936,14 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] command: - /app/cloudzero-cluster-config - load diff --git a/tests/helm/template/cert-manager.yaml b/tests/helm/template/cert-manager.yaml index fcaad8af2..1be1057bc 100644 --- a/tests/helm/template/cert-manager.yaml +++ b/tests/helm/template/cert-manager.yaml @@ -1105,7 +1105,6 @@ data: replicaCount: null send_interval: 1m send_timeout: 1m - suppressIstioAnnotations: false tolerations: [] write_timeout: 10s service: @@ -1129,6 +1128,10 @@ data: namespaceSelector: {} path: /validate timeoutSeconds: 1 + integrations: + istio: + clusterID: null + enabled: null jobConfigID: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E kubeStateMetrics: 
affinity: {} @@ -1548,6 +1551,12 @@ data: insights_service: cz-agent-cz-webhook-svc collector_service: cz-agent-cz-aggregator + integrations: + istio: + # Use explicit value only (no fallback) so validator can distinguish user intent + # The validator receives clusterName separately via deployment.cluster_name + cluster_id: + prometheus: kube_state_metrics_service_endpoint: http://cz-agent-cz-ksm.cz-agent.svc.cluster.local:8080 executable: /bin/prometheus @@ -1578,6 +1587,7 @@ data: - prometheus_version - scrape_cfg - webhook_server_reachable + - istio_xcluster_lb - name: pre-stop enforce: false checks: @@ -2235,6 +2245,14 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] command: - /app/cloudzero-agent-validator - install @@ -2270,6 +2288,14 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] command: - /checks/bin/cloudzero-agent-validator - diagnose @@ -2640,7 +2666,6 @@ spec: helm.sh/chart: cloudzero-agent-1.1.0-dev annotations: checksum/config: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E - sidecar.istio.io/inject: "false" spec: serviceAccountName: cz-agent-cz-server @@ -2857,6 +2882,14 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] command: - /app/cloudzero-cluster-config - load diff --git a/tests/helm/template/federated.yaml b/tests/helm/template/federated.yaml index 1553b531f..bd7577d7a 100644 --- a/tests/helm/template/federated.yaml +++ b/tests/helm/template/federated.yaml @@ -1172,7 +1172,6 @@ data: replicaCount: null send_interval: 1m send_timeout: 1m - suppressIstioAnnotations: false tolerations: [] write_timeout: 10s service: @@ -1196,6 +1195,10 @@ data: namespaceSelector: {} path: /validate timeoutSeconds: 1 + integrations: + istio: + clusterID: null + enabled: null jobConfigID: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E kubeStateMetrics: affinity: {} @@ -1615,6 +1618,12 @@ data: insights_service: cz-agent-cz-webhook-svc collector_service: cz-agent-cz-aggregator + integrations: + istio: + # Use explicit value only (no fallback) so validator can distinguish user intent + # The validator receives clusterName separately via deployment.cluster_name + cluster_id: + prometheus: kube_state_metrics_service_endpoint: http://cz-agent-cz-ksm.cz-agent.svc.cluster.local:8080 executable: /bin/prometheus @@ -1645,6 +1654,7 @@ data: - prometheus_version - scrape_cfg - webhook_server_reachable + - istio_xcluster_lb - name: pre-stop enforce: false checks: @@ -2497,6 +2507,14 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] command: - /app/cloudzero-agent-validator - install @@ -2532,6 +2550,14 @@ spec: valueFrom: fieldRef: 
fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] command: - /checks/bin/cloudzero-agent-validator - diagnose @@ -2902,7 +2928,6 @@ spec: helm.sh/chart: cloudzero-agent-1.1.0-dev annotations: checksum/config: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E - sidecar.istio.io/inject: "false" spec: serviceAccountName: cz-agent-cz-server @@ -3119,6 +3144,14 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] command: - /app/cloudzero-cluster-config - load diff --git a/tests/helm/template/istio-overrides.yml b/tests/helm/template/istio-overrides.yml new file mode 100644 index 000000000..5bc151b38 --- /dev/null +++ b/tests/helm/template/istio-overrides.yml @@ -0,0 +1,16 @@ +# Istio integration test configuration +# This tests the DestinationRule/VirtualService generation and port exclusion annotations + +cloudAccountId: "1234567890" +clusterName: "istio-test-cluster" +region: "us-east-1" +apiKey: "not-a-real-api-key" + +# For testing only, you should never use this property in production. +jobConfigID: "DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E" + +# Explicitly enable Istio integration (normally auto-detected via CRDs) +integrations: + istio: + enabled: true + clusterID: "istio-cluster-id-for-testing" diff --git a/tests/helm/template/istio.yaml b/tests/helm/template/istio.yaml new file mode 100644 index 000000000..d2b83eec8 --- /dev/null +++ b/tests/helm/template/istio.yaml @@ -0,0 +1,3315 @@ +--- +# Source: cloudzero-agent/charts/kubeStateMetrics/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: cz-agent-cz-ksm + namespace: cz-agent + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: ksm + app.kubernetes.io/name: ksm + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" +spec: + selector: + matchLabels: + app.kubernetes.io/name: ksm + minAvailable: 1 +--- +# Source: cloudzero-agent/templates/agent-pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: cz-agent-cz-server + namespace: cz-agent + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: server + app.kubernetes.io/instance: cz-agent +--- +# Source: cloudzero-agent/templates/aggregator-pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: cz-agent-cz-aggregator + namespace: cz-agent + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: aggregator + app.kubernetes.io/instance: cz-agent +--- +# Source: cloudzero-agent/templates/webhook-pdb.yaml +apiVersion: 
policy/v1 +kind: PodDisruptionBudget +metadata: + name: cz-agent-cz-webhook + namespace: cz-agent + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: webhook-server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: cz-agent +--- +# Source: cloudzero-agent/charts/kubeStateMetrics/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +automountServiceAccountToken: true +metadata: + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: ksm + app.kubernetes.io/name: ksm + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" + name: cz-agent-cz-ksm + namespace: cz-agent +--- +# Source: cloudzero-agent/templates/agent-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-cz-server + namespace: cz-agent +--- +# Source: cloudzero-agent/templates/webhook-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: webhook-server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-cz-webhook-init-cert + namespace: cz-agent +--- +# Source: cloudzero-agent/templates/aggregator-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-api-key + namespace: cz-agent +data: + value: "bm90LWEtcmVhbC1hcGkta2V5" +--- +# Source: cloudzero-agent/templates/webhook-tls-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: webhook-server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-cz-webhook-tls + namespace: cz-agent +--- +# Source: cloudzero-agent/templates/agent-cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + name: cz-agent-configuration + namespace: cz-agent + +data: + # Prometheus YAML configuration + prometheus.yml: |- + global: + scrape_interval: 60s + + storage: + tsdb: + out_of_order_time_window: 5m + + scrape_configs: + # Kube State Metrics Scrape Job + # static-kube-state-metrics + # + # Kube State Metrics provides the CloudZero Agent with information + # regarding the configuration and state of various Kubernetes objects + # (nodes, pods, etc.), including where they are located in the cluster. 
+ - job_name: static-kube-state-metrics + scrape_interval: 60s + + # Given a Kubernetes resource with a structure like: + # + # apiVersion: v1 + # kind: Service + # metadata: + # name: my-service + # namespace: my-namespace + # labels: + # app: my-app + # environment: production + # + # Kube State Metrics should provide labels such as: + # + # __meta_kubernetes_service_name: my-name + # __meta_kubernetes_namespace: my-namespace + # __meta_kubernetes_service_label_app: my-app + # __meta_kubernetes_service_label_environment: production + # + # We read these into the CloudZero Agent as: + # + # service: my-name + # namespace: my-namespace + # app: my-app + # environment: production + relabel_configs: + + # Relabel __meta_kubernetes_service_label_(.+) labels to $1. + - regex: __meta_kubernetes_service_label_(.+) + action: labelmap + + # Replace __meta_kubernetes_namespace labels with "namespace" + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + + # Replace __meta_kubernetes_service_name labels with "service" + - source_labels: [__meta_kubernetes_service_name] + target_label: service + + # Replace "__meta_kubernetes_pod_node_name" labels to "node" + - source_labels: [__meta_kubernetes_pod_node_name] + target_label: node + # We filter out all but a select few metrics and labels. + metric_relabel_configs: + + # Metric names to keep. + - source_labels: [__name__] + regex: ^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info)$ + action: keep + + # Metric labels to keep. + - regex: ^(board_asset_tag|container|created_by_kind|created_by_name|image|instance|name|namespace|node|node_kubernetes_io_instance_type|pod|product_name|provider_id|resource|unit|uid|_.*|label_.*|app.kubernetes.io/*|k8s.*)$ + action: labelkeep + + static_configs: + - targets: + - cz-agent-cz-ksm.cz-agent.svc.cluster.local:8080 + # cAdvisor Scrape Job cloudzero-nodes-cadvisor + # + # This job scrapes metrics about container resource usage (CPU, memory, + # network, etc.). + - job_name: cloudzero-nodes-cadvisor + + scrape_interval: 60s + scheme: https + + # cAdvisor endpoints are protected. In order to access them we need the + # credentials for the ServiceAccount. + authorization: + type: Bearer + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + + # Scrape metrics from cAdvisor. + relabel_configs: + + # Replace the value of __address__ labels with "kubernetes.default.svc.cluster.local:443" + - target_label: __address__ + replacement: kubernetes.default.svc.cluster.local:443 + + # Replace the value of __metrics_path__ in __meta_kubernetes_node_name with + # "/api/v1/nodes/$1/proxy/metrics/cadvisor" + - source_labels: [__meta_kubernetes_node_name] + target_label: __metrics_path__ + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + + # Remove "__meta_kubernetes_node_label_" prefix from labels. + - regex: __meta_kubernetes_node_label_(.+) + action: labelmap + + # Replace __meta_kubernetes_node_name labels with "node" + - source_labels: [__meta_kubernetes_node_name] + target_label: node + + # We only want to keep a select few labels. + metric_relabel_configs: + + # Labels to keep. 
+ - action: labelkeep + regex: ^(board_asset_tag|container|created_by_kind|created_by_name|image|instance|name|namespace|node|node_kubernetes_io_instance_type|pod|product_name|provider_id|resource|unit|uid|_.*|label_.*|app.kubernetes.io/*|k8s.*)$ + + # Metrics to keep. + - source_labels: [__name__] + regex: ^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent)$ + action: keep + + kubernetes_sd_configs: + - role: node + kubeconfig_file: "" + - job_name: cloudzero-webhook-job + scheme: https + tls_config: + insecure_skip_verify: true + + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + + relabel_configs: + # Keep __meta_kubernetes_endpoints_name labels. + - source_labels: [__meta_kubernetes_endpoints_name] + action: keep + regex: cz-agent-cz-webhook + + metric_relabel_configs: + # Metrics to keep. + - source_labels: [__name__] + regex: "^(go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|remote_write_timeseries_total|remote_write_response_codes_total|remote_write_payload_size_bytes|remote_write_failures_total|remote_write_records_processed_total|remote_write_db_failures_total|http_requests_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total)$" + action: keep + - job_name: cloudzero-aggregator-job + scrape_interval: 120s + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + namespaces: + names: + - cz-agent + relabel_configs: + - source_labels: [__meta_kubernetes_service_name] + action: keep + regex: cz-agent-cz-aggregator + - source_labels: [__meta_kubernetes_pod_container_port_name] + action: keep + regex: port-(shipper|collector) + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent|kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|go_gc_duration_seconds|go_gc_duration_seconds_count|go_gc_duration_seconds_sum|go_gc_gogc_percent|go_gc_gomemlimit_bytes|go_goroutines|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_stack_inuse_bytes|go_threads|http_request_duration_seconds_bucket|http_request_duration_seconds_count|http_request_duration_seconds_sum|http_requests_total|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_exemplars_in_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_in_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_remote_storage_string_interner_zero_reference_releases_total|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds|promhttp_metric_handler_requests_in_flight|promhttp_metric_handler_requests_total|remote_write_db_failures_total|remote_write_failures_total|remote_write_payload_size_bytes|remote_write_records_processed_total|remote_write_response_codes_total|remote_write_timeseries_total|storage_write_failure_total|czo_webhook_typ
es_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|function_execution_seconds|shipper_shutdown_total|shipper_new_files_error_total|shipper_new_files_processing_current|shipper_handle_request_file_count|shipper_handle_request_success_total|shipper_presigned_url_error_total|shipper_replay_request_total|shipper_replay_request_current|shipper_replay_request_file_count|shipper_replay_request_error_total|shipper_replay_request_abandon_files_total|shipper_replay_request_abandon_files_error_total|shipper_disk_total_size_bytes|shipper_current_disk_usage_bytes|shipper_current_disk_usage_percentage|shipper_current_disk_unsent_file|shipper_current_disk_sent_file|shipper_disk_replay_request_current|shipper_disk_cleanup_failure_total|shipper_disk_cleanup_success_total|shipper_disk_cleanup_percentage)$|^(cloudzero_|czo_)" + action: keep + - job_name: static-prometheus + scrape_interval: 120s + static_configs: + - targets: + - localhost:9090 + metric_relabel_configs: + - source_labels: [__name__] + regex: "^(go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds)$" + action: keep + remote_write: + - url: 
'http://cz-agent-cz-aggregator.cz-agent.svc.cluster.local/collector' + authorization: + credentials_file: /etc/config/secrets/value + write_relabel_configs: + - source_labels: [__name__] + regex: "^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|remote_write_timeseries_total|remote_write_response_codes_total|remote_write_payload_size_bytes|remote_write_failures_total|remote_write_records_processed_total|remote_write_db_failures_total|http_requests_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheu
s_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds)$" + action: keep + metadata_config: + send: false +--- +# Source: cloudzero-agent/templates/aggregator-cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: cz-agent-cz-aggregator + namespace: cz-agent + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +data: + config.yml: |- + cloud_account_id: 1234567890 + cluster_name: istio-test-cluster + region: us-east-1 + + metrics: + + cost: + - pattern: "container_cpu_usage_seconds_total" + match: exact + - pattern: "container_memory_working_set_bytes" + match: exact + - pattern: "container_network_receive_bytes_total" + match: exact + - pattern: "container_network_transmit_bytes_total" + match: exact + - pattern: "container_resources_gpu_usage_percent" + match: exact + - pattern: "container_resources_gpu_memory_usage_percent" + match: exact + - pattern: "kube_node_info" + match: exact + - pattern: "kube_node_status_capacity" + match: exact + - pattern: "kube_pod_container_resource_limits" + match: exact + - pattern: "kube_pod_container_resource_requests" + match: exact + - pattern: "kube_pod_labels" + match: exact + - pattern: "kube_pod_info" + match: exact + - pattern: "cloudzero_" + match: prefix + + cost_labels: + - pattern: "board_asset_tag" + match: exact + - pattern: "container" + match: exact + - pattern: "created_by_kind" + match: exact + - pattern: "created_by_name" + match: exact + - pattern: "image" + match: exact + - pattern: "instance" + match: exact + - pattern: "name" + match: exact + - pattern: "namespace" + match: exact + - pattern: "node" + match: exact + - pattern: "node_kubernetes_io_instance_type" + match: exact + - pattern: "pod" + match: exact + - pattern: "product_name" + match: exact + - pattern: "provider_id" + match: exact + - pattern: "resource" + match: exact + - pattern: "resource_type" + match: exact + - pattern: "unit" + match: exact + - pattern: "uid" + match: exact + - pattern: "workload" + match: exact + - pattern: "_" + match: prefix + - pattern: "label_" + match: prefix + - pattern: "app.kubernetes.io/" + match: prefix + - pattern: "k8s." 
+ match: prefix + + observability: + - pattern: "go_gc_duration_seconds" + match: exact + - pattern: "go_gc_duration_seconds_count" + match: exact + - pattern: "go_gc_duration_seconds_sum" + match: exact + - pattern: "go_gc_gogc_percent" + match: exact + - pattern: "go_gc_gomemlimit_bytes" + match: exact + - pattern: "go_goroutines" + match: exact + - pattern: "go_memstats_alloc_bytes" + match: exact + - pattern: "go_memstats_heap_alloc_bytes" + match: exact + - pattern: "go_memstats_heap_idle_bytes" + match: exact + - pattern: "go_memstats_heap_inuse_bytes" + match: exact + - pattern: "go_memstats_heap_objects" + match: exact + - pattern: "go_memstats_last_gc_time_seconds" + match: exact + - pattern: "go_memstats_stack_inuse_bytes" + match: exact + - pattern: "go_threads" + match: exact + - pattern: "http_request_duration_seconds_bucket" + match: exact + - pattern: "http_request_duration_seconds_count" + match: exact + - pattern: "http_request_duration_seconds_sum" + match: exact + - pattern: "http_requests_total" + match: exact + - pattern: "process_cpu_seconds_total" + match: exact + - pattern: "process_max_fds" + match: exact + - pattern: "process_open_fds" + match: exact + - pattern: "process_resident_memory_bytes" + match: exact + - pattern: "process_start_time_seconds" + match: exact + - pattern: "process_virtual_memory_bytes" + match: exact + - pattern: "process_virtual_memory_max_bytes" + match: exact + - pattern: "prometheus_agent_corruptions_total" + match: exact + - pattern: "prometheus_api_remote_read_queries" + match: exact + - pattern: "prometheus_http_requests_total" + match: exact + - pattern: "prometheus_notifications_alertmanagers_discovered" + match: exact + - pattern: "prometheus_notifications_dropped_total" + match: exact + - pattern: "prometheus_remote_storage_bytes_total" + match: exact + - pattern: "prometheus_remote_storage_exemplars_in_total" + match: exact + - pattern: "prometheus_remote_storage_histograms_failed_total" + match: exact + - pattern: "prometheus_remote_storage_histograms_in_total" + match: exact + - pattern: "prometheus_remote_storage_histograms_total" + match: exact + - pattern: "prometheus_remote_storage_metadata_bytes_total" + match: exact + - pattern: "prometheus_remote_storage_metadata_failed_total" + match: exact + - pattern: "prometheus_remote_storage_metadata_retried_total" + match: exact + - pattern: "prometheus_remote_storage_metadata_total" + match: exact + - pattern: "prometheus_remote_storage_samples_dropped_total" + match: exact + - pattern: "prometheus_remote_storage_samples_failed_total" + match: exact + - pattern: "prometheus_remote_storage_samples_in_total" + match: exact + - pattern: "prometheus_remote_storage_samples_total" + match: exact + - pattern: "prometheus_remote_storage_shard_capacity" + match: exact + - pattern: "prometheus_remote_storage_shards" + match: exact + - pattern: "prometheus_remote_storage_shards_desired" + match: exact + - pattern: "prometheus_remote_storage_shards_max" + match: exact + - pattern: "prometheus_remote_storage_shards_min" + match: exact + - pattern: "prometheus_remote_storage_string_interner_zero_reference_releases_total" + match: exact + - pattern: "prometheus_sd_azure_cache_hit_total" + match: exact + - pattern: "prometheus_sd_azure_failures_total" + match: exact + - pattern: "prometheus_sd_discovered_targets" + match: exact + - pattern: "prometheus_sd_dns_lookup_failures_total" + match: exact + - pattern: "prometheus_sd_failed_configs" + match: exact + - pattern: 
"prometheus_sd_file_read_errors_total" + match: exact + - pattern: "prometheus_sd_file_scan_duration_seconds" + match: exact + - pattern: "prometheus_sd_file_watcher_errors_total" + match: exact + - pattern: "prometheus_sd_http_failures_total" + match: exact + - pattern: "prometheus_sd_kubernetes_events_total" + match: exact + - pattern: "prometheus_sd_kubernetes_http_request_duration_seconds" + match: exact + - pattern: "prometheus_sd_kubernetes_http_request_total" + match: exact + - pattern: "prometheus_sd_kubernetes_workqueue_depth" + match: exact + - pattern: "prometheus_sd_kubernetes_workqueue_items_total" + match: exact + - pattern: "prometheus_sd_kubernetes_workqueue_latency_seconds" + match: exact + - pattern: "prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds" + match: exact + - pattern: "prometheus_sd_kubernetes_workqueue_unfinished_work_seconds" + match: exact + - pattern: "prometheus_sd_kubernetes_workqueue_work_duration_seconds" + match: exact + - pattern: "prometheus_sd_received_updates_total" + match: exact + - pattern: "prometheus_sd_updates_delayed_total" + match: exact + - pattern: "prometheus_sd_updates_total" + match: exact + - pattern: "prometheus_target_scrape_pool_reloads_failed_total" + match: exact + - pattern: "prometheus_target_scrape_pool_reloads_total" + match: exact + - pattern: "prometheus_target_scrape_pool_sync_total" + match: exact + - pattern: "prometheus_target_scrape_pools_failed_total" + match: exact + - pattern: "prometheus_target_scrape_pools_total" + match: exact + - pattern: "prometheus_target_sync_failed_total" + match: exact + - pattern: "prometheus_target_sync_length_seconds" + match: exact + - pattern: "promhttp_metric_handler_requests_in_flight" + match: exact + - pattern: "promhttp_metric_handler_requests_total" + match: exact + - pattern: "remote_write_db_failures_total" + match: exact + - pattern: "remote_write_failures_total" + match: exact + - pattern: "remote_write_payload_size_bytes" + match: exact + - pattern: "remote_write_records_processed_total" + match: exact + - pattern: "remote_write_response_codes_total" + match: exact + - pattern: "remote_write_timeseries_total" + match: exact + - pattern: "storage_write_failure_total" + match: exact + - pattern: "czo_webhook_types_total" + match: exact + - pattern: "czo_storage_types_total" + match: exact + - pattern: "czo_ingress_types_total" + match: exact + - pattern: "czo_gateway_types_total" + match: exact + - pattern: "function_execution_seconds" + match: exact + - pattern: "shipper_shutdown_total" + match: exact + - pattern: "shipper_new_files_error_total" + match: exact + - pattern: "shipper_new_files_processing_current" + match: exact + - pattern: "shipper_handle_request_file_count" + match: exact + - pattern: "shipper_handle_request_success_total" + match: exact + - pattern: "shipper_presigned_url_error_total" + match: exact + - pattern: "shipper_replay_request_total" + match: exact + - pattern: "shipper_replay_request_current" + match: exact + - pattern: "shipper_replay_request_file_count" + match: exact + - pattern: "shipper_replay_request_error_total" + match: exact + - pattern: "shipper_replay_request_abandon_files_total" + match: exact + - pattern: "shipper_replay_request_abandon_files_error_total" + match: exact + - pattern: "shipper_disk_total_size_bytes" + match: exact + - pattern: "shipper_current_disk_usage_bytes" + match: exact + - pattern: "shipper_current_disk_usage_percentage" + match: exact + - pattern: "shipper_current_disk_unsent_file" + match: 
exact + - pattern: "shipper_current_disk_sent_file" + match: exact + - pattern: "shipper_disk_replay_request_current" + match: exact + - pattern: "shipper_disk_cleanup_failure_total" + match: exact + - pattern: "shipper_disk_cleanup_success_total" + match: exact + - pattern: "shipper_disk_cleanup_percentage" + match: exact + - pattern: "czo_" + match: prefix + + server: + mode: http + port: 8080 + profiling: false + reconnect_frequency: 16 + logging: + level: "info" + capture: true + database: + storage_path: /cloudzero/data + max_records: 1.5e+06 + cost_max_interval: 30m + observability_max_interval: 10m + compression_level: 8 + purge_rules: + metrics_older_than: 168h + lazy: true + percent: 20 + available_storage: + cloudzero: + api_key_path: /etc/config/secrets/value + send_interval: 10m + send_timeout: 120s + rotate_interval: 30m + host: api.cloudzero.com + http_max_retries: 10 + http_max_wait: 30s +--- +# Source: cloudzero-agent/templates/helmless-cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: cz-agent-helmless-cm + namespace: cz-agent + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: helmless + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +data: + values.yaml: |- + aggregator: + affinity: {} + cloudzero: + httpMaxRetries: 10 + httpMaxWait: 30s + rotateInterval: 30m + sendInterval: 10m + sendTimeout: 120s + collector: + port: 8080 + resources: + limits: + cpu: "" + memory: "" + requests: + cpu: "" + memory: "" + database: + compressionLevel: 8 + costMaxInterval: 30m + emptyDir: + enabled: true + sizeLimit: "" + maxRecords: 1500000 + observabilityMaxInterval: 10m + purgeRules: + lazy: true + metricsOlderThan: 168h + percent: 20 + image: + digest: null + pullPolicy: null + repository: null + tag: null + logging: + capture: true + level: info + mountRoot: /cloudzero + name: null + nodeSelector: {} + profiling: false + reconnectFrequency: 16 + shipper: + port: 8081 + resources: + limits: + cpu: "" + memory: "" + requests: + cpu: "" + memory: "" + tolerations: [] + apiKey: '***' + cloudAccountId: "1234567890" + clusterName: istio-test-cluster + commonMetaLabels: {} + components: + agent: + annotations: {} + autoscaling: null + clusteredNode: + image: + repository: docker.io/grafana/alloy + tag: v1.11.3 + resources: + limits: + cpu: 100m + memory: 512Mi + requests: + cpu: 50m + memory: 256Mi + federatedNode: + annotations: {} + labels: {} + podAnnotations: {} + podLabels: {} + resources: + limits: + cpu: 1000m + memory: 1024Mi + requests: + cpu: 250m + memory: 512Mi + securityContext: {} + image: + repository: ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent + tag: 1.2.9 + labels: {} + mode: null + podAnnotations: {} + podDisruptionBudget: null + podLabels: {} + reloader: + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 50m + memory: 64Mi + securityContext: {} + resources: + limits: + cpu: 1000m + memory: 1024Mi + requests: + cpu: 250m + memory: 512Mi + securityContext: {} + aggregator: + annotations: {} + collector: + resources: + limits: + cpu: 2000m + memory: 1024Mi + requests: + cpu: 100m + memory: 64Mi + securityContext: {} + labels: {} + podAnnotations: {} + podDisruptionBudget: null + podLabels: {} + replicas: null + securityContext: {} + shipper: + resources: + limits: + cpu: 2000m + memory: 1024Mi + requests: + cpu: 100m + memory: 64Mi + securityContext: {} + tolerations: [] + miscellaneous: + configLoader: + 
annotations: {} + labels: {} + podAnnotations: {} + podLabels: {} + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: {} + helmless: + annotations: {} + labels: {} + podAnnotations: {} + podLabels: {} + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 50m + memory: 64Mi + securityContext: {} + initCert: + annotations: {} + labels: {} + podAnnotations: {} + podLabels: {} + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 50m + memory: 64Mi + securityContext: {} + prometheus: + image: + repository: quay.io/prometheus/prometheus + tag: null + prometheusReloader: + image: + repository: quay.io/prometheus-operator/prometheus-config-reloader + tag: v0.87.0 + validator: + annotations: {} + labels: {} + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 50m + memory: 64Mi + webhookServer: + annotations: {} + autoscaling: null + backfill: + annotations: {} + labels: {} + podAnnotations: {} + podLabels: {} + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + schedule: 0 */12 * * * + securityContext: {} + labels: {} + podAnnotations: {} + podDisruptionBudget: null + podLabels: {} + replicas: null + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: {} + configmapReload: + prometheus: + enabled: true + image: + digest: null + pullPolicy: null + repository: null + tag: null + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 50m + memory: 64Mi + defaults: + affinity: {} + annotations: {} + autoscaling: + enabled: false + maxReplicas: 10 + minReplicas: 1 + targetCPUUtilizationPercentage: 80 + targetMemoryUtilizationPercentage: 80 + dns: + config: {} + policy: null + image: + pullPolicy: IfNotPresent + pullSecrets: null + labels: {} + nodeSelector: {} + podDisruptionBudget: + enabled: true + minAvailable: 1 + priorityClassName: null + replicas: 3 + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + tolerations: [] + helmless: + resources: + limits: + cpu: 50m + memory: 32Mi + requests: + cpu: 5m + memory: 8Mi + host: api.cloudzero.com + imagePullSecrets: [] + initBackfillJob: + annotations: {} + enabled: true + image: + digest: null + pullPolicy: null + repository: null + tag: null + imagePullSecrets: null + nodeSelector: null + tolerations: [] + initCertJob: + annotations: {} + enabled: true + image: + digest: null + pullPolicy: null + repository: null + tag: null + imagePullSecrets: null + nodeSelector: {} + rbac: + clusterRoleBindingName: "" + clusterRoleName: "" + create: true + serviceAccountName: "" + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 50m + memory: 64Mi + tolerations: [] + initScrapeJob: + annotations: null + image: + digest: null + pullPolicy: null + repository: null + tag: null + imagePullSecrets: null + nodeSelector: null + tolerations: null + insightsController: + ConfigMapNameOverride: null + annotations: + enabled: false + patterns: + - .* + resources: + cronjobs: false + daemonsets: false + deployments: false + jobs: false + namespaces: true + nodes: false + pods: true + statefulsets: false + configurationMountPath: null + enabled: true + labels: + enabled: true + patterns: + - app.kubernetes.io/component + resources: + cronjobs: false + daemonsets: false + deployments: false + jobs: false + namespaces: true + nodes: false + pods: true + statefulsets: false + podAnnotations: {} + podLabels: 
{} + resources: + limits: + cpu: "" + memory: "" + requests: + cpu: "" + memory: "" + server: + affinity: {} + deploymentAnnotations: {} + healthCheck: + enabled: true + failureThreshold: 5 + initialDelaySeconds: 15 + path: /healthz + periodSeconds: 20 + port: 8443 + successThreshold: 1 + timeoutSeconds: 3 + idle_timeout: 120s + image: + pullPolicy: null + repository: null + tag: null + imagePullSecrets: [] + logging: + level: info + name: webhook-server + nodeSelector: {} + podAnnotations: {} + port: 8443 + read_timeout: 10s + reconnectFrequency: 16 + replicaCount: null + send_interval: 1m + send_timeout: 1m + tolerations: [] + write_timeout: 10s + service: + port: 443 + tls: + caBundle: "" + crt: "" + enabled: true + issuerSpec: {} + key: "" + mountPath: /etc/certs + secret: + create: true + name: "" + useCertManager: false + volumeMounts: [] + volumes: [] + webhooks: + annotations: {} + caInjection: null + namespaceSelector: {} + path: /validate + timeoutSeconds: 1 + integrations: + istio: + clusterID: istio-cluster-id-for-testing + enabled: true + jobConfigID: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + kubeStateMetrics: + affinity: {} + annotations: {} + automountServiceAccountToken: true + autosharding: + enabled: false + collectors: + - certificatesigningrequests + - configmaps + - cronjobs + - daemonsets + - deployments + - endpoints + - horizontalpodautoscalers + - ingresses + - jobs + - leases + - limitranges + - mutatingwebhookconfigurations + - namespaces + - networkpolicies + - nodes + - persistentvolumeclaims + - persistentvolumes + - poddisruptionbudgets + - pods + - replicasets + - replicationcontrollers + - resourcequotas + - secrets + - services + - statefulsets + - storageclasses + - validatingwebhookconfigurations + - volumeattachments + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + containers: [] + customLabels: {} + customResourceState: + config: {} + create: true + enabled: false + key: config.yaml + name: "" + dnsConfig: {} + dnsPolicy: ClusterFirst + enabled: true + env: [] + extraArgs: [] + extraManifests: [] + global: + imagePullSecrets: [] + imageRegistry: "" + hostNetwork: false + image: + pullPolicy: IfNotPresent + registry: registry.k8s.io + repository: kube-state-metrics/kube-state-metrics + tag: v2.17.0 + imagePullSecrets: [] + initContainers: [] + kubeRBACProxy: + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + enabled: false + extraArgs: [] + image: + pullPolicy: IfNotPresent + registry: quay.io + repository: brancz/kube-rbac-proxy + sha: "" + tag: v0.19.1 + resources: {} + volumeMounts: [] + kubeTargetVersionOverride: "" + kubeconfig: + enabled: false + labels: {} + livenessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: [] + scheme: http + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + metricAllowlist: [] + metricAnnotationsAllowList: [] + metricDenylist: [] + metricLabelsAllowlist: [] + nameOverride: null + namespaceOverride: "" + namespaces: "" + namespacesDenylist: "" + networkPolicy: + enabled: false + flavor: kubernetes + nodeSelector: {} + podAnnotations: {} + podDisruptionBudget: + minAvailable: 1 + podLabels: {} + podSecurityPolicy: + additionalVolumes: [] + annotations: {} + enabled: false + prometheus: + monitor: + additionalLabels: {} + annotations: {} + enabled: false + http: + bearerTokenFile: "" + bearerTokenSecret: {} + enableHttp2: false + 
honorLabels: false + interval: "" + metricRelabelings: [] + proxyUrl: "" + relabelings: [] + scheme: "" + scrapeTimeout: "" + tlsConfig: {} + jobLabel: "" + labelLimit: 0 + labelNameLengthLimit: 0 + labelValueLengthLimit: 0 + metrics: + bearerTokenFile: "" + bearerTokenSecret: {} + enableHttp2: false + honorLabels: false + interval: "" + metricRelabelings: [] + proxyUrl: "" + relabelings: [] + scheme: "" + scrapeTimeout: "" + tlsConfig: {} + namespace: "" + namespaceSelector: [] + podTargetLabels: [] + sampleLimit: 0 + selectorOverride: {} + targetLabels: [] + targetLimit: 0 + scrapeconfig: + additionalLabels: {} + annotations: {} + enableHttp2: false + enabled: false + honorLabels: true + jobName: kube-state-metrics + labelLimit: 0 + labelNameLengthLimit: 0 + labelValueLengthLimit: 0 + metricRelabelings: [] + proxyUrl: "" + relabelings: [] + sampleLimit: 0 + scheme: "" + scrapeInterval: "" + scrapeTimeout: "" + staticConfigLabels: {} + targetLimit: 0 + tlsConfig: {} + prometheusScrape: false + rbac: + create: true + extraRules: [] + useClusterRole: true + readinessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: [] + scheme: http + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + releaseLabel: false + releaseNamespace: false + replicas: 1 + resources: + limits: + cpu: 200m + memory: 512Mi + requests: + cpu: 100m + memory: 256Mi + revisionHistoryLimit: 10 + securityContext: + enabled: true + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + selectorOverride: {} + selfMonitor: + enabled: false + service: + annotations: {} + clusterIP: "" + ipDualStack: + enabled: false + ipFamilies: + - IPv6 + - IPv4 + ipFamilyPolicy: PreferDualStack + loadBalancerIP: "" + loadBalancerSourceRanges: [] + nodePort: 0 + port: 8080 + type: ClusterIP + serviceAccount: + annotations: {} + automountServiceAccountToken: true + create: true + imagePullSecrets: [] + startupProbe: + enabled: false + failureThreshold: 3 + httpGet: + httpHeaders: [] + scheme: http + initialDelaySeconds: 0 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + tolerations: [] + topologySpreadConstraints: [] + verticalPodAutoscaler: + controlledResources: [] + enabled: false + maxAllowed: {} + minAllowed: {} + volumeMounts: [] + volumes: [] + prometheusConfig: + configMapAnnotations: {} + configMapNameOverride: "" + configOverride: "" + globalScrapeInterval: 60s + outOfOrderTimeWindow: 5m + scrapeJobs: + additionalScrapeJobs: [] + aggregator: + enabled: true + scrapeInterval: 120s + cadvisor: + enabled: true + scrapeInterval: 60s + gpu: + enabled: false + scrapeInterval: 30s + kubeStateMetrics: + enabled: true + scrapeInterval: 60s + prometheus: + enabled: true + scrapeInterval: 120s + rbac: + create: true + region: us-east-1 + secretAnnotations: {} + server: + affinity: {} + agentMode: null + args: + - --config.file=/etc/config/prometheus/configmaps/prometheus.yml + - --web.enable-lifecycle + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + automountServiceAccountToken: null + clusterRoleNameOverride: null + deploymentAnnotations: {} + emptyDir: + sizeLimit: 8Gi + env: [] + fullnameOverride: null + image: + digest: null + pullPolicy: null + repository: null + tag: null + livenessProbe: + failureThreshold: 3 + initialDelaySeconds: 30 + periodSeconds: 15 + successThreshold: 1 + timeoutSeconds: 10 + livenessProbeFailureThreshold: null + 
livenessProbeInitialDelay: null + livenessProbePeriodSeconds: null + livenessProbeSuccessThreshold: null + livenessProbeTimeout: null + logging: + level: null + name: server + nodeSelector: {} + persistentVolume: + accessModes: + - ReadWriteOnce + annotations: {} + enabled: false + existingClaim: "" + labels: {} + mountPath: /data + selector: {} + size: 8Gi + storageClass: "" + subPath: "" + volumeBindingMode: null + volumeName: null + podAnnotations: {} + podLabels: {} + priorityClassName: null + readinessProbe: + failureThreshold: 3 + initialDelaySeconds: 30 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 4 + readinessProbeFailureThreshold: null + readinessProbeInitialDelay: null + readinessProbePeriodSeconds: null + readinessProbeSuccessThreshold: null + readinessProbeTimeout: null + resources: + limits: + cpu: "" + memory: "" + requests: + cpu: "" + memory: "" + serviceAccount: + name: null + terminationGracePeriodSeconds: 300 + tolerations: [] + topologySpreadConstraints: [] + useExistingClusterRoleName: null + serverConfig: + containerSecretFileName: value + containerSecretFilePath: /etc/config/secrets/ + serviceAccount: + annotations: {} + automountServiceAccountToken: null + create: true + name: "" + validator: + image: + digest: null + pullPolicy: null + pullSecrets: null + repository: null + tag: null + name: env-validator + resources: + limits: + cpu: "" + memory: "" + requests: + cpu: "" + memory: "" + serviceEndpoints: + kubeStateMetrics: null +--- +# Source: cloudzero-agent/templates/validator-cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: validator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + name: cz-agent-validator-configuration + namespace: cz-agent + +data: + validator.yml: |- + versions: + chart_version: 1.1.0-dev + agent_version: + + logging: + level: info + location: ./cloudzero-agent-validator.log + + deployment: + account_id: 1234567890 + cluster_name: istio-test-cluster + region: us-east-1 + + cloudzero: + host: https://api.cloudzero.com + credentials_file: /etc/config/secrets/value + disable_telemetry: false + + services: + namespace: cz-agent + insights_service: cz-agent-cz-webhook-svc + collector_service: cz-agent-cz-aggregator + + integrations: + istio: + # Use explicit value only (no fallback) so validator can distinguish user intent + # The validator receives clusterName separately via deployment.cluster_name + cluster_id: "istio-cluster-id-for-testing" + + prometheus: + kube_state_metrics_service_endpoint: http://cz-agent-cz-ksm.cz-agent.svc.cluster.local:8080 + executable: /bin/prometheus + kube_metrics: + - kube_node_info + - kube_node_status_capacity + - kube_pod_container_resource_limits + - kube_pod_container_resource_requests + - kube_pod_labels + - kube_pod_info + configurations: + - /etc/prometheus/prometheus.yml + - /etc/config/prometheus/configmaps/prometheus.yml + + diagnostics: + stages: + - name: pre-start + enforce: true + checks: + - api_key_valid + - name: post-start + enforce: false + checks: + - k8s_version + - k8s_namespace + - k8s_provider + - kube_state_metrics_reachable + - prometheus_version + - scrape_cfg + - webhook_server_reachable + - istio_xcluster_lb + - name: pre-stop + enforce: false + checks: + - name: config-load + enforce: false + checks: + - api_key_valid + - k8s_version + - k8s_namespace + - k8s_provider + - 
kube_state_metrics_reachable + - agent_settings +--- +# Source: cloudzero-agent/templates/webhook-cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: webhook-server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + name: cz-agent-cz-webhook-configuration + namespace: cz-agent + +data: + server-config.yaml: |- + cloud_account_id: 1234567890 + region: us-east-1 + cluster_name: istio-test-cluster + destination: 'http://cz-agent-cz-aggregator.cz-agent.svc.cluster.local/collector' + logging: + level: info + remote_write: + send_interval: 1m + max_bytes_per_send: 500000 + send_timeout: 1m + max_retries: 3 + k8s_client: + timeout: 30s + database: + retention_time: 24h + cleanup_interval: 3h + batch_update_size: 500 + api_key_path: /etc/config/secrets/value + certificate: + key: /etc/certs/tls.key + cert: /etc/certs/tls.crt + server: + namespace: cz-agent + domain: cz-agent-cz-webhook + port: 8443 + read_timeout: 10s + write_timeout: 10s + idle_timeout: 120s + reconnect_frequency: 16 + filters: + labels: + enabled: true + patterns: + - app.kubernetes.io/component + resources: + cronjobs: false + daemonsets: false + deployments: false + jobs: false + namespaces: true + nodes: false + pods: true + statefulsets: false + annotations: + enabled: false + patterns: + - .* + resources: + cronjobs: false + daemonsets: false + deployments: false + jobs: false + namespaces: true + nodes: false + pods: true + statefulsets: false +--- +# Source: cloudzero-agent/charts/kubeStateMetrics/templates/role.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: ksm + app.kubernetes.io/name: ksm + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" + name: cz-agent-cz-ksm +rules: + +- apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - configmaps + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - cronjobs + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - daemonsets + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - deployments + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - endpoints + verbs: ["list", "watch"] + +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - ingresses + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - jobs + verbs: ["list", "watch"] + +- apiGroups: ["coordination.k8s.io"] + resources: + - leases + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - limitranges + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - namespaces + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumeclaims + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumes + verbs: ["list", "watch"] + +- 
apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - pods + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - replicasets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - replicationcontrollers + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - resourcequotas + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - secrets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - services + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - validatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - volumeattachments + verbs: ["list", "watch"] +--- +# Source: cloudzero-agent/templates/agent-clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + name: cz-agent-cz-server +rules: + - apiGroups: + - "apps" + resources: + - "deployments" + - "statefulsets" + - "daemonsets" + verbs: + - "get" + - "list" + - apiGroups: + - "batch" + resources: + - "jobs" + - "cronjobs" + verbs: + - "get" + - "list" + - apiGroups: + - "" + resources: + - endpoints + - namespaces + - nodes + - nodes/proxy + - nodes/metrics + - services + - pods + - persistentvolumes + - persistentvolumeclaims + verbs: + - get + - list + - watch + - apiGroups: + - "extensions" + - "networking.k8s.io" + resources: + - ingresses/status + - ingresses + - ingressclasses + verbs: + - get + - list + - watch + - apiGroups: + - "gateway.networking.k8s.io" + resources: + - gatewayclasses + verbs: + - get + - list + - watch + - apiGroups: + - "storage.k8s.io" + resources: + - storageclasses + verbs: + - get + - list + - watch + - apiGroups: + - "discovery.k8s.io" + resources: + - endpointslices + verbs: + - get + - list + - watch + - nonResourceURLs: + - "/metrics" + verbs: + - get +--- +# Source: cloudzero-agent/templates/init-cert-clusterrole.yaml +# ClusterRole for the init-cert Job +# +# This ClusterRole grants the init-cert job (which runs during +# deployment/upgrade) permission to manage TLS certificates and update the +# ValidatingWebhookConfiguration. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: init-cert + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + annotations: + checkov.io/skip_1: CKV_K8S_155 + name: cz-agent-cz-webhook-init-cert + +rules: + # Read/Update access to Kubernetes secret containing the TLS certificate for + # the webhook server. 
+ - apiGroups: [""] # Empty string means "core" API group (v1) - contains + resources: ["secrets"] + resourceNames: [cz-agent-cz-webhook-tls] + verbs: [ + # "get" - Read existing TLS certificate to determine if regeneration is needed + "get", + # "patch" - Update TLS secret with newly generated certificate data + "patch" + ] + + # Read/Update the Validating Webhook Configuration + # + # When regenerating the TLS certificate, the init-cert job updates the + # ValidatingWebhookConfiguration to use the CA bundle. This allows us to use + # our own self-signed certificate in the CloudZero Webhook Server, but only + # with this specific ValidatingWebhookConfiguration. + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + resourceNames: + - cz-agent-cz-webhook + verbs: [ + # "get" - Read current webhook configuration to check if caBundle updates are needed + "get", + # "patch" - Update caBundle field with new certificate data for webhook trust + "patch" + ] +--- +# Source: cloudzero-agent/charts/kubeStateMetrics/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: ksm + app.kubernetes.io/name: ksm + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" + name: cz-agent-cz-ksm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cz-agent-cz-ksm +subjects: +- kind: ServiceAccount + name: cz-agent-cz-ksm + namespace: cz-agent +--- +# Source: cloudzero-agent/templates/agent-clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-cz-server +subjects: + - kind: ServiceAccount + name: cz-agent-cz-server + namespace: cz-agent +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cz-agent-cz-server +--- +# Source: cloudzero-agent/templates/init-cert-clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: init-cert + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-cz-webhook-init-cert +subjects: + - kind: ServiceAccount + name: cz-agent-cz-webhook-init-cert + namespace: cz-agent +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cz-agent-cz-webhook-init-cert +--- +# Source: cloudzero-agent/charts/kubeStateMetrics/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: cz-agent-cz-ksm + namespace: cz-agent + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: ksm + app.kubernetes.io/name: ksm + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" + annotations: +spec: + type: "ClusterIP" + ports: + - name: "http" + protocol: TCP + port: 8080 + targetPort: 8080 + + selector: + app.kubernetes.io/name: ksm + app.kubernetes.io/instance: cz-agent +--- +# Source: 
cloudzero-agent/templates/aggregator-service.yaml +apiVersion: v1 +kind: Service +metadata: + namespace: cz-agent + name: cz-agent-cz-aggregator + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-cz-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +spec: + selector: + app.kubernetes.io/name: aggregator + app.kubernetes.io/instance: cz-agent + ports: + - protocol: TCP + port: 80 + targetPort: 8080 + type: ClusterIP +--- +# Source: cloudzero-agent/templates/webhook-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: cz-agent-cz-webhook + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: webhook-server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + annotations: + nginx.ingress.kubernetes.io/ssl-redirect: "false" + namespace: cz-agent +spec: + type: ClusterIP + ports: + - port: 443 + targetPort: 8443 + name: http + appProtocol: https + selector: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: cz-agent +--- +# Source: cloudzero-agent/charts/kubeStateMetrics/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cz-agent-cz-ksm + namespace: cz-agent + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: ksm + app.kubernetes.io/name: ksm + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" +spec: + selector: + matchLabels: + app.kubernetes.io/name: ksm + app.kubernetes.io/instance: cz-agent + replicas: 1 + strategy: + type: RollingUpdate + revisionHistoryLimit: 10 + template: + metadata: + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: ksm + app.kubernetes.io/name: ksm + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" + spec: + automountServiceAccountToken: true + hostNetwork: false + serviceAccountName: cz-agent-cz-ksm + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + dnsPolicy: ClusterFirst + containers: + - name: ksm + args: + - --port=8080 + - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + imagePullPolicy: IfNotPresent + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.17.0 + ports: + - containerPort: 8080 + name: "http" + livenessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /livez + port: 8080 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + readinessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /readyz + port: 8081 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + resources: + limits: + cpu: 200m + memory: 512Mi + requests: + 
cpu: 100m + memory: 256Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true +--- +# Source: cloudzero-agent/templates/agent-deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-cz-server + namespace: cz-agent +spec: + selector: + matchLabels: + app.kubernetes.io/name: server + app.kubernetes.io/instance: cz-agent + replicas: 1 + template: + metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + spec: + + serviceAccountName: cz-agent-cz-server + initContainers: + - name: env-validator-copy + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.9" + imagePullPolicy: "IfNotPresent" + env: + - name: K8S_NAMESPACE + value: cz-agent + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] + command: + - /app/cloudzero-agent-validator + - install + - --destination + - /checks/bin/cloudzero-agent-validator + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 50m + memory: 64Mi + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + volumeMounts: + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + - name: lifecycle-volume + mountPath: /checks/bin/ + - name: validator-config-volume + mountPath: /checks/config/ + - name: env-validator-run + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.9" + imagePullPolicy: "IfNotPresent" + env: + - name: K8S_NAMESPACE + value: cz-agent + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] + command: + - /checks/bin/cloudzero-agent-validator + - diagnose + - pre-start + - -f + - /checks/config/validator.yml + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 50m + memory: 64Mi + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + volumeMounts: + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + - name: lifecycle-volume + mountPath: /checks/bin/ + - name: validator-config-volume + mountPath: /checks/config/ + containers: + # ConfigMap reloader sidecar for Prometheus/Alloy + - name: cloudzero-agent-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.87.0" + imagePullPolicy: "IfNotPresent" + args: + - --watched-dir=/etc/config + - --reload-url=http://127.0.0.1:9090/-/reload + - --listen-address=:8080 + ports: + - containerPort: 8080 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 + 
successThreshold: 1 + readinessProbe: + httpGet: + path: /healthz + port: 8080 + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 4 + failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 50m + memory: 64Mi + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + # Prometheus server container + - name: cloudzero-agent-server + + image: "quay.io/prometheus/prometheus:v3.7.3" + imagePullPolicy: "IfNotPresent" + lifecycle: + postStart: + exec: + command: + - /checks/cloudzero-agent-validator + - diagnose + - post-start + - -f + - /checks/app/config/validator.yml + preStop: + exec: + command: + - /checks/cloudzero-agent-validator + - diagnose + - pre-stop + - -f + - /checks/app/config/validator.yml + args: + + - --config.file=/etc/config/prometheus/configmaps/prometheus.yml + - --web.enable-lifecycle + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --agent + - --log.level=info + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 + failureThreshold: 3 + successThreshold: 1 + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: 1000m + memory: 1024Mi + requests: + cpu: 250m + memory: 512Mi + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + volumeMounts: + - name: config-volume + mountPath: /etc/config/prometheus/configmaps/ + - name: cloudzero-agent-storage-volume + mountPath: /data + subPath: "" + - name: lifecycle-volume + mountPath: /checks/ + - name: validator-config-volume + mountPath: /checks/app/config/ + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + + + + + + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: cz-agent-configuration + - name: validator-config-volume + configMap: + name: cz-agent-validator-configuration + - name: lifecycle-volume + emptyDir: {} + - name: cloudzero-api-key + secret: + secretName: cz-agent-api-key + - name: cloudzero-agent-storage-volume + emptyDir: + sizeLimit: 8Gi +--- +# Source: cloudzero-agent/templates/aggregator-deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cz-agent-cz-aggregator + namespace: cz-agent + # Annotations: Merge default annotations with aggregator-specific annotations + # Enables custom metadata for monitoring, backup policies, and operational tooling + + # Labels: Combine standard aggregator labels with user-defined metadata labels + # Provides consistent labeling for service discovery, monitoring, and resource management + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev +spec: + selector: + matchLabels: + app.kubernetes.io/name: aggregator + app.kubernetes.io/instance: cz-agent + replicas: 3 + template: + metadata: + annotations: + checksum/config: 
DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + spec: + serviceAccountName: cz-agent-cz-server + + containers: + - name: cz-agent-cz-aggregator-collector + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.9" + imagePullPolicy: "IfNotPresent" + ports: + - name: port-collector + containerPort: 8080 + command: ["/app/cloudzero-collector", "-config", "/cloudzero/config/config.yml"] + env: + - name: SERVER_PORT + value: "8080" + volumeMounts: + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + - name: aggregator-config-volume + mountPath: /cloudzero/config + readOnly: true + - name: aggregator-persistent-storage + mountPath: /cloudzero/data + readinessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + resources: + limits: + cpu: 2000m + memory: 1024Mi + requests: + cpu: 100m + memory: 64Mi + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + + - name: cz-agent-cz-aggregator-shipper + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.9" + imagePullPolicy: "IfNotPresent" + ports: + - name: port-shipper + containerPort: 8081 + command: ["/app/cloudzero-shipper", "-config", "/cloudzero/config/config.yml"] + env: + - name: SERVER_PORT + value: "8081" + volumeMounts: + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + - name: aggregator-config-volume + mountPath: /cloudzero/config + readOnly: true + - name: aggregator-persistent-storage + mountPath: /cloudzero/data + readinessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + resources: + limits: + cpu: 2000m + memory: 1024Mi + requests: + cpu: 100m + memory: 64Mi + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + + + + + + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: cz-agent-configuration + - name: validator-config-volume + configMap: + name: cz-agent-validator-configuration + - name: lifecycle-volume + emptyDir: {} + - name: cloudzero-api-key + secret: + secretName: cz-agent-api-key + - name: aggregator-config-volume + configMap: + name: cz-agent-cz-aggregator + - name: aggregator-persistent-storage + emptyDir: + {} +--- +# Source: cloudzero-agent/templates/webhook-deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cz-agent-cz-webhook + namespace: cz-agent + # Standard webhook server labels for consistent resource identification + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: webhook-server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + # Deployment annotations: Merge defaults with webhook-specific annotations + # Supports monitoring, backup policies, and operational 
tooling integration + +spec: + replicas: 3 + selector: + matchLabels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: cz-agent + template: + metadata: + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: webhook-server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + annotations: + checksum/config: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + traffic.sidecar.istio.io/excludeInboundPorts: "8443" + spec: + serviceAccountName: cz-agent-cz-server + + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: webhook-server + topologyKey: kubernetes.io/hostname + weight: 100 + + + + + containers: + - name: webhook-server + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.9" + imagePullPolicy: "IfNotPresent" + command: + - /app/cloudzero-webhook + args: + - -config + - "/etc/cloudzero-agent-insights/server-config.yaml" + ports: + - containerPort: 8443 + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + volumeMounts: + - name: insights-server-config + mountPath: /etc/cloudzero-agent-insights + - name: tls-certs + mountPath: /etc/certs + readOnly: true + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + livenessProbe: + httpGet: + scheme: HTTPS + path: /healthz + port: 8443 + initialDelaySeconds: 15 + periodSeconds: 20 + timeoutSeconds: 3 + successThreshold: 1 + failureThreshold: 5 + readinessProbe: + httpGet: + scheme: HTTPS + path: /healthz + port: 8443 + initialDelaySeconds: 15 + periodSeconds: 20 + timeoutSeconds: 3 + successThreshold: 1 + failureThreshold: 5 + volumes: + - name: insights-server-config + configMap: + name: cz-agent-cz-webhook-configuration + - name: tls-certs + secret: + secretName: cz-agent-cz-webhook-tls + - name: cloudzero-api-key + secret: + secretName: cz-agent-api-key +--- +# Source: cloudzero-agent/templates/backfill-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: cz-agent-backfill-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA + namespace: cz-agent + + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: backfill + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + job-category: onetime + job-type: backfill +spec: + template: + metadata: + name: cz-agent-backfill-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA + namespace: cz-agent + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: backfill + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + job-category: onetime + job-type: backfill + annotations: + traffic.sidecar.istio.io/excludeOutboundPorts: "443" + spec: + serviceAccountName: cz-agent-cz-server + restartPolicy: OnFailure + + + + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: init-scrape + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.9" + imagePullPolicy: "IfNotPresent" + command: + - /app/cloudzero-webhook + args: + - -config + - 
"/etc/cloudzero-agent-insights/server-config.yaml" + - -backfill + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + volumeMounts: + - name: insights-server-config + mountPath: /etc/cloudzero-agent-insights + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + volumes: + - name: insights-server-config + configMap: + name: cz-agent-cz-webhook-configuration + - name: tls-certs + secret: + secretName: cz-agent-cz-webhook-tls + - name: cloudzero-api-key + secret: + secretName: cz-agent-api-key +--- +# Source: cloudzero-agent/templates/config-loader-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: cz-agent-confload-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + namespace: cz-agent + annotations: + checksum/values: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: validator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev +spec: + template: + metadata: + name: cz-agent-confload-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + namespace: cz-agent + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: validator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + spec: + + serviceAccountName: cz-agent-cz-server + restartPolicy: OnFailure + + + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: run-validator + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.9" + imagePullPolicy: "IfNotPresent" + env: + - name: K8S_NAMESPACE + value: cz-agent + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] + command: + - /app/cloudzero-cluster-config + - load + - --account + - "1234567890" + - --region + - us-east-1 + - --cluster-name + - istio-test-cluster + - --release-name + - cz-agent + - --chart-version + - 1.1.0-dev + - --agent-version + - "1.2.9" + - --values-file + - /cloudzero/config/values/values.yaml + - --config-validator + - /cloudzero/config/validator/validator.yml + - --config-webhook + - /etc/cloudzero-agent-insights/server-config.yaml + - --config-aggregator + - /cloudzero/config/config.yml + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + volumeMounts: + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + - name: config-values + mountPath: /cloudzero/config/values # values.yaml + - name: config-volume + mountPath: /etc/config/prometheus/configmaps/ + - name: config-validator + mountPath: /cloudzero/config/validator # validator.yml + - name: config-webhook + mountPath: /etc/cloudzero-agent-insights # server-config.yaml + - name: config-aggregator + mountPath: /cloudzero/config # config.yaml + - name: aggregator-persistent-storage + mountPath: /cloudzero/data + volumes: + - name: config-values + configMap: + name: cz-agent-helmless-cm + - name: 
config-volume + configMap: + name: cz-agent-configuration + - name: config-validator + configMap: + name: cz-agent-validator-configuration + - name: config-webhook + configMap: + name: cz-agent-cz-webhook-configuration + - name: config-aggregator + configMap: + name: cz-agent-cz-aggregator + - name: cloudzero-api-key + secret: + secretName: cz-agent-api-key + - name: aggregator-persistent-storage + emptyDir: {} +--- +# Source: cloudzero-agent/templates/helmless-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: cz-agent-helmless-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + namespace: cz-agent + + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: helmless + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev +spec: + template: + metadata: + name: cz-agent-helmless-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + namespace: cz-agent + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: helmless + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + spec: + restartPolicy: OnFailure + + + + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: helmless + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.9" + imagePullPolicy: "IfNotPresent" + command: + - /app/cloudzero-helmless + args: + - --configured + - /etc/config/values/values.yaml + - --output + - "-" + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 50m + memory: 64Mi + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + volumeMounts: + - name: helmless-cm + mountPath: /etc/config/values + readOnly: true + volumes: + - name: helmless-cm + configMap: + name: cz-agent-helmless-cm +--- +# Source: cloudzero-agent/templates/init-cert-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: cz-agent-init-cert-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + namespace: cz-agent + + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: init-cert + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev +spec: + template: + metadata: + name: cz-agent-init-cert-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: init-cert + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + spec: + + + + serviceAccountName: cz-agent-cz-webhook-init-cert + restartPolicy: Never + + + + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: init-cert + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.9" + imagePullPolicy: "IfNotPresent" + command: ["/app/cloudzero-certifik8s"] + workingDir: /var/tmp + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 50m + memory: 64Mi + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + args: + - "generate" + - "--secret-name=cz-agent-cz-webhook-tls" + - "--namespace=cz-agent" + - "--service-name=cz-agent-cz-webhook" + - "--webhook-name=cz-agent-cz-webhook" + - "--enable-labels" +--- +# Source: cloudzero-agent/templates/backfill-job.yaml +apiVersion: batch/v1 
+kind: CronJob +metadata: + name: cz-agent-backfill + namespace: cz-agent + + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: backfill + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + job-category: cronjob + job-type: backfill +spec: + schedule: "0 */12 * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + template: + metadata: + name: cz-agent-backfill + namespace: cz-agent + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: backfill + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + job-category: cronjob + job-type: backfill + annotations: + traffic.sidecar.istio.io/excludeOutboundPorts: "443" + spec: + serviceAccountName: cz-agent-cz-server + restartPolicy: OnFailure + + + + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: init-scrape + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.9" + imagePullPolicy: "IfNotPresent" + command: + - /app/cloudzero-webhook + args: + - -config + - "/etc/cloudzero-agent-insights/server-config.yaml" + - -backfill + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + volumeMounts: + - name: insights-server-config + mountPath: /etc/cloudzero-agent-insights + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + volumes: + - name: insights-server-config + configMap: + name: cz-agent-cz-webhook-configuration + - name: tls-certs + secret: + secretName: cz-agent-cz-webhook-tls + - name: cloudzero-api-key + secret: + secretName: cz-agent-api-key +--- +# Source: cloudzero-agent/templates/aggregator-destinationrule.yaml +apiVersion: networking.istio.io/v1 +kind: DestinationRule +metadata: + name: cz-agent-cz-aggregator-cluster-local + namespace: cz-agent + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-cz-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +spec: + # The service FQDN we are configuring routing for + host: cz-agent-cz-aggregator.cz-agent.svc.cluster.local + # Define a subset that matches only pods in the local cluster + subsets: + - name: local-cluster + labels: + # Istio automatically adds this label to all pods with the cluster ID + # This ensures we only route to pods in our local cluster + topology.istio.io/cluster: istio-cluster-id-for-testing +--- +# Source: cloudzero-agent/templates/webhook-validating-config.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: cz-agent-cz-webhook + namespace: cz-agent + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: webhook-server + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + # Certificate management annotations for automatic TLS certificate injection + # When cert-manager is enabled, automatically injects CA bundle for webhook TLS validation + 
+webhooks: + - name: cz-agent-cz-webhook.cz-agent.svc + # Namespace selector: Controls which namespaces trigger webhook processing + # Supports inclusion/exclusion patterns for fine-grained resource selection + namespaceSelector: + {} + # Failure policy: 'Ignore' ensures fail-open behavior - never blocks cluster operations + # Critical for production stability: webhook failures don't impact workload deployments + failurePolicy: Ignore + rules: + - operations: [ "CREATE", "UPDATE", "DELETE" ] + apiGroups: ["*"] + apiVersions: ["*"] + resources: + + - deployments + - statefulsets + - daemonsets + - replicasets + - pods + - namespaces + - nodes + - services + - storageclasses + - persistentvolumes + - persistentvolumeclaims + - jobs + - cronjobs + - customresourcedefinitions + - ingresses + - ingressclasses + - gateways + - gatewayclasses + scope: "*" + clientConfig: + service: + namespace: cz-agent + name: cz-agent-cz-webhook + path: /validate + port: 443 + admissionReviewVersions: ["v1"] + sideEffects: None + timeoutSeconds: 1 +--- +# Source: cloudzero-agent/templates/aggregator-virtualservice.yaml +apiVersion: networking.istio.io/v1 +kind: VirtualService +metadata: + name: cz-agent-cz-aggregator-cluster-local + namespace: cz-agent + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-cz-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v3.7.3 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +spec: + # The service FQDN we are configuring routing for + hosts: + - cz-agent-cz-aggregator.cz-agent.svc.cluster.local + # Route all HTTP traffic to the local-cluster subset + http: + - route: + - destination: + host: cz-agent-cz-aggregator.cz-agent.svc.cluster.local + subset: local-cluster + weight: 100 diff --git a/tests/helm/template/manifest.yaml b/tests/helm/template/manifest.yaml index 481e74e89..b8c4cc0c7 100644 --- a/tests/helm/template/manifest.yaml +++ b/tests/helm/template/manifest.yaml @@ -1120,7 +1120,6 @@ data: replicaCount: null send_interval: 1m send_timeout: 1m - suppressIstioAnnotations: false tolerations: [] write_timeout: 10s service: @@ -1144,6 +1143,10 @@ data: namespaceSelector: {} path: /validate timeoutSeconds: 1 + integrations: + istio: + clusterID: null + enabled: null jobConfigID: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E kubeStateMetrics: affinity: {} @@ -1563,6 +1566,12 @@ data: insights_service: cz-agent-cz-webhook-svc collector_service: cz-agent-cz-aggregator + integrations: + istio: + # Use explicit value only (no fallback) so validator can distinguish user intent + # The validator receives clusterName separately via deployment.cluster_name + cluster_id: + prometheus: kube_state_metrics_service_endpoint: http://cz-agent-cz-ksm.cz-agent.svc.cluster.local:8080 executable: /bin/prometheus @@ -1593,6 +1602,7 @@ data: - prometheus_version - scrape_cfg - webhook_server_reachable + - istio_xcluster_lb - name: pre-stop enforce: false checks: @@ -2250,6 +2260,14 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] command: - /app/cloudzero-agent-validator - install @@ -2285,6 +2303,14 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + 
fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] command: - /checks/bin/cloudzero-agent-validator - diagnose @@ -2655,7 +2681,6 @@ spec: helm.sh/chart: cloudzero-agent-1.1.0-dev annotations: checksum/config: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E - sidecar.istio.io/inject: "false" spec: serviceAccountName: cz-agent-cz-server @@ -2872,6 +2897,14 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: ISTIO_AMBIENT_REDIRECTION + valueFrom: + fieldRef: + fieldPath: metadata.annotations['ambient.istio.io/redirection'] + - name: ISTIO_TOPOLOGY_CLUSTER + valueFrom: + fieldRef: + fieldPath: metadata.labels['topology.istio.io/cluster'] command: - /app/cloudzero-cluster-config - load
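For reference on what the new istio_xcluster_lb pre-start check has to work with: the hunks above wire the pod's topology.istio.io/cluster label and ambient.istio.io/redirection annotation into the validator containers as ISTIO_TOPOLOGY_CLUSTER and ISTIO_AMBIENT_REDIRECTION via the downward API, and the validator configuration gains integrations.istio.cluster_id. Below is only a minimal sketch of the comparison such a check could make, under the assumption that it boils down to matching the pod's cluster label against the configured cluster ID; the names used (CheckClusterLocalRouting, Result, main) are hypothetical and do not describe the actual provider in app/domain/diagnostic/istio.

// Hypothetical, self-contained illustration of the istio_xcluster_lb idea
// (standard library only): compare the pod's Istio cluster label with the
// cluster ID used to pin the aggregator DestinationRule subset.
package main

import (
	"fmt"
	"os"
)

// Result is a hypothetical pass/fail outcome with a human-readable reason.
type Result struct {
	Passed bool
	Reason string
}

// CheckClusterLocalRouting reads ISTIO_TOPOLOGY_CLUSTER (injected via the
// downward API in the hunks above) and compares it against the
// integrations.istio.cluster_id value from the validator configuration.
func CheckClusterLocalRouting(configuredClusterID string) Result {
	podCluster := os.Getenv("ISTIO_TOPOLOGY_CLUSTER")

	switch {
	case podCluster == "":
		// No topology.istio.io/cluster label on this pod: nothing to validate.
		return Result{Passed: true, Reason: "pod is not labeled with topology.istio.io/cluster; skipping"}
	case configuredClusterID == "":
		// Istio labeled the pod, but integrations.istio.clusterID was not set,
		// so a cluster-local DestinationRule subset cannot match any pods.
		return Result{Passed: false, Reason: "integrations.istio.clusterID is unset but pod is labeled " + podCluster}
	case podCluster != configuredClusterID:
		return Result{Passed: false, Reason: fmt.Sprintf("configured cluster ID %q does not match pod label %q", configuredClusterID, podCluster)}
	default:
		return Result{Passed: true, Reason: "cluster-local routing subset matches this cluster"}
	}
}

func main() {
	// In the rendered istio.yaml above, the DestinationRule subset is pinned to
	// topology.istio.io/cluster: istio-cluster-id-for-testing.
	fmt.Printf("%+v\n", CheckClusterLocalRouting("istio-cluster-id-for-testing"))
}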