diff --git a/controlplane/provisioner/lakekeeper_k8s.go b/controlplane/provisioner/lakekeeper_k8s.go index 67ff156e..97a0a728 100644 --- a/controlplane/provisioner/lakekeeper_k8s.go +++ b/controlplane/provisioner/lakekeeper_k8s.go @@ -32,6 +32,20 @@ func isValidOrgIDLabel(orgID string) bool { // by default but we co-locate the CRs to keep RBAC tight. const LakekeeperNamespace = "lakekeeper" +// Per-org Lakekeeper pod resource shape. Requests == limits → Guaranteed QoS. +// Lakekeeper is a light Rust REST catalog (mostly idle metadata ops), so a +// modest fixed shape is plenty; bump these if a tenant needs more headroom. +const ( + lakekeeperPodCPU = "500m" + lakekeeperPodMemory = "512Mi" +) + +// lakekeeperMetricsPort is the operator's default metrics container port +// (lakekeeper-operator getMetricsPort default). We don't set +// spec.server.metricsPort, so this is where the metrics endpoint listens and +// the value advertised to vmagent via the prometheus.io/port pod annotation. +const lakekeeperMetricsPort = "9000" + // lakekeeperGVR matches the operator at /Users/james/opt/ph/lakekeeper-operator. var lakekeeperGVR = schema.GroupVersionResource{ Group: "lakekeeper.k8s.lakekeeper.io", @@ -327,6 +341,39 @@ func (c *LakekeeperK8sClient) EnsureCR(ctx context.Context, spec LakekeeperCRSpe }, }, }, + // Pin a fixed pod shape with requests == limits → Guaranteed QoS. + // The managed-warehouse clusters require it; an unbounded catalog + // pod runs BestEffort and is first evicted under node pressure. + // Lakekeeper is a light Rust REST catalog, so a modest shape is + // plenty — tune the consts if a tenant needs more. + "resources": map[string]interface{}{ + "requests": map[string]interface{}{ + "cpu": lakekeeperPodCPU, + "memory": lakekeeperPodMemory, + }, + "limits": map[string]interface{}{ + "cpu": lakekeeperPodCPU, + "memory": lakekeeperPodMemory, + }, + }, + // Stamp Prometheus scrape annotations onto the operator-managed + // pods. The managed-warehouse clusters have no prometheus-operator; + // vmagent discovers targets by pod annotation (kubernetes_sd), and + // the Lakekeeper CRD exposes no other pod-metadata hook — so without + // this the per-org catalog pods are never scraped. Lakekeeper serves + // metrics on the operator's "metrics" container port (its + // getMetricsPort default = lakekeeperMetricsPort). Requires the + // spec.podMetadata passthrough from PostHog's operator fork (branch + // posthog/serviceaccountname); on an operator without it the CRD + // prunes the field and these annotations are dropped — a safe no-op + // until the new operator image ships. + "podMetadata": map[string]interface{}{ + "annotations": map[string]interface{}{ + "prometheus.io/scrape": "true", + "prometheus.io/port": lakekeeperMetricsPort, + "prometheus.io/path": "/metrics", + }, + }, }, }, } diff --git a/controlplane/provisioner/lakekeeper_k8s_test.go b/controlplane/provisioner/lakekeeper_k8s_test.go index 1a3ed15e..0f50defe 100644 --- a/controlplane/provisioner/lakekeeper_k8s_test.go +++ b/controlplane/provisioner/lakekeeper_k8s_test.go @@ -225,6 +225,27 @@ func TestEnsureCR_CreateAndShape(t *testing.T) { if pg["host"] != spec.PGHost || pg["database"] != spec.PGDatabase { t.Errorf("pg host/db = %v/%v, want %s/%s", pg["host"], pg["database"], spec.PGHost, spec.PGDatabase) } + // Resources are pinned with requests == limits (Guaranteed QoS). + res := specMap["resources"].(map[string]interface{}) + reqs := res["requests"].(map[string]interface{}) + lims := res["limits"].(map[string]interface{}) + if reqs["cpu"] != lakekeeperPodCPU || lims["cpu"] != lakekeeperPodCPU { + t.Errorf("cpu req/lim = %v/%v, want %s/%s", reqs["cpu"], lims["cpu"], lakekeeperPodCPU, lakekeeperPodCPU) + } + if reqs["memory"] != lakekeeperPodMemory || lims["memory"] != lakekeeperPodMemory { + t.Errorf("memory req/lim = %v/%v, want %s/%s", reqs["memory"], lims["memory"], lakekeeperPodMemory, lakekeeperPodMemory) + } + // Prometheus scrape annotations are stamped onto the pod via podMetadata. + ann := specMap["podMetadata"].(map[string]interface{})["annotations"].(map[string]interface{}) + if ann["prometheus.io/scrape"] != "true" { + t.Errorf("prometheus.io/scrape = %v, want true", ann["prometheus.io/scrape"]) + } + if ann["prometheus.io/port"] != lakekeeperMetricsPort { + t.Errorf("prometheus.io/port = %v, want %s", ann["prometheus.io/port"], lakekeeperMetricsPort) + } + if ann["prometheus.io/path"] != "/metrics" { + t.Errorf("prometheus.io/path = %v, want /metrics", ann["prometheus.io/path"]) + } } func TestEnsureCR_KubernetesAuthOff_OmitsAuthenticationBlock(t *testing.T) {