Skip to content

Commit f4b2882

Browse files
committed
wip
Signed-off-by: Attila Mészáros <a_meszaros@apple.com>
1 parent 1a48e38 commit f4b2882

7 files changed

Lines changed: 69 additions & 240 deletions

File tree

micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java

Lines changed: 66 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
*/
1616
package io.javaoperatorsdk.operator.monitoring.micrometer;
1717

18+
import java.time.Duration;
1819
import java.util.*;
1920
import java.util.concurrent.ConcurrentHashMap;
2021
import java.util.concurrent.atomic.AtomicInteger;
@@ -40,6 +41,7 @@
4041
public class MicrometerMetricsV2 implements Metrics {
4142

4243
private static final String CONTROLLER_NAME = "controller.name";
44+
private static final String NAMESPACE = "namespace";
4345
private static final String EVENT = "event";
4446
private static final String ACTION = "action";
4547
private static final String EVENTS_RECEIVED = "events.received";
@@ -59,13 +61,6 @@ public class MicrometerMetricsV2 implements Metrics {
5961
RECONCILIATIONS + "retries" + TOTAL_SUFFIX;
6062
private static final String RECONCILIATIONS_STARTED = RECONCILIATIONS + "started" + TOTAL_SUFFIX;
6163

62-
private static final String CONTROLLERS = "controllers.";
63-
64-
private static final String CONTROLLERS_SUCCESSFUL_EXECUTION =
65-
CONTROLLERS + SUCCESS_SUFFIX + TOTAL_SUFFIX;
66-
private static final String CONTROLLERS_FAILED_EXECUTION =
67-
CONTROLLERS + FAILURE_SUFFIX + TOTAL_SUFFIX;
68-
6964
private static final String RECONCILIATIONS_EXECUTIONS_GAUGE = RECONCILIATIONS + "executions";
7065
private static final String RECONCILIATIONS_QUEUE_SIZE_GAUGE = RECONCILIATIONS + "active";
7166
private static final String NUMBER_OF_RESOURCE_GAUGE = "custom_resources";
@@ -77,6 +72,7 @@ public class MicrometerMetricsV2 implements Metrics {
7772
private final Map<String, AtomicInteger> gauges = new ConcurrentHashMap<>();
7873
private final Map<String, Timer> executionTimers = new ConcurrentHashMap<>();
7974
private final Function<Timer.Builder, Timer.Builder> timerConfig;
75+
private final boolean includeNamespaceTag;
8076

8177
/**
8278
* Creates a new builder to configure how the eventual MicrometerMetricsV2 instance will behave,
@@ -98,15 +94,34 @@ public static MicrometerMetricsV2Builder newPerResourceCollectingMicrometerMetri
9894
* @param timerConfig optional configuration for timers, defaults to publishing percentiles 0.5,
9995
* 0.95, 0.99 and histogram
10096
*/
101-
private MicrometerMetricsV2(MeterRegistry registry, Consumer<Timer.Builder> timerConfig) {
97+
private MicrometerMetricsV2(
98+
MeterRegistry registry, Consumer<Timer.Builder> timerConfig, boolean includeNamespaceTag) {
10299
this.registry = registry;
100+
this.includeNamespaceTag = includeNamespaceTag;
103101
this.timerConfig =
104102
timerConfig != null
105103
? builder -> {
106104
timerConfig.accept(builder);
107105
return builder;
108106
}
109-
: Timer.Builder::publishPercentileHistogram;
107+
// Use explicit SLO buckets rather than publishPercentileHistogram(). When using
108+
// OtlpMeterRegistry (Micrometer 1.12+), publishPercentileHistogram() sends Base2
109+
// Exponential Histograms over OTLP, which the OTel collector exposes as Prometheus
110+
// native histograms — incompatible with histogram_quantile() and classic _bucket
111+
// queries. Explicit SLO boundaries force EXPLICIT_BUCKET_HISTOGRAM format, which the
112+
// collector reliably exposes as _bucket metrics.
113+
: builder ->
114+
builder.serviceLevelObjectives(
115+
Duration.ofMillis(10),
116+
Duration.ofMillis(50),
117+
Duration.ofMillis(100),
118+
Duration.ofMillis(250),
119+
Duration.ofMillis(500),
120+
Duration.ofSeconds(1),
121+
Duration.ofSeconds(2),
122+
Duration.ofSeconds(5),
123+
Duration.ofSeconds(10),
124+
Duration.ofSeconds(30));
110125
}
111126

112127
@Override
@@ -138,29 +153,18 @@ private String numberOfResourcesRefName(String name) {
138153
return NUMBER_OF_RESOURCE_GAUGE + name;
139154
}
140155

141-
// todo make the implementation more extensible, like easily add tags for namespace into metrics
142-
// todo does it make sense to have both controller and reconciler execution counters?
143156
@Override
144157
public <T> T timeControllerExecution(ControllerExecution<T> execution) {
145158
final var name = execution.controllerName();
146-
147159
final var timer = executionTimers.get(name);
148-
try {
149-
final var result =
150-
timer.record(
151-
() -> {
152-
try {
153-
return execution.execute();
154-
} catch (Exception e) {
155-
throw new OperatorException(e);
156-
}
157-
});
158-
registry.counter(CONTROLLERS_SUCCESSFUL_EXECUTION, CONTROLLER_NAME, name).increment();
159-
return result;
160-
} catch (Exception e) {
161-
registry.counter(CONTROLLERS_FAILED_EXECUTION, CONTROLLER_NAME, name).increment();
162-
throw e;
163-
}
160+
return timer.record(
161+
() -> {
162+
try {
163+
return execution.execute();
164+
} catch (Exception e) {
165+
throw new OperatorException(e);
166+
}
167+
});
164168
}
165169

166170
@Override
@@ -172,14 +176,17 @@ public void receivedEvent(Event event, Map<String, Object> metadata) {
172176
if (resourceEvent.getAction() == ResourceAction.DELETED) {
173177
gauges.get(numberOfResourcesRefName(getControllerName(metadata))).decrementAndGet();
174178
}
179+
var namespace = resourceEvent.getRelatedCustomResourceID().getNamespace().orElse(null);
175180
incrementCounter(
176181
EVENTS_RECEIVED,
182+
namespace,
177183
metadata,
178184
Tag.of(EVENT, event.getClass().getSimpleName()),
179185
Tag.of(ACTION, resourceEvent.getAction().toString()));
180186
} else {
181187
incrementCounter(
182188
EVENTS_RECEIVED,
189+
null,
183190
metadata,
184191
Tag.of(EVENT, event.getClass().getSimpleName()),
185192
Tag.of(ACTION, UNKNOWN_ACTION));
@@ -188,20 +195,20 @@ public void receivedEvent(Event event, Map<String, Object> metadata) {
188195

189196
@Override
190197
public void cleanupDoneFor(ResourceID resourceID, Map<String, Object> metadata) {
191-
incrementCounter(EVENTS_DELETE, metadata);
198+
incrementCounter(EVENTS_DELETE, resourceID.getNamespace().orElse(null), metadata);
192199
}
193200

194201
@Override
195202
public void submittedForReconciliation(
196203
HasMetadata resource, RetryInfo retryInfoNullable, Map<String, Object> metadata) {
197204
Optional<RetryInfo> retryInfo = Optional.ofNullable(retryInfoNullable);
198205

199-
// Record the counter without retry tags
200-
incrementCounter(RECONCILIATIONS_STARTED, metadata);
206+
var namespace = resource.getMetadata().getNamespace();
207+
incrementCounter(RECONCILIATIONS_STARTED, namespace, metadata);
201208

202209
int retryNumber = retryInfo.map(RetryInfo::getAttemptCount).orElse(0);
203210
if (retryNumber > 0) {
204-
incrementCounter(RECONCILIATIONS_RETRIES_NUMBER, metadata);
211+
incrementCounter(RECONCILIATIONS_RETRIES_NUMBER, namespace, metadata);
205212
}
206213

207214
var controllerQueueSize =
@@ -212,7 +219,7 @@ public void submittedForReconciliation(
212219
@Override
213220
public void successfullyFinishedReconciliation(
214221
HasMetadata resource, Map<String, Object> metadata) {
215-
incrementCounter(RECONCILIATIONS_SUCCESS, metadata);
222+
incrementCounter(RECONCILIATIONS_SUCCESS, resource.getMetadata().getNamespace(), metadata);
216223
}
217224

218225
@Override
@@ -237,7 +244,7 @@ public void reconciliationExecutionFinished(
237244
@Override
238245
public void failedReconciliation(
239246
HasMetadata resource, RetryInfo retry, Exception exception, Map<String, Object> metadata) {
240-
incrementCounter(RECONCILIATIONS_FAILED, metadata);
247+
incrementCounter(RECONCILIATIONS_FAILED, resource.getMetadata().getNamespace(), metadata);
241248
}
242249

243250
private static void addTag(String name, String value, List<Tag> tags) {
@@ -252,11 +259,17 @@ private static void addControllerNameTag(String name, List<Tag> tags) {
252259
addTag(CONTROLLER_NAME, name, tags);
253260
}
254261

255-
private void incrementCounter(
256-
String counterName, Map<String, Object> metadata, Tag... additionalTags) {
262+
private void addNamespaceTag(String namespace, List<Tag> tags) {
263+
if (includeNamespaceTag && namespace != null && !namespace.isBlank()) {
264+
tags.add(Tag.of(NAMESPACE, namespace));
265+
}
266+
}
257267

258-
final var tags = new ArrayList<Tag>(1 + additionalTags.length);
268+
private void incrementCounter(
269+
String counterName, String namespace, Map<String, Object> metadata, Tag... additionalTags) {
270+
final var tags = new ArrayList<Tag>(2 + additionalTags.length);
259271
addControllerNameTag(metadata, tags);
272+
addNamespaceTag(namespace, tags);
260273
if (additionalTags.length > 0) {
261274
tags.addAll(List.of(additionalTags));
262275
}
@@ -278,6 +291,7 @@ public static String getControllerName(Map<String, Object> metadata) {
278291
public static class MicrometerMetricsV2Builder {
279292
protected final MeterRegistry registry;
280293
protected Consumer<Timer.Builder> executionTimerConfig = null;
294+
protected boolean includeNamespaceTag = false;
281295

282296
public MicrometerMetricsV2Builder(MeterRegistry registry) {
283297
this.registry = registry;
@@ -297,8 +311,22 @@ public MicrometerMetricsV2Builder withExecutionTimerConfig(
297311
return this;
298312
}
299313

314+
/**
315+
* When enabled, a {@code namespace} tag is added to all per-reconciliation counters (started,
316+
* success, failure, retries, events, deletes). Gauges remain controller-scoped because
317+
* namespaces are not known at controller registration time.
318+
*
319+
* <p>Disabled by default to avoid unexpected cardinality increases in existing deployments.
320+
*
321+
* @return this builder for method chaining
322+
*/
323+
public MicrometerMetricsV2Builder withNamespaceAsTag() {
324+
this.includeNamespaceTag = true;
325+
return this;
326+
}
327+
300328
public MicrometerMetricsV2 build() {
301-
return new MicrometerMetricsV2(registry, executionTimerConfig);
329+
return new MicrometerMetricsV2(registry, executionTimerConfig, includeNamespaceTag);
302330
}
303331
}
304332
}

observability/josdk-operator-metrics-dashboard.json

Lines changed: 1 addition & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -865,107 +865,6 @@
865865
"title": "Failures by Controller",
866866
"type": "timeseries"
867867
},
868-
{
869-
"datasource": {
870-
"type": "prometheus",
871-
"uid": "prometheus"
872-
},
873-
"description": "Controller execution success vs failure",
874-
"fieldConfig": {
875-
"defaults": {
876-
"color": {
877-
"mode": "palette-classic"
878-
},
879-
"custom": {
880-
"axisCenteredZero": false,
881-
"axisColorMode": "text",
882-
"axisLabel": "",
883-
"axisPlacement": "auto",
884-
"barAlignment": 0,
885-
"drawStyle": "line",
886-
"fillOpacity": 10,
887-
"gradientMode": "none",
888-
"hideFrom": {
889-
"tooltip": false,
890-
"viz": false,
891-
"legend": false
892-
},
893-
"lineInterpolation": "linear",
894-
"lineWidth": 1,
895-
"pointSize": 5,
896-
"scaleDistribution": {
897-
"type": "linear"
898-
},
899-
"showPoints": "never",
900-
"spanNulls": false,
901-
"stacking": {
902-
"group": "A",
903-
"mode": "none"
904-
},
905-
"thresholdsStyle": {
906-
"mode": "off"
907-
}
908-
},
909-
"mappings": [],
910-
"thresholds": {
911-
"mode": "absolute",
912-
"steps": [
913-
{
914-
"color": "green",
915-
"value": null
916-
}
917-
]
918-
},
919-
"unit": "ops"
920-
},
921-
"overrides": []
922-
},
923-
"gridPos": {
924-
"h": 8,
925-
"w": 12,
926-
"x": 0,
927-
"y": 32
928-
},
929-
"id": 10,
930-
"options": {
931-
"legend": {
932-
"calcs": ["last", "mean"],
933-
"displayMode": "table",
934-
"placement": "bottom",
935-
"showLegend": true
936-
},
937-
"tooltip": {
938-
"mode": "single",
939-
"sort": "none"
940-
}
941-
},
942-
"targets": [
943-
{
944-
"datasource": {
945-
"type": "prometheus",
946-
"uid": "prometheus"
947-
},
948-
"editorMode": "code",
949-
"expr": "sum(rate(controllers_success_total{service_name=~\"$service_name\"}[5m])) by (controller_name)",
950-
"legendFormat": "Success - {{controller_name}}",
951-
"range": true,
952-
"refId": "A"
953-
},
954-
{
955-
"datasource": {
956-
"type": "prometheus",
957-
"uid": "prometheus"
958-
},
959-
"editorMode": "code",
960-
"expr": "sum(rate(controllers_failure_total{service_name=~\"$service_name\"}[5m])) by (controller_name)",
961-
"legendFormat": "Failure - {{controller_name}}",
962-
"range": true,
963-
"refId": "B"
964-
}
965-
],
966-
"title": "Controller Execution Success vs Failure",
967-
"type": "timeseries"
968-
},
969868
{
970869
"datasource": {
971870
"type": "prometheus",
@@ -1165,7 +1064,7 @@
11651064
"current": {},
11661065
"datasource": {
11671066
"type": "prometheus",
1168-
"uid": "${DS_PROMETHEUS}"
1067+
"uid": "prometheus"
11691068
},
11701069
"definition": "label_values(reconciliations_started_total, service_name)",
11711070
"hide": 0,

sample-operators/metrics-processing/k8s/test-resource1-fail.yaml

Lines changed: 0 additions & 22 deletions
This file was deleted.

sample-operators/metrics-processing/k8s/test-resource1-success.yaml

Lines changed: 0 additions & 22 deletions
This file was deleted.

0 commit comments

Comments
 (0)