@@ -76,6 +76,9 @@ public final class GeneralConfig {
public static final String TRACER_METRICS_MAX_PENDING = "trace.tracer.metrics.max.pending";
public static final String TRACER_METRICS_IGNORED_RESOURCES =
"trace.tracer.metrics.ignored.resources";
public static final String TRACE_STATS_ADDITIONAL_TAGS = "trace.stats.additional.tags";
public static final String TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT =
"trace.stats.additional.tags.cardinality.limit";

public static final String AZURE_APP_SERVICES = "azure.app.services";
public static final String INTERNAL_EXIT_ON_FAILURE = "trace.internal.exit.on.failure";
@@ -40,6 +40,8 @@ public class ConflatingMetricsAggregatorBenchmark {
new ConflatingMetricsAggregator(
new WellKnownTags("", "", "", "", "", ""),
Collections.emptySet(),
Collections.emptySet(),
100,
featuresDiscovery,
HealthMetrics.NO_OP,
new NullSink(),
@@ -0,0 +1,92 @@
package datadog.trace.common.metrics;

import datadog.trace.core.monitor.HealthMetrics;
import java.util.Collections;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Bounded per-tag cardinality protection for {@code additional_metric_tags}.
*
* <p>For each configured tag key, admits at most {@code limitPerTag} distinct values within a
* rolling window. Excess values are replaced with {@link #BLOCKED_VALUE} so the span's base stats
* still flow through but the extra dimension is suppressed.
*
* <p>The rolling window is implemented as a hard reset: callers schedule {@link #reset()} on a
* fixed interval (10 minutes by default). After a reset, previously blocked values get a fresh
* chance to be admitted.
*/
final class AdditionalTagsCardinalityLimiter {

static final String BLOCKED_VALUE = "blocked_by_tracer";

private static final Logger log = LoggerFactory.getLogger(AdditionalTagsCardinalityLimiter.class);

private final int limitPerTag;
private final HealthMetrics healthMetrics;
private final ConcurrentHashMap<String, Set<String>> seenValuesPerTag = new ConcurrentHashMap<>();
dougqh (Contributor), May 13, 2026:
I think we're going to want a lighter-weight collection for this purpose. ConcurrentHashMap is very allocation heavy.

As mentioned elsewhere, I think a better approach would be to have an AdditionalTagCardinalityLimiter per tag. Then we don't need the extra allocation heft of the ConcurrentHashMap.

I'm also debating whether we can do something cheaper for the Set, but I think we could leave that for later.

I'm also hoping to get away from the extra coordination overhead needed for ConcurrentHashMap by moving more of the metrics work into a background thread.

Author:
What do you propose we use instead? An array of length 100 would work but then our lookup time is less efficient.

dougqh (Contributor), May 14, 2026:
Not necessarily.

Pointer chasing actually isn't great on modern hardware.
And arrays benefit from prefetch from caches, so often a small array beats a linked list.

Admittedly, I need to benchmark a bit to determine what's best here.
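
A minimal sketch of the array-backed, per-tag shape discussed here (illustrative only, not code from this PR; the class name, locking, and method names are assumptions):

final class PerTagCardinalityLimiter {
  private final String[] admitted; // fixed capacity == the per-tag limit
  private int size;

  PerTagCardinalityLimiter(int limitPerTag) {
    this.admitted = new String[limitPerTag];
  }

  /** Linear scan of a small contiguous array; benefits from cache prefetch. */
  synchronized String admitOrBlock(String value) {
    for (int i = 0; i < size; i++) {
      if (admitted[i].equals(value)) {
        return value;
      }
    }
    if (size == admitted.length) {
      return "blocked_by_tracer"; // i.e. BLOCKED_VALUE
    }
    admitted[size++] = value;
    return value;
  }

  /** Hard reset at the start of each window. */
  synchronized void reset() {
    java.util.Arrays.fill(admitted, 0, size, null);
    size = 0;
  }
}

Whether a linear scan beats hashing at a limit of around 100 values is exactly what the benchmarking mentioned above would need to settle.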

private final Set<String> warnedAboutCardinality =
Collections.newSetFromMap(new ConcurrentHashMap<>());
private final Set<String> warnedAboutLength =
Collections.newSetFromMap(new ConcurrentHashMap<>());

AdditionalTagsCardinalityLimiter(int limitPerTag, HealthMetrics healthMetrics) {
this.limitPerTag = limitPerTag;
this.healthMetrics = healthMetrics;
}

/**
* @return {@code value} if admitted under the cap, otherwise {@link #BLOCKED_VALUE}.
*/
String admitOrBlock(String tagKey, String value) {
Set<String> seen =
seenValuesPerTag.computeIfAbsent(
tagKey, k -> Collections.newSetFromMap(new ConcurrentHashMap<>()));
if (seen.contains(value)) {
return value;
}
if (seen.size() >= limitPerTag) {
healthMetrics.onAdditionalTagValueCardinalityBlocked(tagKey);
if (warnedAboutCardinality.add(tagKey)) {
log.warn(
"Additional metric tag '{}' exceeded the per-tag cardinality limit of {}; "
+ "replacing values with '{}' for the rest of the current window",
tagKey,
limitPerTag,
BLOCKED_VALUE);
}
return BLOCKED_VALUE;
}
seen.add(value);
return value;
}

/**
* Records that a value for {@code tagKey} was blocked due to exceeding the per-value length cap.
* Fires the same health metric as a cardinality block and emits a distinct warn log line once per
* tag key per window.
*/
void noteBlockedDueToLength(String tagKey, int valueLength, int maxLength) {
healthMetrics.onAdditionalTagValueCardinalityBlocked(tagKey);
if (warnedAboutLength.add(tagKey)) {
log.warn(
"Additional metric tag '{}' had a value of length {} exceeding the max length of {}; "
+ "replacing with '{}' for the rest of the current window",
tagKey,
valueLength,
maxLength,
BLOCKED_VALUE);
}
}

/** Clears per-tag value sets and rearms the per-key log lines. Invoked by the periodic task. */
void reset() {
for (Set<String> seen : seenValuesPerTag.values()) {
seen.clear();
}
warnedAboutCardinality.clear();
warnedAboutLength.clear();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,16 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve
Pair.of(
DDCaches.newFixedSizeCache(512),
value -> UTF8BytesString.create(key + ":" + value));
private static final DDCache<
String, Pair<DDCache<String, UTF8BytesString>, Function<String, UTF8BytesString>>>
ADDITIONAL_TAG_VALUES_CACHE = DDCaches.newFixedSizeCache(64);
private static final Function<
String, Pair<DDCache<String, UTF8BytesString>, Function<String, UTF8BytesString>>>
ADDITIONAL_TAG_VALUES_CACHE_ADDER =
Contributor:
In hindsight, I wish we hadn't chosen to concatenate key + ":" + value in the payload. For keeping memory down, separate fields would probably have been better.

key ->
Pair.of(
DDCaches.newFixedSizeCache(512),
value -> UTF8BytesString.create(key + ":" + value));
Contributor:
I think, as is, this degenerates into hot allocation when a tag has high cardinality. In a high-cardinality situation, we need to avoid the concatenation before it happens.

Right now, we're still paying the allocation cost and undoing the benefit of the cache.

dougqh (Contributor), May 13, 2026:
Just to verify, I ran this through Claude as well...

Vulnerabilities / Misconfiguration risk

1. No per-value cardinality cap (acknowledged in PR description). The MAX_ADDITIONAL_TAG_KEYS = 10 cap protects against the number of configured keys. It does nothing about the number of unique values per key. A single misconfiguration (DD_TRACE_STATS_ADDITIONAL_TAGS=user_id or request_id or trace_id) gives the customer an unbounded MetricKey set, unbounded keys map growth, and an oversized payload to the agent. The PR explicitly defers cardinality protection ("requires an HLL/bounded-set estimator"), which is reasonable for an MVP, but in its current form this config is a foot-gun and should not be enabled by default, which it isn't. Two suggestions:
   - Add a log.warn (one-shot, at startup) when additionalTagKeys is non-empty, calling out the cardinality risk and pointing at the future DD_TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT config.
   - Consider a stop-gap "max unique MetricKeys before we stop adding" guard rather than waiting for the full HLL implementation.
2. Tag values are not length-bounded before being interned and serialized. UTF8BytesString.create(key + ":" + value) will faithfully serialize whatever the customer set on the span tag. If application code stuffs a JSON body or a stack trace into a tag named in DD_TRACE_STATS_ADDITIONAL_TAGS, the per-value cache entry, the MetricKey, and the msgpack payload all carry it. This is pre-existing for peer tags too, but again is amplified here because the user controls which keys feed this path. A value.length() < N guard with a one-shot warn would be cheap insurance.

Overhead

5. Cache footprint scales with MAX_ADDITIONAL_TAG_KEYS × per-key-cache-size. ADDITIONAL_TAG_VALUES_CACHE is newFixedSizeCache(64) (plenty), but each entry holds an inner newFixedSizeCache(512). With 10 configured keys that's up to 10 × 512 = 5,120 cached UTF8BytesString entries plus the lambda closures, vs. the existing peer-tags cache of similar size. Acceptable, but worth a comment near ADDITIONAL_TAG_VALUES_CACHE explaining the bound (mirrors the existing comment on PEER_TAGS_CACHE).
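
For the first suggestion above, a one-shot startup warn could be as small as the following (a hypothetical placement in the aggregator constructor; the message text is illustrative, not from this PR):

    // Hypothetical: warn once at construction when the feature is enabled at all.
    if (!additionalTagKeys.isEmpty()) {
      log.warn(
          "Additional metric tags {} are enabled; high-cardinality tag values can inflate "
              + "stats payloads until the per-tag cardinality limit takes effect",
          additionalTagKeys);
    }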

dougqh (Contributor), May 13, 2026:
Also worth noting: the cache is still unbounded in terms of bytes. That has bitten us with SQL statements previously.

And I guess part of what Claude is pointing out is that tags may also contain complicated objects; we might need to filter those out, otherwise allocation could explode if misconfigured. Although that was definitely a pre-existing issue.

Author:
I added basic cardinality control and will add a character limit to the string values; would that address all your concerns here?

dougqh (Contributor), May 14, 2026:
I'm experimenting with a revised approach.

I think the simplest solution might be to combine the cardinality tracking and the caching into a single class.

In hindsight, I'd argue that DDCache isn't a great fit for metrics. DDCache works well with low cardinality, but doesn't work well in situations where cardinality can be high.

In a strict sense, that's okay because the GC will be able to reclaim the memory, so memory isn't unbounded. However, a solution that deals with high cardinality more gracefully would be preferable.
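
One possible shape for that combined class (a sketch under assumptions, not the actual revision; the name LimitedTagValueCache and the array-backed storage are invented for illustration) is a per-tag object where a lookup hit answers the cardinality question and returns the already-interned bytes in one step:

// Illustrative sketch: one instance per configured tag key, combining the
// cardinality limit with interning of the formatted "key:value" bytes.
// Uses the existing UTF8BytesString helper from the surrounding code.
final class LimitedTagValueCache {
  private final String tagKey;
  private final String[] values; // admitted raw values, capacity == per-tag limit
  private final UTF8BytesString[] interned; // formatted "key:value" per admitted value
  private final UTF8BytesString blocked; // formatted "key:blocked_by_tracer"
  private int size;

  LimitedTagValueCache(String tagKey, int limitPerTag) {
    this.tagKey = tagKey;
    this.values = new String[limitPerTag];
    this.interned = new UTF8BytesString[limitPerTag];
    this.blocked = UTF8BytesString.create(tagKey + ":blocked_by_tracer");
  }

  synchronized UTF8BytesString intern(String value) {
    for (int i = 0; i < size; i++) {
      if (values[i].equals(value)) {
        return interned[i]; // hit: no concatenation, no allocation
      }
    }
    if (size == values.length) {
      return blocked; // over the per-tag limit for this window
    }
    UTF8BytesString formatted = UTF8BytesString.create(tagKey + ":" + value);
    values[size] = value;
    interned[size] = formatted;
    size++;
    return formatted;
  }

  synchronized void reset() {
    java.util.Arrays.fill(values, 0, size, null);
    java.util.Arrays.fill(interned, 0, size, null);
    size = 0;
  }
}

Notably, the limit check happens before any key + ":" + value concatenation, which avoids the hot-allocation path flagged earlier for high-cardinality tags.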

private static final CharSequence SYNTHETICS_ORIGIN = "synthetics";

private static final Set<String> ELIGIBLE_SPAN_KINDS_FOR_METRICS =
@@ -92,7 +102,21 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve
unmodifiableSet(
new HashSet<>(Arrays.asList(SPAN_KIND_CLIENT, SPAN_KIND_PRODUCER, SPAN_KIND_CONSUMER)));

// Cap on configured additional metric tag keys. By default only 4 primary tag dimensions are
// supported. We sometimes increase this limit for users, so a value of 10 protects against
// extreme misconfiguration while still allowing some additional tags to be used.
static final int MAX_ADDITIONAL_TAG_KEYS = 10;

// Maximum length of an additional metric tag *value*. Caps cache footprint and wire payload
// size from stack-trace / JSON / SQL stuffed into a tag by misconfigured app code. Values
// exceeding this are emitted as `<tagKey>:blocked_by_tracer`.
static final int MAX_ADDITIONAL_TAG_VALUE_LENGTH = 250;

private final Set<String> ignoredResources;
private final List<String> additionalTagKeys;
private final AdditionalTagsCardinalityLimiter cardinalityLimiter;
dougqh (Contributor), May 13, 2026:
Rather than a single AdditionalTagsCardinalityLimiter that handles all the extra tags, I think a better design would be a handler per configured tag.

That would avoid the Map-of-Set design, which is adding undue allocation.

Although for the most part, I think we just need to take a step back and rework this code before we add more to it. Rather than continue to review this PR, I'd like to work on solutions to these issues more directly.

private final MessagePassingQueue<Batch> batchPool;
private final ConcurrentHashMap<MetricKey, Batch> pending;
private final ConcurrentHashMap<MetricKey, MetricKey> keys;
@@ -107,6 +131,10 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve
private final boolean includeEndpointInMetrics;

private volatile AgentTaskScheduler.Scheduled<?> cancellation;
private volatile AgentTaskScheduler.Scheduled<?> cardinalityResetCancellation;

// Hard-reset window for per-tag value cardinality tracking.
static final long CARDINALITY_RESET_INTERVAL_MINUTES = 10;

public ConflatingMetricsAggregator(
Config config,
@@ -115,6 +143,8 @@ public ConflatingMetricsAggregator(
this(
config.getWellKnownTags(),
config.getMetricsIgnoredResources(),
config.getTraceStatsAdditionalTags(),
config.getTraceStatsAdditionalTagsCardinalityLimit(),
sharedCommunicationObjects.featuresDiscovery(config),
healthMetrics,
new OkHttpSink(
@@ -132,6 +162,8 @@ public ConflatingMetricsAggregator(
ConflatingMetricsAggregator(
WellKnownTags wellKnownTags,
Set<String> ignoredResources,
Set<String> additionalTagKeys,
int additionalTagsCardinalityLimit,
DDAgentFeaturesDiscovery features,
HealthMetrics healthMetric,
Sink sink,
@@ -141,6 +173,8 @@ public ConflatingMetricsAggregator(
this(
wellKnownTags,
ignoredResources,
additionalTagKeys,
additionalTagsCardinalityLimit,
features,
healthMetric,
sink,
@@ -154,6 +188,8 @@ public ConflatingMetricsAggregator(
ConflatingMetricsAggregator(
WellKnownTags wellKnownTags,
Set<String> ignoredResources,
Set<String> additionalTagKeys,
int additionalTagsCardinalityLimit,
DDAgentFeaturesDiscovery features,
HealthMetrics healthMetric,
Sink sink,
@@ -164,6 +200,8 @@ public ConflatingMetricsAggregator(
boolean includeEndpointInMetrics) {
this(
ignoredResources,
additionalTagKeys,
additionalTagsCardinalityLimit,
features,
healthMetric,
sink,
@@ -177,6 +215,8 @@ public ConflatingMetricsAggregator(

ConflatingMetricsAggregator(
Set<String> ignoredResources,
Set<String> additionalTagKeys,
int additionalTagsCardinalityLimit,
DDAgentFeaturesDiscovery features,
HealthMetrics healthMetric,
Sink sink,
@@ -187,6 +227,9 @@ public ConflatingMetricsAggregator(
TimeUnit timeUnit,
boolean includeEndpointInMetrics) {
this.ignoredResources = ignoredResources;
this.additionalTagKeys = normalizeAdditionalTagKeys(additionalTagKeys);
this.cardinalityLimiter =
new AdditionalTagsCardinalityLimiter(additionalTagsCardinalityLimit, healthMetric);
this.includeEndpointInMetrics = includeEndpointInMetrics;
this.inbox = Queues.mpscArrayQueue(queueSize);
this.batchPool = Queues.spmcArrayQueue(maxAggregates);
@@ -223,6 +266,14 @@ public void start() {
reportingInterval,
reportingInterval,
reportingIntervalTimeUnit);
cardinalityResetCancellation =
AgentTaskScheduler.get()
.scheduleAtFixedRate(
new CardinalityResetTask(),
this,
CARDINALITY_RESET_INTERVAL_MINUTES,
CARDINALITY_RESET_INTERVAL_MINUTES,
TimeUnit.MINUTES);
log.debug("started metrics aggregator");
}

@@ -350,7 +401,8 @@ private boolean publish(CoreSpan<?> span, boolean isTopLevel, CharSequence spanK
getPeerTags(span, spanKind.toString()),
httpMethod,
httpEndpoint,
grpcStatusCode);
grpcStatusCode,
getAdditionalTags(span));
MetricKey key = keys.putIfAbsent(newKey, newKey);
if (null == key) {
key = newKey;
@@ -413,6 +465,55 @@ private List<UTF8BytesString> getPeerTags(CoreSpan<?> span, String spanKind) {
return Collections.emptyList();
}

static List<String> normalizeAdditionalTagKeys(Set<String> configured) {
if (configured == null || configured.isEmpty()) {
return Collections.emptyList();
}
List<String> sorted = new ArrayList<>(configured);
Collections.sort(sorted);
if (sorted.size() > MAX_ADDITIONAL_TAG_KEYS) {
log.warn(
"Configured additional metric tag keys ({}) exceeds the supported limit of {}; "
+ "dropping extra keys: {}",
sorted.size(),
MAX_ADDITIONAL_TAG_KEYS,
sorted.subList(MAX_ADDITIONAL_TAG_KEYS, sorted.size()));
sorted = sorted.subList(0, MAX_ADDITIONAL_TAG_KEYS);
}
return Collections.unmodifiableList(new ArrayList<>(sorted));
}

private List<UTF8BytesString> getAdditionalTags(CoreSpan<?> span) {
Contributor:
We already have high allocation from creating MetricKey-s just to perform Map look-ups.
Producing an intermediate list with UTF8BytesStrings is going to compound that problem significantly.

Author:
I copied how we interact with peer tags, to keep behavior consistent. What do you propose I do differently?

if (additionalTagKeys.isEmpty()) {
return Collections.emptyList();
}
List<UTF8BytesString> result = null;
for (String tagKey : additionalTagKeys) {
Object value = span.unsafeGetTag(tagKey);
if (value == null) {
continue;
}
String rawValue = value.toString();
String admittedValue;
if (rawValue.length() > MAX_ADDITIONAL_TAG_VALUE_LENGTH) {
cardinalityLimiter.noteBlockedDueToLength(
tagKey, rawValue.length(), MAX_ADDITIONAL_TAG_VALUE_LENGTH);
admittedValue = AdditionalTagsCardinalityLimiter.BLOCKED_VALUE;
} else {
admittedValue = cardinalityLimiter.admitOrBlock(tagKey, rawValue);
}
Pair<DDCache<String, UTF8BytesString>, Function<String, UTF8BytesString>> cacheAndCreator =
ADDITIONAL_TAG_VALUES_CACHE.computeIfAbsent(tagKey, ADDITIONAL_TAG_VALUES_CACHE_ADDER);
UTF8BytesString formatted =
cacheAndCreator.getLeft().computeIfAbsent(admittedValue, cacheAndCreator.getRight());
if (result == null) {
result = new ArrayList<>(additionalTagKeys.size());
}
result.add(formatted);
}
return result == null ? Collections.emptyList() : result;
}

private static boolean isSynthetic(CoreSpan<?> span) {
return span.getOrigin() != null && SYNTHETICS_ORIGIN.equals(span.getOrigin().toString());
}
@@ -429,6 +530,9 @@ public void stop() {
if (null != cancellation) {
cancellation.cancel();
}
if (null != cardinalityResetCancellation) {
cardinalityResetCancellation.cancel();
}
inbox.offer(STOP);
}

@@ -482,4 +586,13 @@ public void run(ConflatingMetricsAggregator target) {
target.report();
}
}

private static final class CardinalityResetTask
implements AgentTaskScheduler.Task<ConflatingMetricsAggregator> {

@Override
public void run(ConflatingMetricsAggregator target) {
target.cardinalityLimiter.reset();
}
}
}
@@ -39,6 +39,7 @@ public final class MetricKey {
private final UTF8BytesString httpMethod;
private final UTF8BytesString httpEndpoint;
private final UTF8BytesString grpcStatusCode;
private final List<UTF8BytesString> additionalTags;

public MetricKey(
CharSequence resource,
@@ -53,7 +54,8 @@ public MetricKey(
List<UTF8BytesString> peerTags,
CharSequence httpMethod,
CharSequence httpEndpoint,
CharSequence grpcStatusCode) {
CharSequence grpcStatusCode,
List<UTF8BytesString> additionalTags) {
this.resource = null == resource ? EMPTY : utf8(RESOURCE_CACHE, resource);
this.service = null == service ? EMPTY : utf8(SERVICE_CACHE, service);
this.serviceSource = null == serviceSource ? null : utf8(SERVICE_SOURCE_CACHE, serviceSource);
@@ -68,6 +70,7 @@ public MetricKey(
this.httpEndpoint = httpEndpoint == null ? null : utf8(HTTP_ENDPOINT_CACHE, httpEndpoint);
this.grpcStatusCode =
grpcStatusCode == null ? null : utf8(GRPC_STATUS_CODE_CACHE, grpcStatusCode);
this.additionalTags = additionalTags == null ? Collections.emptyList() : additionalTags;

int tmpHash = 0;
tmpHash = HashingUtils.addToHash(tmpHash, this.isTraceRoot);
@@ -83,6 +86,7 @@ public MetricKey(
tmpHash = HashingUtils.addToHash(tmpHash, this.httpEndpoint);
tmpHash = HashingUtils.addToHash(tmpHash, this.httpMethod);
tmpHash = HashingUtils.addToHash(tmpHash, this.grpcStatusCode);
tmpHash = HashingUtils.addToHash(tmpHash, this.additionalTags);
this.hash = tmpHash;
}

@@ -146,6 +150,10 @@ public UTF8BytesString getGrpcStatusCode() {
return grpcStatusCode;
}

public List<UTF8BytesString> getAdditionalTags() {
return additionalTags;
}

@Override
public boolean equals(Object o) {
if (this == o) {
@@ -166,7 +174,8 @@ public boolean equals(Object o) {
&& Objects.equals(serviceSource, metricKey.serviceSource)
&& Objects.equals(httpMethod, metricKey.httpMethod)
&& Objects.equals(httpEndpoint, metricKey.httpEndpoint)
&& Objects.equals(grpcStatusCode, metricKey.grpcStatusCode);
&& Objects.equals(grpcStatusCode, metricKey.grpcStatusCode)
&& additionalTags.equals(metricKey.additionalTags);
}
return false;
}