From 5908557aba7ded1835f11f0b0860253220eb4060 Mon Sep 17 00:00:00 2001
From: ArunPiduguDD <arun.pidugu@datadoghq.com>
Date: Thu, 18 Jun 2026 01:34:10 +0000
Subject: [PATCH 1/2] feat(tag_cardinality_limit): add per-tag
 cache_size_per_key override in probabilistic mode

---
 ...dinality_per_tag_cache_size.enhancement.md |   3 +
 .../tag_cardinality_limit/config.rs           |  19 +-
 src/transforms/tag_cardinality_limit/mod.rs   |  26 +-
 src/transforms/tag_cardinality_limit/tests.rs | 146 +++++-
 .../generated/tag_cardinality_limit.cue       |  26 +-
 .../cue/reference/generated/configuration.cue | 446 +-----------------
 6 files changed, 209 insertions(+), 457 deletions(-)
 create mode 100644 changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md
diff --git a/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md b/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md
new file mode 100644
index 0000000000000..65547bd0f350d
--- /dev/null
+++ b/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md
@@ -0,0 +1,3 @@
+Adds support for specifying a `cache_size_per_key` in per-tag override configuration options when in probabilistic mode. Previously, even if a per-tag `value_limit` override is specified, it would still inherit the same `cache_size_per_key` as the enclosing global/per-metric configuration, which can lead to a higher false positive rate if the per-tag `value_limit` is higher than the enclosing global/per-metric `value_limit`. The field is optional and falls back to the enclosing per-metric or global value when omitted (ignored when `mode` is `exact`).
+
+authors: ArunPiduguDD
diff --git a/src/transforms/tag_cardinality_limit/config.rs b/src/transforms/tag_cardinality_limit/config.rs
index a1471746972fa..9267cdd5f12bd 100644
--- a/src/transforms/tag_cardinality_limit/config.rs
+++ b/src/transforms/tag_cardinality_limit/config.rs
@@ -212,6 +212,10 @@ impl OverrideMode {
 ///   environment:
 ///     mode: limit_override  # track with a per-tag cap
 ///     value_limit: 3
+///   high_cardinality_tag:
+///     mode: limit_override
+///     value_limit: 1000
+///     cache_size_per_key: 102400  # larger bloom filter for this tag
 ///   trace_id:
 ///     mode: excluded        # opt out of tracking entirely
 /// ```
@@ -225,19 +229,24 @@ pub struct PerTagConfig {
 
 /// Mode applied to a specific tag key within a per-metric override.
 ///
-/// The tracking algorithm (`exact`/`probabilistic`), `cache_size_per_key`,
-/// `limit_exceeded_action`, and `internal_metrics` are always inherited from the
-/// enclosing per-metric configuration.
+/// The tracking algorithm (`exact`/`probabilistic`), `limit_exceeded_action`, and
+/// `internal_metrics` are inherited from the enclosing per-metric (or global) configuration.
+/// `cache_size_per_key` may optionally be overridden per tag when probabilistic mode is in use.
 #[configurable_component]
 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 #[serde(tag = "mode", rename_all = "snake_case", deny_unknown_fields)]
 #[configurable(metadata(docs::enum_tag_description = "Controls how this tag key is handled."))]
 pub enum PerTagMode {
-    /// Track this tag with a per-tag value limit. The enclosing per-metric tracking
-    /// algorithm and all other settings still apply.
+    /// Track this tag with a per-tag value limit. All other settings are inherited from
+    /// the enclosing config.
     LimitOverride {
         /// Maximum number of distinct values to accept for this tag key.
         value_limit: usize,
+
+        /// Override the bloom filter cache size for this specific tag key.
+        /// Only effective in `probabilistic` mode. Inherits from the enclosing config when unset.
+        #[serde(default)]
+        cache_size_per_key: Option<usize>,
     },
     /// Opt this tag out of cardinality tracking entirely. All values pass through
     /// without being recorded or checked against any `value_limit`.
diff --git a/src/transforms/tag_cardinality_limit/mod.rs b/src/transforms/tag_cardinality_limit/mod.rs
index 755a23c8d1006..e3b382e1af6a8 100644
--- a/src/transforms/tag_cardinality_limit/mod.rs
+++ b/src/transforms/tag_cardinality_limit/mod.rs
@@ -26,6 +26,17 @@ use crate::event::metric::TagValueSet;
 
 type MetricId = (Option<String>, String);
 
+/// Returns `mode` with its bloom filter `cache_size_per_key` replaced by `override_size`.
+/// Returns `mode` unchanged when it is not probabilistic or when `override_size` is `None`.
+const fn apply_cache_size_override(mode: Mode, override_size: Option<usize>) -> Mode {
+    match (mode, override_size) {
+        (Mode::Probabilistic(_), Some(size)) => Mode::Probabilistic(BloomFilterConfig {
+            cache_size_per_key: size,
+        }),
+        _ => mode,
+    }
+}
+
 /// Outcome of applying tag cardinality tracking to a tag value.
 #[derive(Debug, Eq, PartialEq)]
 enum AcceptResult {
@@ -126,13 +137,14 @@ impl TagCardinalityLimit {
         if let Some(per_tag) = per_metric.per_tag_limits.get(tag_key) {
             match per_tag.mode {
                 PerTagMode::Excluded => return TagSettings::Excluded,
-                PerTagMode::LimitOverride { value_limit } => {
-                    // Tracking algorithm and all other settings are always inherited
-                    // from the per-metric config.
+                PerTagMode::LimitOverride {
+                    value_limit,
+                    cache_size_per_key,
+                } => {
                     return TagSettings::Tracked(Inner {
                         value_limit,
                         limit_exceeded_action,
-                        mode: metric_mode,
+                        mode: apply_cache_size_override(metric_mode, cache_size_per_key),
                         internal_metrics,
                     });
                 }
@@ -152,8 +164,12 @@ impl TagCardinalityLimit {
         let global = self.config.global;
         match self.config.per_tag_limits.get(tag_key).map(|c| c.mode) {
             Some(PerTagMode::Excluded) => TagSettings::Excluded,
-            Some(PerTagMode::LimitOverride { value_limit }) => TagSettings::Tracked(Inner {
+            Some(PerTagMode::LimitOverride {
+                value_limit,
+                cache_size_per_key,
+            }) => TagSettings::Tracked(Inner {
                 value_limit,
+                mode: apply_cache_size_override(global.mode, cache_size_per_key),
                 ..global
             }),
             None => TagSettings::Tracked(global),
diff --git a/src/transforms/tag_cardinality_limit/tests.rs b/src/transforms/tag_cardinality_limit/tests.rs
index b8a453a3c4844..e4b5f246e0755 100644
--- a/src/transforms/tag_cardinality_limit/tests.rs
+++ b/src/transforms/tag_cardinality_limit/tests.rs
@@ -849,7 +849,10 @@ fn max_tracked_keys_caps_across_per_metric_buckets() {
 
 fn make_per_tag(value_limit: usize) -> PerTagConfig {
     PerTagConfig {
-        mode: PerTagMode::LimitOverride { value_limit },
+        mode: PerTagMode::LimitOverride {
+            value_limit,
+            cache_size_per_key: None,
+        },
     }
 }
 
@@ -1127,7 +1130,10 @@ fn tag_excluded_unbounded_sibling_limited() {
 #[test]
 fn per_tag_limit_override_caps_at_explicit_value() {
     let per_tag = PerTagConfig {
-        mode: PerTagMode::LimitOverride { value_limit: 2 },
+        mode: PerTagMode::LimitOverride {
+            value_limit: 2,
+            cache_size_per_key: None,
+        },
     };
     let config = make_transform_hashset_with_per_metric_limits(
         500,
@@ -1210,7 +1216,13 @@ per_metric_limits:
     let per_metric = parsed.per_metric_limits.get("metric_a").unwrap();
 
     let capped = per_metric.per_tag_limits.get("capped_tag").unwrap();
-    assert_eq!(capped.mode, PerTagMode::LimitOverride { value_limit: 10 });
+    assert_eq!(
+        capped.mode,
+        PerTagMode::LimitOverride {
+            value_limit: 10,
+            cache_size_per_key: None,
+        }
+    );
 
     let excluded = per_metric.per_tag_limits.get("excluded_tag").unwrap();
     assert_eq!(excluded.mode, PerTagMode::Excluded);
@@ -1456,8 +1468,134 @@ per_tag_limits:
     let parsed: Config = serde_yaml::from_str(yaml).expect("yaml should deserialize");
 
     let capped = parsed.per_tag_limits.get("capped_tag").unwrap();
-    assert_eq!(capped.mode, PerTagMode::LimitOverride { value_limit: 10 });
+    assert_eq!(
+        capped.mode,
+        PerTagMode::LimitOverride {
+            value_limit: 10,
+            cache_size_per_key: None,
+        }
+    );
 
     let excluded = parsed.per_tag_limits.get("excluded_tag").unwrap();
     assert_eq!(excluded.mode, PerTagMode::Excluded);
 }
+
+// ============================================================================
+// cache_size_per_key override tests
+// ============================================================================
+
+/// `apply_cache_size_override` replaces the bloom size when mode is probabilistic and an
+/// override is given; leaves the mode unchanged in all other cases.
+#[test]
+fn apply_cache_size_override_probabilistic_with_some() {
+    let base = Mode::Probabilistic(BloomFilterConfig {
+        cache_size_per_key: default_cache_size(),
+    });
+    let result = apply_cache_size_override(base, Some(1024));
+    assert_eq!(
+        result,
+        Mode::Probabilistic(BloomFilterConfig {
+            cache_size_per_key: 1024,
+        })
+    );
+}
+
+#[test]
+fn apply_cache_size_override_exact_with_some_is_noop() {
+    let result = apply_cache_size_override(Mode::Exact, Some(1024));
+    assert_eq!(result, Mode::Exact);
+}
+
+#[test]
+fn apply_cache_size_override_probabilistic_with_none_inherits() {
+    let base = Mode::Probabilistic(BloomFilterConfig {
+        cache_size_per_key: default_cache_size(),
+    });
+    let result = apply_cache_size_override(base, None);
+    assert_eq!(
+        result,
+        Mode::Probabilistic(BloomFilterConfig {
+            cache_size_per_key: default_cache_size(),
+        })
+    );
+}
+
+/// A per-tag `limit_override` under `per_metric_limits` deserializes `cache_size_per_key` when set.
+#[test]
+fn per_tag_cache_size_per_key_deserializes() {
+    let yaml = r#"
+value_limit: 5
+mode: probabilistic
+cache_size_per_key: 5120
+per_metric_limits:
+  metric_a:
+    mode: probabilistic
+    cache_size_per_key: 5120
+    per_tag_limits:
+      big_tag:
+        mode: limit_override
+        value_limit: 100
+        cache_size_per_key: 32768
+      default_tag:
+        mode: limit_override
+        value_limit: 10
+"#;
+    let parsed: Config = serde_yaml::from_str(yaml).expect("yaml should deserialize");
+    let per_metric = parsed.per_metric_limits.get("metric_a").unwrap();
+
+    let big_tag = per_metric.per_tag_limits.get("big_tag").unwrap();
+    assert_eq!(
+        big_tag.mode,
+        PerTagMode::LimitOverride {
+            value_limit: 100,
+            cache_size_per_key: Some(32768),
+        }
+    );
+
+    // Omitting the field defaults to None (inherits from enclosing config).
+    let default_tag = per_metric.per_tag_limits.get("default_tag").unwrap();
+    assert_eq!(
+        default_tag.mode,
+        PerTagMode::LimitOverride {
+            value_limit: 10,
+            cache_size_per_key: None,
+        }
+    );
+}
+
+/// A global `per_tag_limits` `limit_override` with `cache_size_per_key` set deserializes correctly.
+#[test]
+fn global_per_tag_cache_size_per_key_deserializes() {
+    let yaml = r#"
+value_limit: 5
+mode: probabilistic
+cache_size_per_key: 5120
+per_tag_limits:
+  big_tag:
+    mode: limit_override
+    value_limit: 100
+    cache_size_per_key: 32768
+  default_tag:
+    mode: limit_override
+    value_limit: 10
+"#;
+    let parsed: Config = serde_yaml::from_str(yaml).expect("yaml should deserialize");
+
+    let big_tag = parsed.per_tag_limits.get("big_tag").unwrap();
+    assert_eq!(
+        big_tag.mode,
+        PerTagMode::LimitOverride {
+            value_limit: 100,
+            cache_size_per_key: Some(32768),
+        }
+    );
+
+    let default_tag = parsed.per_tag_limits.get("default_tag").unwrap();
+    assert_eq!(
+        default_tag.mode,
+        PerTagMode::LimitOverride {
+            value_limit: 10,
+            cache_size_per_key: None,
+        }
+    );
+}
diff --git a/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue b/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue
index fe47c37655f90..06cf8280975a4 100644
--- a/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue
+++ b/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue
@@ -154,6 +154,15 @@ generated: components: transforms: tag_cardinality_limit: configuration: {
 						description: "An individual tag configuration."
 						required:    true
 						type: object: options: {
+							cache_size_per_key: {
+								description: """
+																								Override the bloom filter cache size for this specific tag key.
+																								Only effective in `probabilistic` mode. Inherits from the enclosing config when unset.
+																								"""
+								relevant_when: "mode = \"limit_override\""
+								required:      false
+								type: uint: {}
+							}
 							mode: {
 								description: "Controls how this tag key is handled."
 								required:    true
@@ -163,8 +172,8 @@ generated: components: transforms: tag_cardinality_limit: configuration: {
 																											without being recorded or checked against any `value_limit`.
 																											"""
 									limit_override: """
-																											Track this tag with a per-tag value limit. The enclosing per-metric tracking
-																											algorithm and all other settings still apply.
+																											Track this tag with a per-tag value limit. All other settings are inherited from
+																											the enclosing config.
 																											"""
 								}
 							}
@@ -199,6 +208,15 @@ generated: components: transforms: tag_cardinality_limit: configuration: {
 			description: "An individual tag configuration."
 			required:    true
 			type: object: options: {
+				cache_size_per_key: {
+					description: """
+						Override the bloom filter cache size for this specific tag key.
+						Only effective in `probabilistic` mode. Inherits from the enclosing config when unset.
+						"""
+					relevant_when: "mode = \"limit_override\""
+					required:      false
+					type: uint: {}
+				}
 				mode: {
 					description: "Controls how this tag key is handled."
 					required:    true
@@ -208,8 +226,8 @@ generated: components: transforms: tag_cardinality_limit: configuration: {
 																			without being recorded or checked against any `value_limit`.
 																			"""
 						limit_override: """
-																			Track this tag with a per-tag value limit. The enclosing per-metric tracking
-																			algorithm and all other settings still apply.
+																			Track this tag with a per-tag value limit. All other settings are inherited from
+																			the enclosing config.
 																			"""
 					}
 				}
diff --git a/website/cue/reference/generated/configuration.cue b/website/cue/reference/generated/configuration.cue
index 20792a94ee322..827f14531e353 100644
--- a/website/cue/reference/generated/configuration.cue
+++ b/website/cue/reference/generated/configuration.cue
@@ -2,32 +2,6 @@ package metadata
 
 generated: configuration: {
 	configuration: {
-		api: {
-			type: object: options: {
-				address: {
-					type: string: {
-						default: "127.0.0.1:8686"
-						examples: ["0.0.0.0:8686", "127.0.0.1:1234"]
-					}
-					description: """
-						The network address to which the API should bind. If you're running
-						Vector in a Docker container, bind to `0.0.0.0`. Otherwise
-						the API will not be exposed outside the container.
-						"""
-					common:   true
-					required: false
-				}
-				enabled: {
-					type: bool: default: false
-					description: "Whether the API is enabled for this Vector instance."
-					common:      true
-					required:    false
-				}
-			}
-			description: "API options."
-			warnings: ["The API currently does not support authentication. Only enable it in isolated environments or for debugging. It must not be exposed to untrusted clients."]
-			group: "api"
-		}
 		enrichment_tables: {
 			type: object: options: "*": {
 				type: object: options: {
@@ -158,9 +132,8 @@ generated: configuration: {
 								required: true
 							}
 						}
-						description:   "File-specific settings."
-						required:      true
-						relevant_when: "type = \"file\""
+						description: "File-specific settings."
+						required:    true
 					}
 					schema: {
 						type: object: options: "*": {
@@ -213,175 +186,11 @@ generated: configuration: {
 														[rfc3339]: https://tools.ietf.org/html/rfc3339
 														[chrono_fmt]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html#specifiers
 														"""
-						required:      false
-						relevant_when: "type = \"file\""
-					}
-					flush_interval: {
-						type: uint: {}
-						description: """
-														The interval used for making writes visible in the table.
-														Longer intervals might get better performance,
-														but there is a longer delay before the data is visible in the table.
-														Since every TTL scan makes its changes visible, only use this value
-														if it is shorter than the `scan_interval`.
-
-														By default, all writes are made visible immediately.
-														"""
-						required:      false
-						relevant_when: "type = \"memory\""
-					}
-					internal_metrics: {
-						type: object: options: include_key_tag: {
-							type: bool: default: false
-							description: """
-																		Determines whether to include the key tag on internal metrics.
-
-																		This is useful for distinguishing between different keys while monitoring. However, the tag's
-																		cardinality is unbounded.
-																		"""
-							required: false
-						}
-						description:   "Configuration of internal metrics"
-						required:      false
-						relevant_when: "type = \"memory\""
-					}
-					max_byte_size: {
-						type: uint: {}
-						description: """
-														Maximum size of the table in bytes. All insertions that make
-														this table bigger than the maximum size are rejected.
-
-														By default, there is no size limit.
-														"""
-						required:      false
-						relevant_when: "type = \"memory\""
-					}
-					scan_interval: {
-						type: uint: default: 30
-						description: """
-														The scan interval used to look for expired records. This is provided
-														as an optimization to ensure that TTL is updated, but without doing
-														too many cache scans.
-														"""
-						required:      false
-						relevant_when: "type = \"memory\""
-					}
-					source_config: {
-						type: object: options: {
-							export_batch_size: {
-								type: uint: {}
-								description: """
-																		Batch size for data exporting. Used to prevent exporting entire table at
-																		once and blocking the system.
-
-																		By default, batches are not used and entire table is exported.
-																		"""
-								required: false
-							}
-							export_expired_items: {
-								type: bool: default: false
-								description: """
-																		Set to true to export expired items via the `expired` output port.
-																		Expired items ignore other settings and are exported as they are flushed from the table.
-																		"""
-								required: false
-							}
-							export_interval: {
-								type: uint: {}
-								description: "Interval for exporting all data from the table when used as a source."
-								required:    false
-							}
-							remove_after_export: {
-								type: bool: default: false
-								description: """
-																		If set to true, all data will be removed from cache after exporting.
-																		Only valid if used as a source and export_interval > 0
-
-																		By default, export will not remove data from cache
-																		"""
-								required: false
-							}
-							source_key: {
-								type: string: {}
-								description: """
-																		Key to use for this component when used as a source. This must be different from the
-																		component key.
-																		"""
-								required: true
-							}
-						}
-						description:   "Configuration for source functionality."
-						required:      false
-						relevant_when: "type = \"memory\""
-					}
-					ttl: {
-						type: uint: default: 600
-						description: """
-														TTL (time-to-live in seconds) is used to limit the lifetime of data stored in the cache.
-														When TTL expires, data behind a specific key in the cache is removed.
-														TTL is reset when the key is replaced.
-														"""
-						required:      false
-						relevant_when: "type = \"memory\""
-					}
-					ttl_field: {
-						type: string: default: ""
-						description:   "Field in the incoming value used as the TTL override."
-						required:      false
-						relevant_when: "type = \"memory\""
-					}
-					locale: {
-						type: string: default: "en"
-						description: """
-														The locale to use when querying the database.
-
-														MaxMind includes localized versions of some of the fields within their database, such as
-														country name. This setting can control which of those localized versions are returned by the
-														transform.
-
-														More information on which portions of the geolocation data are localized, and what languages
-														are available, can be found [here][locale_docs].
-
-														[locale_docs]: https://support.maxmind.com/hc/en-us/articles/4414877149467-IP-Geolocation-Data#h_01FRRGRYTGZB29ERDBZCX3MR8Q
-														"""
-						required:      false
-						relevant_when: "type = \"geoip\""
-					}
-					path: {
-						type: string: {}
-						description: """
-														Path to the [MaxMind GeoIP2][geoip2] or [GeoLite2 binary city database file][geolite2]
-														(**GeoLite2-City.mmdb**).
-
-														Other databases, such as the country database, are not supported.
-														`mmdb` enrichment table can be used for other databases.
-
-														[geoip2]: https://dev.maxmind.com/geoip/geoip2/downloadable
-														[geolite2]: https://dev.maxmind.com/geoip/geoip2/geolite2/#Download_Access
-														"""
-						required:      true
-						relevant_when: "type = \"geoip\" or type = \"mmdb\""
+						required: false
 					}
 					type: {
 						required: true
-						type: string: enum: {
-							file: "Exposes data from a static file as an enrichment table."
-							memory: """
-																	Exposes data from a memory cache as an enrichment table. The cache can be written to using
-																	a sink.
-																	"""
-							geoip: """
-																	Exposes data from a [MaxMind][maxmind] [GeoIP2][geoip2] database as an enrichment table.
-
-																	[maxmind]: https://www.maxmind.com/
-																	[geoip2]: https://www.maxmind.com/en/geoip2-databases
-																	"""
-							mmdb: """
-																	Exposes data from a [MaxMind][maxmind] database as an enrichment table.
-
-																	[maxmind]: https://www.maxmind.com/
-																	"""
-						}
+						type: string: enum: file: "Exposes data from a static file as an enrichment table."
 						description: "enrichment table type"
 					}
 				}
@@ -536,253 +345,12 @@ generated: configuration: {
 						required:      false
 						relevant_when: "type = \"exec\""
 					}
-					auth: {
-						type: object: options: {
-							access_key_id: {
-								type: string: examples: ["AKIAIOSFODNN7EXAMPLE"]
-								description: "The AWS access key ID."
-								required:    true
-							}
-							assume_role: {
-								type: string: examples: ["arn:aws:iam::123456789098:role/my_role"]
-								description: """
-																		The ARN of an [IAM role][iam_role] to assume.
-
-																		[iam_role]: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html
-																		"""
-								required: true
-							}
-							external_id: {
-								type: string: examples: ["randomEXAMPLEidString"]
-								description: """
-																		The optional unique external ID in conjunction with role to assume.
-
-																		[external_id]: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-user_externalid.html
-																		"""
-								required: false
-							}
-							region: {
-								type: string: examples: ["us-west-2"]
-								description: """
-																		The [AWS region][aws_region] to send STS requests to.
-
-																		If not set, this defaults to the configured region
-																		for the service itself.
-
-																		[aws_region]: https://docs.aws.amazon.com/general/latest/gr/rande.html#regional-endpoints
-																		"""
-								required: false
-							}
-							secret_access_key: {
-								type: string: examples: ["wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"]
-								description: "The AWS secret access key."
-								required:    true
-							}
-							session_name: {
-								type: string: examples: ["vector-indexer-role"]
-								description: """
-																		The optional [RoleSessionName][role_session_name] is a unique session identifier for your assumed role.
-
-																		Should be unique per principal or reason.
-																		If not set, the session name is autogenerated like assume-role-provider-1736428351340
-
-																		[role_session_name]: https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html
-																		"""
-								required: false
-							}
-							session_token: {
-								type: string: examples: ["AQoDYXdz...AQoDYXdz..."]
-								description: """
-																		The AWS session token.
-																		See [AWS temporary credentials](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_temp_use-resources.html)
-																		"""
-								required: false
-							}
-							credentials_file: {
-								type: string: examples: ["/my/aws/credentials"]
-								description: "Path to the credentials file."
-								required:    true
-							}
-							profile: {
-								type: string: {
-									default: "default"
-									examples: ["develop"]
-								}
-								description: """
-																		The credentials profile to use.
-
-																		Used to select AWS credentials from a provided credentials file.
-																		"""
-								required: false
-							}
-							imds: {
-								type: object: options: {
-									connect_timeout_seconds: {
-										type: uint: {
-											default: 1
-											unit:    "seconds"
-										}
-										description: "Connect timeout for IMDS."
-										required:    false
-									}
-									max_attempts: {
-										type: uint: default: 4
-										description: "Number of IMDS retries for fetching tokens and metadata."
-										required:    false
-									}
-									read_timeout_seconds: {
-										type: uint: {
-											default: 1
-											unit:    "seconds"
-										}
-										description: "Read timeout for IMDS."
-										required:    false
-									}
-								}
-								description: "Configuration for authenticating with AWS through IMDS."
-								required:    false
-							}
-							load_timeout_secs: {
-								type: uint: {
-									examples: [30]
-									unit: "seconds"
-								}
-								description: """
-																		Timeout for successfully loading any credentials, in seconds.
-
-																		Relevant when the default credentials chain or `assume_role` is used.
-																		"""
-								required: false
-							}
-						}
-						description:   "Configuration of the authentication strategy for interacting with AWS services."
-						required:      false
-						relevant_when: "type = \"aws_secrets_manager\""
-					}
-					secret_id: {
-						type: string: {}
-						description:   "ID of the secret to resolve."
-						required:      true
-						relevant_when: "type = \"aws_secrets_manager\""
-					}
-					tls: {
-						type: object: options: {
-							alpn_protocols: {
-								type: array: items: type: string: examples: ["h2"]
-								description: """
-																		Sets the list of supported ALPN protocols.
-
-																		Declare the supported ALPN protocols, which are used during negotiation with a peer. They are prioritized in the order
-																		that they are defined.
-																		"""
-								required: false
-							}
-							ca_file: {
-								type: string: examples: ["/path/to/certificate_authority.crt"]
-								description: """
-																		Absolute path to an additional CA certificate file.
-
-																		The certificate must be in the DER or PEM (X.509) format. Additionally, the certificate can be provided as an inline string in PEM format.
-																		"""
-								required: false
-							}
-							crt_file: {
-								type: string: examples: ["/path/to/host_certificate.crt"]
-								description: """
-																		Absolute path to a certificate file used to identify this server.
-
-																		The certificate must be in DER, PEM (X.509), or PKCS#12 format. Additionally, the certificate can be provided as
-																		an inline string in PEM format.
-
-																		If this is set _and_ is not a PKCS#12 archive, `key_file` must also be set.
-																		"""
-								required: false
-							}
-							key_file: {
-								type: string: examples: ["/path/to/host_certificate.key"]
-								description: """
-																		Absolute path to a private key file used to identify this server.
-
-																		The key must be in DER or PEM (PKCS#8) format. Additionally, the key can be provided as an inline string in PEM format.
-																		"""
-								required: false
-							}
-							key_pass: {
-								type: string: examples: ["${KEY_PASS_ENV_VAR}", "PassWord1"]
-								description: """
-																		Passphrase used to unlock the encrypted key file.
-
-																		This has no effect unless `key_file` is set.
-																		"""
-								required: false
-							}
-							server_name: {
-								type: string: examples: ["www.example.com"]
-								description: """
-																		Server name to use when using Server Name Indication (SNI).
-
-																		Only relevant for outgoing connections.
-																		"""
-								required: false
-							}
-							verify_certificate: {
-								type: bool: {}
-								description: """
-																		Enables certificate verification. For components that create a server, this requires that the
-																		client connections have a valid client certificate. For components that initiate requests,
-																		this validates that the upstream has a valid certificate.
-
-																		If enabled, certificates must not be expired and must be issued by a trusted
-																		issuer. This verification operates in a hierarchical manner, checking that the leaf certificate (the
-																		certificate presented by the client/server) is not only valid, but that the issuer of that certificate is also valid, and
-																		so on, until the verification process reaches a root certificate.
-
-																		Do NOT set this to `false` unless you understand the risks of not verifying the validity of certificates.
-																		"""
-								required: false
-							}
-							verify_hostname: {
-								type: bool: {}
-								description: """
-																		Enables hostname verification.
-
-																		If enabled, the hostname used to connect to the remote host must be present in the TLS certificate presented by
-																		the remote host, either as the Common Name or as an entry in the Subject Alternative Name extension.
-
-																		Only relevant for outgoing connections.
-
-																		Do NOT set this to `false` unless you understand the risks of not verifying the remote hostname.
-																		"""
-								required: false
-							}
-						}
-						description:   "TLS configuration."
-						required:      false
-						relevant_when: "type = \"aws_secrets_manager\""
-					}
-					endpoint: {
-						type: string: examples: ["http://127.0.0.0:5000/path/to/service"]
-						description:   "Custom endpoint for use with AWS-compatible services."
-						required:      false
-						relevant_when: "type = \"aws_secrets_manager\""
-					}
-					region: {
-						type: string: examples: ["us-east-1"]
-						description: """
-														The [AWS region][aws_region] of the target service.
-
-														[aws_region]: https://docs.aws.amazon.com/general/latest/gr/rande.html#regional-endpoints
-														"""
-						required:      false
-						relevant_when: "type = \"aws_secrets_manager\""
-					}
 					type: {
 						required: true
 						type: string: enum: {
-							file:                "File."
-							directory:           "Directory."
-							exec:                "Exec."
-							aws_secrets_manager: "AWS Secrets Manager."
+							file:      "File."
+							directory: "Directory."
+							exec:      "Exec."
 						}
 						description: "secret type"
 					}

From 657934e9c8010665499e212902f6737dbfea2749 Mon Sep 17 00:00:00 2001
From: ArunPiduguDD <arun.pidugu@datadoghq.com>
Date: Thu, 18 Jun 2026 03:27:40 +0000
Subject: [PATCH 2/2] chore: regenerate component docs

---
 ...dinality_per_tag_cache_size.enhancement.md |   2 +-
 src/transforms/tag_cardinality_limit/tests.rs |   4 -
 .../cue/reference/generated/configuration.cue | 446 +++++++++++++++++-
 3 files changed, 440 insertions(+), 12 deletions(-)

diff --git a/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md b/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md
index 65547bd0f350d..bb2f135a6399c 100644
--- a/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md
+++ b/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md
@@ -1,3 +1,3 @@
-Adds support for specifying a `cache_size_per_key` in per-tag override configuration options when in probabilistic mode. Previously, even if a per-tag `value_limit` override is specified, it would still inherit the same `cache_size_per_key` as the enclosing global/per-metric configuration, which can lead to a higher false positive rate if the per-tag `value_limit` is higher than the enclosing global/per-metric `value_limit`. The field is optional and falls back to the enclosing per-metric or global value when omitted (ignored when `mode` is `exact`).
+Adds support for specifying a `cache_size_per_key` in per-tag override configuration options when in probabilistic mode. Previously, even when a per-tag `value_limit` override was specified, the tag still inherited the `cache_size_per_key` of the enclosing global/per-metric configuration, which could lead to a higher false positive rate if the per-tag `value_limit` was higher than the enclosing global/per-metric `value_limit`. The field is optional and falls back to the enclosing per-metric or global cache size when omitted (it is ignored when `mode` is `exact`).
 
 authors: ArunPiduguDD
diff --git a/src/transforms/tag_cardinality_limit/tests.rs b/src/transforms/tag_cardinality_limit/tests.rs
index e4b5f246e0755..55fa3fe153902 100644
--- a/src/transforms/tag_cardinality_limit/tests.rs
+++ b/src/transforms/tag_cardinality_limit/tests.rs
@@ -1480,10 +1480,6 @@ per_tag_limits:
     assert_eq!(excluded.mode, PerTagMode::Excluded);
 }
 
-// ============================================================================
-// cache_size_per_key override tests
-// ============================================================================
-
 /// `apply_cache_size_override` replaces the bloom size when mode is probabilistic and an
 /// override is given; leaves the mode unchanged in all other cases.
 #[test]
diff --git a/website/cue/reference/generated/configuration.cue b/website/cue/reference/generated/configuration.cue
index 827f14531e353..20792a94ee322 100644
--- a/website/cue/reference/generated/configuration.cue
+++ b/website/cue/reference/generated/configuration.cue
@@ -2,6 +2,32 @@ package metadata
 
 generated: configuration: {
 	configuration: {
+		api: {
+			type: object: options: {
+				address: {
+					type: string: {
+						default: "127.0.0.1:8686"
+						examples: ["0.0.0.0:8686", "127.0.0.1:1234"]
+					}
+					description: """
+						The network address to which the API should bind. If you're running
+						Vector in a Docker container, bind to `0.0.0.0`. Otherwise
+						the API will not be exposed outside the container.
+						"""
+					common:   true
+					required: false
+				}
+				enabled: {
+					type: bool: default: false
+					description: "Whether the API is enabled for this Vector instance."
+					common:      true
+					required:    false
+				}
+			}
+			description: "API options."
+			warnings: ["The API currently does not support authentication. Only enable it in isolated environments or for debugging. It must not be exposed to untrusted clients."]
+			group: "api"
+		}
 		enrichment_tables: {
 			type: object: options: "*": {
 				type: object: options: {
@@ -132,8 +158,9 @@ generated: configuration: {
 								required: true
 							}
 						}
-						description: "File-specific settings."
-						required:    true
+						description:   "File-specific settings."
+						required:      true
+						relevant_when: "type = \"file\""
 					}
 					schema: {
 						type: object: options: "*": {
@@ -186,11 +213,175 @@ generated: configuration: {
 														[rfc3339]: https://tools.ietf.org/html/rfc3339
 														[chrono_fmt]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html#specifiers
 														"""
-						required: false
+						required:      false
+						relevant_when: "type = \"file\""
+					}
+					flush_interval: {
+						type: uint: {}
+						description: """
+														The interval used for making writes visible in the table.
+														Longer intervals might get better performance,
+														but there is a longer delay before the data is visible in the table.
+														Since every TTL scan makes its changes visible, only use this value
+														if it is shorter than the `scan_interval`.
+
+														By default, all writes are made visible immediately.
+														"""
+						required:      false
+						relevant_when: "type = \"memory\""
+					}
+					internal_metrics: {
+						type: object: options: include_key_tag: {
+							type: bool: default: false
+							description: """
+																		Determines whether to include the key tag on internal metrics.
+
+																		This is useful for distinguishing between different keys while monitoring. However, the tag's
+																		cardinality is unbounded.
+																		"""
+							required: false
+						}
+						description:   "Configuration of internal metrics"
+						required:      false
+						relevant_when: "type = \"memory\""
+					}
+					max_byte_size: {
+						type: uint: {}
+						description: """
+														Maximum size of the table in bytes. All insertions that make
+														this table bigger than the maximum size are rejected.
+
+														By default, there is no size limit.
+														"""
+						required:      false
+						relevant_when: "type = \"memory\""
+					}
+					scan_interval: {
+						type: uint: default: 30
+						description: """
+														The scan interval used to look for expired records. This is provided
+														as an optimization to ensure that TTL is updated, but without doing
+														too many cache scans.
+														"""
+						required:      false
+						relevant_when: "type = \"memory\""
+					}
+					source_config: {
+						type: object: options: {
+							export_batch_size: {
+								type: uint: {}
+								description: """
+																		Batch size for data exporting. Used to prevent exporting entire table at
+																		once and blocking the system.
+
+																		By default, batches are not used and entire table is exported.
+																		"""
+								required: false
+							}
+							export_expired_items: {
+								type: bool: default: false
+								description: """
+																		Set to true to export expired items via the `expired` output port.
+																		Expired items ignore other settings and are exported as they are flushed from the table.
+																		"""
+								required: false
+							}
+							export_interval: {
+								type: uint: {}
+								description: "Interval for exporting all data from the table when used as a source."
+								required:    false
+							}
+							remove_after_export: {
+								type: bool: default: false
+								description: """
+																		If set to true, all data will be removed from cache after exporting.
+																		Only valid if used as a source and export_interval > 0
+
+																		By default, export will not remove data from cache
+																		"""
+								required: false
+							}
+							source_key: {
+								type: string: {}
+								description: """
+																		Key to use for this component when used as a source. This must be different from the
+																		component key.
+																		"""
+								required: true
+							}
+						}
+						description:   "Configuration for source functionality."
+						required:      false
+						relevant_when: "type = \"memory\""
+					}
+					ttl: {
+						type: uint: default: 600
+						description: """
+														TTL (time-to-live in seconds) is used to limit the lifetime of data stored in the cache.
+														When TTL expires, data behind a specific key in the cache is removed.
+														TTL is reset when the key is replaced.
+														"""
+						required:      false
+						relevant_when: "type = \"memory\""
+					}
+					ttl_field: {
+						type: string: default: ""
+						description:   "Field in the incoming value used as the TTL override."
+						required:      false
+						relevant_when: "type = \"memory\""
+					}
+					locale: {
+						type: string: default: "en"
+						description: """
+														The locale to use when querying the database.
+
+														MaxMind includes localized versions of some of the fields within their database, such as
+														country name. This setting can control which of those localized versions are returned by the
+														transform.
+
+														More information on which portions of the geolocation data are localized, and what languages
+														are available, can be found [here][locale_docs].
+
+														[locale_docs]: https://support.maxmind.com/hc/en-us/articles/4414877149467-IP-Geolocation-Data#h_01FRRGRYTGZB29ERDBZCX3MR8Q
+														"""
+						required:      false
+						relevant_when: "type = \"geoip\""
+					}
+					path: {
+						type: string: {}
+						description: """
+														Path to the [MaxMind GeoIP2][geoip2] or [GeoLite2 binary city database file][geolite2]
+														(**GeoLite2-City.mmdb**).
+
+														Other databases, such as the country database, are not supported.
+														`mmdb` enrichment table can be used for other databases.
+
+														[geoip2]: https://dev.maxmind.com/geoip/geoip2/downloadable
+														[geolite2]: https://dev.maxmind.com/geoip/geoip2/geolite2/#Download_Access
+														"""
+						required:      true
+						relevant_when: "type = \"geoip\" or type = \"mmdb\""
 					}
 					type: {
 						required: true
-						type: string: enum: file: "Exposes data from a static file as an enrichment table."
+						type: string: enum: {
+							file: "Exposes data from a static file as an enrichment table."
+							memory: """
+																	Exposes data from a memory cache as an enrichment table. The cache can be written to using
+																	a sink.
+																	"""
+							geoip: """
+																	Exposes data from a [MaxMind][maxmind] [GeoIP2][geoip2] database as an enrichment table.
+
+																	[maxmind]: https://www.maxmind.com/
+																	[geoip2]: https://www.maxmind.com/en/geoip2-databases
+																	"""
+							mmdb: """
+																	Exposes data from a [MaxMind][maxmind] database as an enrichment table.
+
+																	[maxmind]: https://www.maxmind.com/
+																	"""
+						}
 						description: "enrichment table type"
 					}
 				}
@@ -345,12 +536,253 @@ generated: configuration: {
 						required:      false
 						relevant_when: "type = \"exec\""
 					}
+					auth: {
+						type: object: options: {
+							access_key_id: {
+								type: string: examples: ["AKIAIOSFODNN7EXAMPLE"]
+								description: "The AWS access key ID."
+								required:    true
+							}
+							assume_role: {
+								type: string: examples: ["arn:aws:iam::123456789098:role/my_role"]
+								description: """
+																		The ARN of an [IAM role][iam_role] to assume.
+
+																		[iam_role]: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html
+																		"""
+								required: true
+							}
+							external_id: {
+								type: string: examples: ["randomEXAMPLEidString"]
+								description: """
+																		The optional unique external ID in conjunction with role to assume.
+
+																		[external_id]: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-user_externalid.html
+																		"""
+								required: false
+							}
+							region: {
+								type: string: examples: ["us-west-2"]
+								description: """
+																		The [AWS region][aws_region] to send STS requests to.
+
+																		If not set, this defaults to the configured region
+																		for the service itself.
+
+																		[aws_region]: https://docs.aws.amazon.com/general/latest/gr/rande.html#regional-endpoints
+																		"""
+								required: false
+							}
+							secret_access_key: {
+								type: string: examples: ["wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"]
+								description: "The AWS secret access key."
+								required:    true
+							}
+							session_name: {
+								type: string: examples: ["vector-indexer-role"]
+								description: """
+																		The optional [RoleSessionName][role_session_name] is a unique session identifier for your assumed role.
+
+																		Should be unique per principal or reason.
+																		If not set, the session name is autogenerated like assume-role-provider-1736428351340
+
+																		[role_session_name]: https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html
+																		"""
+								required: false
+							}
+							session_token: {
+								type: string: examples: ["AQoDYXdz...AQoDYXdz..."]
+								description: """
+																		The AWS session token.
+																		See [AWS temporary credentials](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_temp_use-resources.html)
+																		"""
+								required: false
+							}
+							credentials_file: {
+								type: string: examples: ["/my/aws/credentials"]
+								description: "Path to the credentials file."
+								required:    true
+							}
+							profile: {
+								type: string: {
+									default: "default"
+									examples: ["develop"]
+								}
+								description: """
+																		The credentials profile to use.
+
+																		Used to select AWS credentials from a provided credentials file.
+																		"""
+								required: false
+							}
+							imds: {
+								type: object: options: {
+									connect_timeout_seconds: {
+										type: uint: {
+											default: 1
+											unit:    "seconds"
+										}
+										description: "Connect timeout for IMDS."
+										required:    false
+									}
+									max_attempts: {
+										type: uint: default: 4
+										description: "Number of IMDS retries for fetching tokens and metadata."
+										required:    false
+									}
+									read_timeout_seconds: {
+										type: uint: {
+											default: 1
+											unit:    "seconds"
+										}
+										description: "Read timeout for IMDS."
+										required:    false
+									}
+								}
+								description: "Configuration for authenticating with AWS through IMDS."
+								required:    false
+							}
+							load_timeout_secs: {
+								type: uint: {
+									examples: [30]
+									unit: "seconds"
+								}
+								description: """
+																		Timeout for successfully loading any credentials, in seconds.
+
+																		Relevant when the default credentials chain or `assume_role` is used.
+																		"""
+								required: false
+							}
+						}
+						description:   "Configuration of the authentication strategy for interacting with AWS services."
+						required:      false
+						relevant_when: "type = \"aws_secrets_manager\""
+					}
+					secret_id: {
+						type: string: {}
+						description:   "ID of the secret to resolve."
+						required:      true
+						relevant_when: "type = \"aws_secrets_manager\""
+					}
+					tls: {
+						type: object: options: {
+							alpn_protocols: {
+								type: array: items: type: string: examples: ["h2"]
+								description: """
+																		Sets the list of supported ALPN protocols.
+
+																		Declare the supported ALPN protocols, which are used during negotiation with a peer. They are prioritized in the order
+																		that they are defined.
+																		"""
+								required: false
+							}
+							ca_file: {
+								type: string: examples: ["/path/to/certificate_authority.crt"]
+								description: """
+																		Absolute path to an additional CA certificate file.
+
+																		The certificate must be in the DER or PEM (X.509) format. Additionally, the certificate can be provided as an inline string in PEM format.
+																		"""
+								required: false
+							}
+							crt_file: {
+								type: string: examples: ["/path/to/host_certificate.crt"]
+								description: """
+																		Absolute path to a certificate file used to identify this server.
+
+																		The certificate must be in DER, PEM (X.509), or PKCS#12 format. Additionally, the certificate can be provided as
+																		an inline string in PEM format.
+
+																		If this is set _and_ is not a PKCS#12 archive, `key_file` must also be set.
+																		"""
+								required: false
+							}
+							key_file: {
+								type: string: examples: ["/path/to/host_certificate.key"]
+								description: """
+																		Absolute path to a private key file used to identify this server.
+
+																		The key must be in DER or PEM (PKCS#8) format. Additionally, the key can be provided as an inline string in PEM format.
+																		"""
+								required: false
+							}
+							key_pass: {
+								type: string: examples: ["${KEY_PASS_ENV_VAR}", "PassWord1"]
+								description: """
+																		Passphrase used to unlock the encrypted key file.
+
+																		This has no effect unless `key_file` is set.
+																		"""
+								required: false
+							}
+							server_name: {
+								type: string: examples: ["www.example.com"]
+								description: """
+																		Server name to use when using Server Name Indication (SNI).
+
+																		Only relevant for outgoing connections.
+																		"""
+								required: false
+							}
+							verify_certificate: {
+								type: bool: {}
+								description: """
+																		Enables certificate verification. For components that create a server, this requires that the
+																		client connections have a valid client certificate. For components that initiate requests,
+																		this validates that the upstream has a valid certificate.
+
+																		If enabled, certificates must not be expired and must be issued by a trusted
+																		issuer. This verification operates in a hierarchical manner, checking that the leaf certificate (the
+																		certificate presented by the client/server) is not only valid, but that the issuer of that certificate is also valid, and
+																		so on, until the verification process reaches a root certificate.
+
+																		Do NOT set this to `false` unless you understand the risks of not verifying the validity of certificates.
+																		"""
+								required: false
+							}
+							verify_hostname: {
+								type: bool: {}
+								description: """
+																		Enables hostname verification.
+
+																		If enabled, the hostname used to connect to the remote host must be present in the TLS certificate presented by
+																		the remote host, either as the Common Name or as an entry in the Subject Alternative Name extension.
+
+																		Only relevant for outgoing connections.
+
+																		Do NOT set this to `false` unless you understand the risks of not verifying the remote hostname.
+																		"""
+								required: false
+							}
+						}
+						description:   "TLS configuration."
+						required:      false
+						relevant_when: "type = \"aws_secrets_manager\""
+					}
+					endpoint: {
+						type: string: examples: ["http://127.0.0.0:5000/path/to/service"]
+						description:   "Custom endpoint for use with AWS-compatible services."
+						required:      false
+						relevant_when: "type = \"aws_secrets_manager\""
+					}
+					region: {
+						type: string: examples: ["us-east-1"]
+						description: """
+														The [AWS region][aws_region] of the target service.
+
+														[aws_region]: https://docs.aws.amazon.com/general/latest/gr/rande.html#regional-endpoints
+														"""
+						required:      false
+						relevant_when: "type = \"aws_secrets_manager\""
+					}
 					type: {
 						required: true
 						type: string: enum: {
-							file:      "File."
-							directory: "Directory."
-							exec:      "Exec."
+							file:                "File."
+							directory:           "Directory."
+							exec:                "Exec."
+							aws_secrets_manager: "AWS Secrets Manager."
 						}
 						description: "secret type"
 					}