From 5908557aba7ded1835f11f0b0860253220eb4060 Mon Sep 17 00:00:00 2001 From: ArunPiduguDD Date: Thu, 18 Jun 2026 01:34:10 +0000 Subject: [PATCH 1/2] feat(tag_cardinality_limit): add per-tag cache_size_per_key override in probabilistic mode --- ...dinality_per_tag_cache_size.enhancement.md | 3 + .../tag_cardinality_limit/config.rs | 19 +- src/transforms/tag_cardinality_limit/mod.rs | 26 +- src/transforms/tag_cardinality_limit/tests.rs | 146 +++++- .../generated/tag_cardinality_limit.cue | 26 +- .../cue/reference/generated/configuration.cue | 446 +----------------- 6 files changed, 209 insertions(+), 457 deletions(-) create mode 100644 changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md diff --git a/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md b/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md new file mode 100644 index 0000000000000..65547bd0f350d --- /dev/null +++ b/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md @@ -0,0 +1,3 @@ +Adds support for specifying a `cache_size_per_key` in per-tag override configuration options when in probabilistic mode. Previously, even if a per-tag `value_limit` override is specified, it would still inherit the same `cache_size_per_key` as the enclosing global/per-metric configuration, which can lead to a higher false positive rate if the per-tag `value_limit` is higher than the enclosing global/per-metric `value_limit`. The field is optional and falls back to the enclosing per-metric or global value when omitted (ignored when `mode` is `exact`). + +authors: ArunPiduguDD diff --git a/src/transforms/tag_cardinality_limit/config.rs b/src/transforms/tag_cardinality_limit/config.rs index a1471746972fa..9267cdd5f12bd 100644 --- a/src/transforms/tag_cardinality_limit/config.rs +++ b/src/transforms/tag_cardinality_limit/config.rs @@ -212,6 +212,10 @@ impl OverrideMode { /// environment: /// mode: limit_override # track with a per-tag cap /// value_limit: 3 +/// high_cardinality_tag: +/// mode: limit_override +/// value_limit: 1000 +/// cache_size_per_key: 102400 # larger bloom filter for this tag /// trace_id: /// mode: excluded # opt out of tracking entirely /// ``` @@ -225,19 +229,24 @@ pub struct PerTagConfig { /// Mode applied to a specific tag key within a per-metric override. /// -/// The tracking algorithm (`exact`/`probabilistic`), `cache_size_per_key`, -/// `limit_exceeded_action`, and `internal_metrics` are always inherited from the -/// enclosing per-metric configuration. +/// The tracking algorithm (`exact`/`probabilistic`), `limit_exceeded_action`, and +/// `internal_metrics` are inherited from the enclosing per-metric (or global) configuration. +/// `cache_size_per_key` may optionally be overridden per tag when probabilistic mode is in use. #[configurable_component] #[derive(Clone, Copy, Debug, Eq, PartialEq)] #[serde(tag = "mode", rename_all = "snake_case", deny_unknown_fields)] #[configurable(metadata(docs::enum_tag_description = "Controls how this tag key is handled."))] pub enum PerTagMode { - /// Track this tag with a per-tag value limit. The enclosing per-metric tracking - /// algorithm and all other settings still apply. + /// Track this tag with a per-tag value limit. All other settings are inherited from + /// the enclosing config. LimitOverride { /// Maximum number of distinct values to accept for this tag key. value_limit: usize, + + /// Override the bloom filter cache size for this specific tag key. + /// Only effective in `probabilistic` mode. Inherits from the enclosing config when unset. + #[serde(default)] + cache_size_per_key: Option, }, /// Opt this tag out of cardinality tracking entirely. All values pass through /// without being recorded or checked against any `value_limit`. diff --git a/src/transforms/tag_cardinality_limit/mod.rs b/src/transforms/tag_cardinality_limit/mod.rs index 755a23c8d1006..e3b382e1af6a8 100644 --- a/src/transforms/tag_cardinality_limit/mod.rs +++ b/src/transforms/tag_cardinality_limit/mod.rs @@ -26,6 +26,17 @@ use crate::event::metric::TagValueSet; type MetricId = (Option, String); +/// Applies a per-tag `cache_size_per_key` override to a `Mode`. No-op in exact mode or when +/// `override_size` is `None`. +const fn apply_cache_size_override(mode: Mode, override_size: Option) -> Mode { + match (mode, override_size) { + (Mode::Probabilistic(_), Some(size)) => Mode::Probabilistic(BloomFilterConfig { + cache_size_per_key: size, + }), + _ => mode, + } +} + /// Outcome of applying tag cardinality tracking to a tag value. #[derive(Debug, Eq, PartialEq)] enum AcceptResult { @@ -126,13 +137,14 @@ impl TagCardinalityLimit { if let Some(per_tag) = per_metric.per_tag_limits.get(tag_key) { match per_tag.mode { PerTagMode::Excluded => return TagSettings::Excluded, - PerTagMode::LimitOverride { value_limit } => { - // Tracking algorithm and all other settings are always inherited - // from the per-metric config. + PerTagMode::LimitOverride { + value_limit, + cache_size_per_key, + } => { return TagSettings::Tracked(Inner { value_limit, limit_exceeded_action, - mode: metric_mode, + mode: apply_cache_size_override(metric_mode, cache_size_per_key), internal_metrics, }); } @@ -152,8 +164,12 @@ impl TagCardinalityLimit { let global = self.config.global; match self.config.per_tag_limits.get(tag_key).map(|c| c.mode) { Some(PerTagMode::Excluded) => TagSettings::Excluded, - Some(PerTagMode::LimitOverride { value_limit }) => TagSettings::Tracked(Inner { + Some(PerTagMode::LimitOverride { + value_limit, + cache_size_per_key, + }) => TagSettings::Tracked(Inner { value_limit, + mode: apply_cache_size_override(global.mode, cache_size_per_key), ..global }), None => TagSettings::Tracked(global), diff --git a/src/transforms/tag_cardinality_limit/tests.rs b/src/transforms/tag_cardinality_limit/tests.rs index b8a453a3c4844..e4b5f246e0755 100644 --- a/src/transforms/tag_cardinality_limit/tests.rs +++ b/src/transforms/tag_cardinality_limit/tests.rs @@ -849,7 +849,10 @@ fn max_tracked_keys_caps_across_per_metric_buckets() { fn make_per_tag(value_limit: usize) -> PerTagConfig { PerTagConfig { - mode: PerTagMode::LimitOverride { value_limit }, + mode: PerTagMode::LimitOverride { + value_limit, + cache_size_per_key: None, + }, } } @@ -1127,7 +1130,10 @@ fn tag_excluded_unbounded_sibling_limited() { #[test] fn per_tag_limit_override_caps_at_explicit_value() { let per_tag = PerTagConfig { - mode: PerTagMode::LimitOverride { value_limit: 2 }, + mode: PerTagMode::LimitOverride { + value_limit: 2, + cache_size_per_key: None, + }, }; let config = make_transform_hashset_with_per_metric_limits( 500, @@ -1210,7 +1216,13 @@ per_metric_limits: let per_metric = parsed.per_metric_limits.get("metric_a").unwrap(); let capped = per_metric.per_tag_limits.get("capped_tag").unwrap(); - assert_eq!(capped.mode, PerTagMode::LimitOverride { value_limit: 10 }); + assert_eq!( + capped.mode, + PerTagMode::LimitOverride { + value_limit: 10, + cache_size_per_key: None, + } + ); let excluded = per_metric.per_tag_limits.get("excluded_tag").unwrap(); assert_eq!(excluded.mode, PerTagMode::Excluded); @@ -1456,8 +1468,134 @@ per_tag_limits: let parsed: Config = serde_yaml::from_str(yaml).expect("yaml should deserialize"); let capped = parsed.per_tag_limits.get("capped_tag").unwrap(); - assert_eq!(capped.mode, PerTagMode::LimitOverride { value_limit: 10 }); + assert_eq!( + capped.mode, + PerTagMode::LimitOverride { + value_limit: 10, + cache_size_per_key: None, + } + ); let excluded = parsed.per_tag_limits.get("excluded_tag").unwrap(); assert_eq!(excluded.mode, PerTagMode::Excluded); } + +// ============================================================================ +// cache_size_per_key override tests +// ============================================================================ + +/// `apply_cache_size_override` replaces the bloom size when mode is probabilistic and an +/// override is given; leaves the mode unchanged in all other cases. +#[test] +fn apply_cache_size_override_probabilistic_with_some() { + let base = Mode::Probabilistic(BloomFilterConfig { + cache_size_per_key: default_cache_size(), + }); + let result = apply_cache_size_override(base, Some(1024)); + assert_eq!( + result, + Mode::Probabilistic(BloomFilterConfig { + cache_size_per_key: 1024, + }) + ); +} + +#[test] +fn apply_cache_size_override_exact_with_some_is_noop() { + let result = apply_cache_size_override(Mode::Exact, Some(1024)); + assert_eq!(result, Mode::Exact); +} + +#[test] +fn apply_cache_size_override_probabilistic_with_none_inherits() { + let base = Mode::Probabilistic(BloomFilterConfig { + cache_size_per_key: default_cache_size(), + }); + let result = apply_cache_size_override(base, None); + assert_eq!( + result, + Mode::Probabilistic(BloomFilterConfig { + cache_size_per_key: default_cache_size(), + }) + ); +} + +/// A per-metric `limit_override` with `cache_size_per_key` set deserializes correctly. +#[test] +fn per_tag_cache_size_per_key_deserializes() { + let yaml = r#" +value_limit: 5 +mode: probabilistic +cache_size_per_key: 5120 +per_metric_limits: + metric_a: + mode: probabilistic + cache_size_per_key: 5120 + per_tag_limits: + big_tag: + mode: limit_override + value_limit: 100 + cache_size_per_key: 32768 + default_tag: + mode: limit_override + value_limit: 10 +"#; + let parsed: Config = serde_yaml::from_str(yaml).expect("yaml should deserialize"); + let per_metric = parsed.per_metric_limits.get("metric_a").unwrap(); + + let big_tag = per_metric.per_tag_limits.get("big_tag").unwrap(); + assert_eq!( + big_tag.mode, + PerTagMode::LimitOverride { + value_limit: 100, + cache_size_per_key: Some(32768), + } + ); + + // Omitting the field defaults to None (inherits from enclosing config). + let default_tag = per_metric.per_tag_limits.get("default_tag").unwrap(); + assert_eq!( + default_tag.mode, + PerTagMode::LimitOverride { + value_limit: 10, + cache_size_per_key: None, + } + ); +} + +/// A global `per_tag_limits` `limit_override` with `cache_size_per_key` set deserializes correctly. +#[test] +fn global_per_tag_cache_size_per_key_deserializes() { + let yaml = r#" +value_limit: 5 +mode: probabilistic +cache_size_per_key: 5120 +per_tag_limits: + big_tag: + mode: limit_override + value_limit: 100 + cache_size_per_key: 32768 + default_tag: + mode: limit_override + value_limit: 10 +"#; + let parsed: Config = serde_yaml::from_str(yaml).expect("yaml should deserialize"); + + let big_tag = parsed.per_tag_limits.get("big_tag").unwrap(); + assert_eq!( + big_tag.mode, + PerTagMode::LimitOverride { + value_limit: 100, + cache_size_per_key: Some(32768), + } + ); + + let default_tag = parsed.per_tag_limits.get("default_tag").unwrap(); + assert_eq!( + default_tag.mode, + PerTagMode::LimitOverride { + value_limit: 10, + cache_size_per_key: None, + } + ); +} diff --git a/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue b/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue index fe47c37655f90..06cf8280975a4 100644 --- a/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue +++ b/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue @@ -154,6 +154,15 @@ generated: components: transforms: tag_cardinality_limit: configuration: { description: "An individual tag configuration." required: true type: object: options: { + cache_size_per_key: { + description: """ + Override the bloom filter cache size for this specific tag key. + Only effective in `probabilistic` mode. Inherits from the enclosing config when unset. + """ + relevant_when: "mode = \"limit_override\"" + required: false + type: uint: {} + } mode: { description: "Controls how this tag key is handled." required: true @@ -163,8 +172,8 @@ generated: components: transforms: tag_cardinality_limit: configuration: { without being recorded or checked against any `value_limit`. """ limit_override: """ - Track this tag with a per-tag value limit. The enclosing per-metric tracking - algorithm and all other settings still apply. + Track this tag with a per-tag value limit. All other settings are inherited from + the enclosing config. """ } } @@ -199,6 +208,15 @@ generated: components: transforms: tag_cardinality_limit: configuration: { description: "An individual tag configuration." required: true type: object: options: { + cache_size_per_key: { + description: """ + Override the bloom filter cache size for this specific tag key. + Only effective in `probabilistic` mode. Inherits from the enclosing config when unset. + """ + relevant_when: "mode = \"limit_override\"" + required: false + type: uint: {} + } mode: { description: "Controls how this tag key is handled." required: true @@ -208,8 +226,8 @@ generated: components: transforms: tag_cardinality_limit: configuration: { without being recorded or checked against any `value_limit`. """ limit_override: """ - Track this tag with a per-tag value limit. The enclosing per-metric tracking - algorithm and all other settings still apply. + Track this tag with a per-tag value limit. All other settings are inherited from + the enclosing config. """ } } diff --git a/website/cue/reference/generated/configuration.cue b/website/cue/reference/generated/configuration.cue index 20792a94ee322..827f14531e353 100644 --- a/website/cue/reference/generated/configuration.cue +++ b/website/cue/reference/generated/configuration.cue @@ -2,32 +2,6 @@ package metadata generated: configuration: { configuration: { - api: { - type: object: options: { - address: { - type: string: { - default: "127.0.0.1:8686" - examples: ["0.0.0.0:8686", "127.0.0.1:1234"] - } - description: """ - The network address to which the API should bind. If you're running - Vector in a Docker container, bind to `0.0.0.0`. Otherwise - the API will not be exposed outside the container. - """ - common: true - required: false - } - enabled: { - type: bool: default: false - description: "Whether the API is enabled for this Vector instance." - common: true - required: false - } - } - description: "API options." - warnings: ["The API currently does not support authentication. Only enable it in isolated environments or for debugging. It must not be exposed to untrusted clients."] - group: "api" - } enrichment_tables: { type: object: options: "*": { type: object: options: { @@ -158,9 +132,8 @@ generated: configuration: { required: true } } - description: "File-specific settings." - required: true - relevant_when: "type = \"file\"" + description: "File-specific settings." + required: true } schema: { type: object: options: "*": { @@ -213,175 +186,11 @@ generated: configuration: { [rfc3339]: https://tools.ietf.org/html/rfc3339 [chrono_fmt]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html#specifiers """ - required: false - relevant_when: "type = \"file\"" - } - flush_interval: { - type: uint: {} - description: """ - The interval used for making writes visible in the table. - Longer intervals might get better performance, - but there is a longer delay before the data is visible in the table. - Since every TTL scan makes its changes visible, only use this value - if it is shorter than the `scan_interval`. - - By default, all writes are made visible immediately. - """ - required: false - relevant_when: "type = \"memory\"" - } - internal_metrics: { - type: object: options: include_key_tag: { - type: bool: default: false - description: """ - Determines whether to include the key tag on internal metrics. - - This is useful for distinguishing between different keys while monitoring. However, the tag's - cardinality is unbounded. - """ - required: false - } - description: "Configuration of internal metrics" - required: false - relevant_when: "type = \"memory\"" - } - max_byte_size: { - type: uint: {} - description: """ - Maximum size of the table in bytes. All insertions that make - this table bigger than the maximum size are rejected. - - By default, there is no size limit. - """ - required: false - relevant_when: "type = \"memory\"" - } - scan_interval: { - type: uint: default: 30 - description: """ - The scan interval used to look for expired records. This is provided - as an optimization to ensure that TTL is updated, but without doing - too many cache scans. - """ - required: false - relevant_when: "type = \"memory\"" - } - source_config: { - type: object: options: { - export_batch_size: { - type: uint: {} - description: """ - Batch size for data exporting. Used to prevent exporting entire table at - once and blocking the system. - - By default, batches are not used and entire table is exported. - """ - required: false - } - export_expired_items: { - type: bool: default: false - description: """ - Set to true to export expired items via the `expired` output port. - Expired items ignore other settings and are exported as they are flushed from the table. - """ - required: false - } - export_interval: { - type: uint: {} - description: "Interval for exporting all data from the table when used as a source." - required: false - } - remove_after_export: { - type: bool: default: false - description: """ - If set to true, all data will be removed from cache after exporting. - Only valid if used as a source and export_interval > 0 - - By default, export will not remove data from cache - """ - required: false - } - source_key: { - type: string: {} - description: """ - Key to use for this component when used as a source. This must be different from the - component key. - """ - required: true - } - } - description: "Configuration for source functionality." - required: false - relevant_when: "type = \"memory\"" - } - ttl: { - type: uint: default: 600 - description: """ - TTL (time-to-live in seconds) is used to limit the lifetime of data stored in the cache. - When TTL expires, data behind a specific key in the cache is removed. - TTL is reset when the key is replaced. - """ - required: false - relevant_when: "type = \"memory\"" - } - ttl_field: { - type: string: default: "" - description: "Field in the incoming value used as the TTL override." - required: false - relevant_when: "type = \"memory\"" - } - locale: { - type: string: default: "en" - description: """ - The locale to use when querying the database. - - MaxMind includes localized versions of some of the fields within their database, such as - country name. This setting can control which of those localized versions are returned by the - transform. - - More information on which portions of the geolocation data are localized, and what languages - are available, can be found [here][locale_docs]. - - [locale_docs]: https://support.maxmind.com/hc/en-us/articles/4414877149467-IP-Geolocation-Data#h_01FRRGRYTGZB29ERDBZCX3MR8Q - """ - required: false - relevant_when: "type = \"geoip\"" - } - path: { - type: string: {} - description: """ - Path to the [MaxMind GeoIP2][geoip2] or [GeoLite2 binary city database file][geolite2] - (**GeoLite2-City.mmdb**). - - Other databases, such as the country database, are not supported. - `mmdb` enrichment table can be used for other databases. - - [geoip2]: https://dev.maxmind.com/geoip/geoip2/downloadable - [geolite2]: https://dev.maxmind.com/geoip/geoip2/geolite2/#Download_Access - """ - required: true - relevant_when: "type = \"geoip\" or type = \"mmdb\"" + required: false } type: { required: true - type: string: enum: { - file: "Exposes data from a static file as an enrichment table." - memory: """ - Exposes data from a memory cache as an enrichment table. The cache can be written to using - a sink. - """ - geoip: """ - Exposes data from a [MaxMind][maxmind] [GeoIP2][geoip2] database as an enrichment table. - - [maxmind]: https://www.maxmind.com/ - [geoip2]: https://www.maxmind.com/en/geoip2-databases - """ - mmdb: """ - Exposes data from a [MaxMind][maxmind] database as an enrichment table. - - [maxmind]: https://www.maxmind.com/ - """ - } + type: string: enum: file: "Exposes data from a static file as an enrichment table." description: "enrichment table type" } } @@ -536,253 +345,12 @@ generated: configuration: { required: false relevant_when: "type = \"exec\"" } - auth: { - type: object: options: { - access_key_id: { - type: string: examples: ["AKIAIOSFODNN7EXAMPLE"] - description: "The AWS access key ID." - required: true - } - assume_role: { - type: string: examples: ["arn:aws:iam::123456789098:role/my_role"] - description: """ - The ARN of an [IAM role][iam_role] to assume. - - [iam_role]: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html - """ - required: true - } - external_id: { - type: string: examples: ["randomEXAMPLEidString"] - description: """ - The optional unique external ID in conjunction with role to assume. - - [external_id]: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-user_externalid.html - """ - required: false - } - region: { - type: string: examples: ["us-west-2"] - description: """ - The [AWS region][aws_region] to send STS requests to. - - If not set, this defaults to the configured region - for the service itself. - - [aws_region]: https://docs.aws.amazon.com/general/latest/gr/rande.html#regional-endpoints - """ - required: false - } - secret_access_key: { - type: string: examples: ["wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"] - description: "The AWS secret access key." - required: true - } - session_name: { - type: string: examples: ["vector-indexer-role"] - description: """ - The optional [RoleSessionName][role_session_name] is a unique session identifier for your assumed role. - - Should be unique per principal or reason. - If not set, the session name is autogenerated like assume-role-provider-1736428351340 - - [role_session_name]: https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html - """ - required: false - } - session_token: { - type: string: examples: ["AQoDYXdz...AQoDYXdz..."] - description: """ - The AWS session token. - See [AWS temporary credentials](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_temp_use-resources.html) - """ - required: false - } - credentials_file: { - type: string: examples: ["/my/aws/credentials"] - description: "Path to the credentials file." - required: true - } - profile: { - type: string: { - default: "default" - examples: ["develop"] - } - description: """ - The credentials profile to use. - - Used to select AWS credentials from a provided credentials file. - """ - required: false - } - imds: { - type: object: options: { - connect_timeout_seconds: { - type: uint: { - default: 1 - unit: "seconds" - } - description: "Connect timeout for IMDS." - required: false - } - max_attempts: { - type: uint: default: 4 - description: "Number of IMDS retries for fetching tokens and metadata." - required: false - } - read_timeout_seconds: { - type: uint: { - default: 1 - unit: "seconds" - } - description: "Read timeout for IMDS." - required: false - } - } - description: "Configuration for authenticating with AWS through IMDS." - required: false - } - load_timeout_secs: { - type: uint: { - examples: [30] - unit: "seconds" - } - description: """ - Timeout for successfully loading any credentials, in seconds. - - Relevant when the default credentials chain or `assume_role` is used. - """ - required: false - } - } - description: "Configuration of the authentication strategy for interacting with AWS services." - required: false - relevant_when: "type = \"aws_secrets_manager\"" - } - secret_id: { - type: string: {} - description: "ID of the secret to resolve." - required: true - relevant_when: "type = \"aws_secrets_manager\"" - } - tls: { - type: object: options: { - alpn_protocols: { - type: array: items: type: string: examples: ["h2"] - description: """ - Sets the list of supported ALPN protocols. - - Declare the supported ALPN protocols, which are used during negotiation with a peer. They are prioritized in the order - that they are defined. - """ - required: false - } - ca_file: { - type: string: examples: ["/path/to/certificate_authority.crt"] - description: """ - Absolute path to an additional CA certificate file. - - The certificate must be in the DER or PEM (X.509) format. Additionally, the certificate can be provided as an inline string in PEM format. - """ - required: false - } - crt_file: { - type: string: examples: ["/path/to/host_certificate.crt"] - description: """ - Absolute path to a certificate file used to identify this server. - - The certificate must be in DER, PEM (X.509), or PKCS#12 format. Additionally, the certificate can be provided as - an inline string in PEM format. - - If this is set _and_ is not a PKCS#12 archive, `key_file` must also be set. - """ - required: false - } - key_file: { - type: string: examples: ["/path/to/host_certificate.key"] - description: """ - Absolute path to a private key file used to identify this server. - - The key must be in DER or PEM (PKCS#8) format. Additionally, the key can be provided as an inline string in PEM format. - """ - required: false - } - key_pass: { - type: string: examples: ["${KEY_PASS_ENV_VAR}", "PassWord1"] - description: """ - Passphrase used to unlock the encrypted key file. - - This has no effect unless `key_file` is set. - """ - required: false - } - server_name: { - type: string: examples: ["www.example.com"] - description: """ - Server name to use when using Server Name Indication (SNI). - - Only relevant for outgoing connections. - """ - required: false - } - verify_certificate: { - type: bool: {} - description: """ - Enables certificate verification. For components that create a server, this requires that the - client connections have a valid client certificate. For components that initiate requests, - this validates that the upstream has a valid certificate. - - If enabled, certificates must not be expired and must be issued by a trusted - issuer. This verification operates in a hierarchical manner, checking that the leaf certificate (the - certificate presented by the client/server) is not only valid, but that the issuer of that certificate is also valid, and - so on, until the verification process reaches a root certificate. - - Do NOT set this to `false` unless you understand the risks of not verifying the validity of certificates. - """ - required: false - } - verify_hostname: { - type: bool: {} - description: """ - Enables hostname verification. - - If enabled, the hostname used to connect to the remote host must be present in the TLS certificate presented by - the remote host, either as the Common Name or as an entry in the Subject Alternative Name extension. - - Only relevant for outgoing connections. - - Do NOT set this to `false` unless you understand the risks of not verifying the remote hostname. - """ - required: false - } - } - description: "TLS configuration." - required: false - relevant_when: "type = \"aws_secrets_manager\"" - } - endpoint: { - type: string: examples: ["http://127.0.0.0:5000/path/to/service"] - description: "Custom endpoint for use with AWS-compatible services." - required: false - relevant_when: "type = \"aws_secrets_manager\"" - } - region: { - type: string: examples: ["us-east-1"] - description: """ - The [AWS region][aws_region] of the target service. - - [aws_region]: https://docs.aws.amazon.com/general/latest/gr/rande.html#regional-endpoints - """ - required: false - relevant_when: "type = \"aws_secrets_manager\"" - } type: { required: true type: string: enum: { - file: "File." - directory: "Directory." - exec: "Exec." - aws_secrets_manager: "AWS Secrets Manager." + file: "File." + directory: "Directory." + exec: "Exec." } description: "secret type" } From 657934e9c8010665499e212902f6737dbfea2749 Mon Sep 17 00:00:00 2001 From: ArunPiduguDD Date: Thu, 18 Jun 2026 03:27:40 +0000 Subject: [PATCH 2/2] chore: regenerate component docs --- ...dinality_per_tag_cache_size.enhancement.md | 2 +- src/transforms/tag_cardinality_limit/tests.rs | 4 - .../cue/reference/generated/configuration.cue | 446 +++++++++++++++++- 3 files changed, 440 insertions(+), 12 deletions(-) diff --git a/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md b/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md index 65547bd0f350d..bb2f135a6399c 100644 --- a/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md +++ b/changelog.d/tag_cardinality_per_tag_cache_size.enhancement.md @@ -1,3 +1,3 @@ -Adds support for specifying a `cache_size_per_key` in per-tag override configuration options when in probabilistic mode. Previously, even if a per-tag `value_limit` override is specified, it would still inherit the same `cache_size_per_key` as the enclosing global/per-metric configuration, which can lead to a higher false positive rate if the per-tag `value_limit` is higher than the enclosing global/per-metric `value_limit`. The field is optional and falls back to the enclosing per-metric or global value when omitted (ignored when `mode` is `exact`). +Adds support for specifying a `cache_size_per_key` in per-tag override configuration options when in probabilistic mode. Previously, even if a per-tag `value_limit` override is specified, it would still inherit the same `cache_size_per_key` as the enclosing global/per-metric configuration, which can lead to a higher false positive rate if the per-tag `value_limit` is higher than the enclosing global/per-metric `value_limit`. The field is optional and falls back to the enclosing per-metric or global cache size when omitted (ignored when `mode` is `exact`). authors: ArunPiduguDD diff --git a/src/transforms/tag_cardinality_limit/tests.rs b/src/transforms/tag_cardinality_limit/tests.rs index e4b5f246e0755..55fa3fe153902 100644 --- a/src/transforms/tag_cardinality_limit/tests.rs +++ b/src/transforms/tag_cardinality_limit/tests.rs @@ -1480,10 +1480,6 @@ per_tag_limits: assert_eq!(excluded.mode, PerTagMode::Excluded); } -// ============================================================================ -// cache_size_per_key override tests -// ============================================================================ - /// `apply_cache_size_override` replaces the bloom size when mode is probabilistic and an /// override is given; leaves the mode unchanged in all other cases. #[test] diff --git a/website/cue/reference/generated/configuration.cue b/website/cue/reference/generated/configuration.cue index 827f14531e353..20792a94ee322 100644 --- a/website/cue/reference/generated/configuration.cue +++ b/website/cue/reference/generated/configuration.cue @@ -2,6 +2,32 @@ package metadata generated: configuration: { configuration: { + api: { + type: object: options: { + address: { + type: string: { + default: "127.0.0.1:8686" + examples: ["0.0.0.0:8686", "127.0.0.1:1234"] + } + description: """ + The network address to which the API should bind. If you're running + Vector in a Docker container, bind to `0.0.0.0`. Otherwise + the API will not be exposed outside the container. + """ + common: true + required: false + } + enabled: { + type: bool: default: false + description: "Whether the API is enabled for this Vector instance." + common: true + required: false + } + } + description: "API options." + warnings: ["The API currently does not support authentication. Only enable it in isolated environments or for debugging. It must not be exposed to untrusted clients."] + group: "api" + } enrichment_tables: { type: object: options: "*": { type: object: options: { @@ -132,8 +158,9 @@ generated: configuration: { required: true } } - description: "File-specific settings." - required: true + description: "File-specific settings." + required: true + relevant_when: "type = \"file\"" } schema: { type: object: options: "*": { @@ -186,11 +213,175 @@ generated: configuration: { [rfc3339]: https://tools.ietf.org/html/rfc3339 [chrono_fmt]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html#specifiers """ - required: false + required: false + relevant_when: "type = \"file\"" + } + flush_interval: { + type: uint: {} + description: """ + The interval used for making writes visible in the table. + Longer intervals might get better performance, + but there is a longer delay before the data is visible in the table. + Since every TTL scan makes its changes visible, only use this value + if it is shorter than the `scan_interval`. + + By default, all writes are made visible immediately. + """ + required: false + relevant_when: "type = \"memory\"" + } + internal_metrics: { + type: object: options: include_key_tag: { + type: bool: default: false + description: """ + Determines whether to include the key tag on internal metrics. + + This is useful for distinguishing between different keys while monitoring. However, the tag's + cardinality is unbounded. + """ + required: false + } + description: "Configuration of internal metrics" + required: false + relevant_when: "type = \"memory\"" + } + max_byte_size: { + type: uint: {} + description: """ + Maximum size of the table in bytes. All insertions that make + this table bigger than the maximum size are rejected. + + By default, there is no size limit. + """ + required: false + relevant_when: "type = \"memory\"" + } + scan_interval: { + type: uint: default: 30 + description: """ + The scan interval used to look for expired records. This is provided + as an optimization to ensure that TTL is updated, but without doing + too many cache scans. + """ + required: false + relevant_when: "type = \"memory\"" + } + source_config: { + type: object: options: { + export_batch_size: { + type: uint: {} + description: """ + Batch size for data exporting. Used to prevent exporting entire table at + once and blocking the system. + + By default, batches are not used and entire table is exported. + """ + required: false + } + export_expired_items: { + type: bool: default: false + description: """ + Set to true to export expired items via the `expired` output port. + Expired items ignore other settings and are exported as they are flushed from the table. + """ + required: false + } + export_interval: { + type: uint: {} + description: "Interval for exporting all data from the table when used as a source." + required: false + } + remove_after_export: { + type: bool: default: false + description: """ + If set to true, all data will be removed from cache after exporting. + Only valid if used as a source and export_interval > 0 + + By default, export will not remove data from cache + """ + required: false + } + source_key: { + type: string: {} + description: """ + Key to use for this component when used as a source. This must be different from the + component key. + """ + required: true + } + } + description: "Configuration for source functionality." + required: false + relevant_when: "type = \"memory\"" + } + ttl: { + type: uint: default: 600 + description: """ + TTL (time-to-live in seconds) is used to limit the lifetime of data stored in the cache. + When TTL expires, data behind a specific key in the cache is removed. + TTL is reset when the key is replaced. + """ + required: false + relevant_when: "type = \"memory\"" + } + ttl_field: { + type: string: default: "" + description: "Field in the incoming value used as the TTL override." + required: false + relevant_when: "type = \"memory\"" + } + locale: { + type: string: default: "en" + description: """ + The locale to use when querying the database. + + MaxMind includes localized versions of some of the fields within their database, such as + country name. This setting can control which of those localized versions are returned by the + transform. + + More information on which portions of the geolocation data are localized, and what languages + are available, can be found [here][locale_docs]. + + [locale_docs]: https://support.maxmind.com/hc/en-us/articles/4414877149467-IP-Geolocation-Data#h_01FRRGRYTGZB29ERDBZCX3MR8Q + """ + required: false + relevant_when: "type = \"geoip\"" + } + path: { + type: string: {} + description: """ + Path to the [MaxMind GeoIP2][geoip2] or [GeoLite2 binary city database file][geolite2] + (**GeoLite2-City.mmdb**). + + Other databases, such as the country database, are not supported. + `mmdb` enrichment table can be used for other databases. + + [geoip2]: https://dev.maxmind.com/geoip/geoip2/downloadable + [geolite2]: https://dev.maxmind.com/geoip/geoip2/geolite2/#Download_Access + """ + required: true + relevant_when: "type = \"geoip\" or type = \"mmdb\"" } type: { required: true - type: string: enum: file: "Exposes data from a static file as an enrichment table." + type: string: enum: { + file: "Exposes data from a static file as an enrichment table." + memory: """ + Exposes data from a memory cache as an enrichment table. The cache can be written to using + a sink. + """ + geoip: """ + Exposes data from a [MaxMind][maxmind] [GeoIP2][geoip2] database as an enrichment table. + + [maxmind]: https://www.maxmind.com/ + [geoip2]: https://www.maxmind.com/en/geoip2-databases + """ + mmdb: """ + Exposes data from a [MaxMind][maxmind] database as an enrichment table. + + [maxmind]: https://www.maxmind.com/ + """ + } description: "enrichment table type" } } @@ -345,12 +536,253 @@ generated: configuration: { required: false relevant_when: "type = \"exec\"" } + auth: { + type: object: options: { + access_key_id: { + type: string: examples: ["AKIAIOSFODNN7EXAMPLE"] + description: "The AWS access key ID." + required: true + } + assume_role: { + type: string: examples: ["arn:aws:iam::123456789098:role/my_role"] + description: """ + The ARN of an [IAM role][iam_role] to assume. + + [iam_role]: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html + """ + required: true + } + external_id: { + type: string: examples: ["randomEXAMPLEidString"] + description: """ + The optional unique external ID in conjunction with role to assume. + + [external_id]: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-user_externalid.html + """ + required: false + } + region: { + type: string: examples: ["us-west-2"] + description: """ + The [AWS region][aws_region] to send STS requests to. + + If not set, this defaults to the configured region + for the service itself. + + [aws_region]: https://docs.aws.amazon.com/general/latest/gr/rande.html#regional-endpoints + """ + required: false + } + secret_access_key: { + type: string: examples: ["wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"] + description: "The AWS secret access key." + required: true + } + session_name: { + type: string: examples: ["vector-indexer-role"] + description: """ + The optional [RoleSessionName][role_session_name] is a unique session identifier for your assumed role. + + Should be unique per principal or reason. + If not set, the session name is autogenerated like assume-role-provider-1736428351340 + + [role_session_name]: https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html + """ + required: false + } + session_token: { + type: string: examples: ["AQoDYXdz...AQoDYXdz..."] + description: """ + The AWS session token. + See [AWS temporary credentials](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_temp_use-resources.html) + """ + required: false + } + credentials_file: { + type: string: examples: ["/my/aws/credentials"] + description: "Path to the credentials file." + required: true + } + profile: { + type: string: { + default: "default" + examples: ["develop"] + } + description: """ + The credentials profile to use. + + Used to select AWS credentials from a provided credentials file. + """ + required: false + } + imds: { + type: object: options: { + connect_timeout_seconds: { + type: uint: { + default: 1 + unit: "seconds" + } + description: "Connect timeout for IMDS." + required: false + } + max_attempts: { + type: uint: default: 4 + description: "Number of IMDS retries for fetching tokens and metadata." + required: false + } + read_timeout_seconds: { + type: uint: { + default: 1 + unit: "seconds" + } + description: "Read timeout for IMDS." + required: false + } + } + description: "Configuration for authenticating with AWS through IMDS." + required: false + } + load_timeout_secs: { + type: uint: { + examples: [30] + unit: "seconds" + } + description: """ + Timeout for successfully loading any credentials, in seconds. + + Relevant when the default credentials chain or `assume_role` is used. + """ + required: false + } + } + description: "Configuration of the authentication strategy for interacting with AWS services." + required: false + relevant_when: "type = \"aws_secrets_manager\"" + } + secret_id: { + type: string: {} + description: "ID of the secret to resolve." + required: true + relevant_when: "type = \"aws_secrets_manager\"" + } + tls: { + type: object: options: { + alpn_protocols: { + type: array: items: type: string: examples: ["h2"] + description: """ + Sets the list of supported ALPN protocols. + + Declare the supported ALPN protocols, which are used during negotiation with a peer. They are prioritized in the order + that they are defined. + """ + required: false + } + ca_file: { + type: string: examples: ["/path/to/certificate_authority.crt"] + description: """ + Absolute path to an additional CA certificate file. + + The certificate must be in the DER or PEM (X.509) format. Additionally, the certificate can be provided as an inline string in PEM format. + """ + required: false + } + crt_file: { + type: string: examples: ["/path/to/host_certificate.crt"] + description: """ + Absolute path to a certificate file used to identify this server. + + The certificate must be in DER, PEM (X.509), or PKCS#12 format. Additionally, the certificate can be provided as + an inline string in PEM format. + + If this is set _and_ is not a PKCS#12 archive, `key_file` must also be set. + """ + required: false + } + key_file: { + type: string: examples: ["/path/to/host_certificate.key"] + description: """ + Absolute path to a private key file used to identify this server. + + The key must be in DER or PEM (PKCS#8) format. Additionally, the key can be provided as an inline string in PEM format. + """ + required: false + } + key_pass: { + type: string: examples: ["${KEY_PASS_ENV_VAR}", "PassWord1"] + description: """ + Passphrase used to unlock the encrypted key file. + + This has no effect unless `key_file` is set. + """ + required: false + } + server_name: { + type: string: examples: ["www.example.com"] + description: """ + Server name to use when using Server Name Indication (SNI). + + Only relevant for outgoing connections. + """ + required: false + } + verify_certificate: { + type: bool: {} + description: """ + Enables certificate verification. For components that create a server, this requires that the + client connections have a valid client certificate. For components that initiate requests, + this validates that the upstream has a valid certificate. + + If enabled, certificates must not be expired and must be issued by a trusted + issuer. This verification operates in a hierarchical manner, checking that the leaf certificate (the + certificate presented by the client/server) is not only valid, but that the issuer of that certificate is also valid, and + so on, until the verification process reaches a root certificate. + + Do NOT set this to `false` unless you understand the risks of not verifying the validity of certificates. + """ + required: false + } + verify_hostname: { + type: bool: {} + description: """ + Enables hostname verification. + + If enabled, the hostname used to connect to the remote host must be present in the TLS certificate presented by + the remote host, either as the Common Name or as an entry in the Subject Alternative Name extension. + + Only relevant for outgoing connections. + + Do NOT set this to `false` unless you understand the risks of not verifying the remote hostname. + """ + required: false + } + } + description: "TLS configuration." + required: false + relevant_when: "type = \"aws_secrets_manager\"" + } + endpoint: { + type: string: examples: ["http://127.0.0.0:5000/path/to/service"] + description: "Custom endpoint for use with AWS-compatible services." + required: false + relevant_when: "type = \"aws_secrets_manager\"" + } + region: { + type: string: examples: ["us-east-1"] + description: """ + The [AWS region][aws_region] of the target service. + + [aws_region]: https://docs.aws.amazon.com/general/latest/gr/rande.html#regional-endpoints + """ + required: false + relevant_when: "type = \"aws_secrets_manager\"" + } type: { required: true type: string: enum: { - file: "File." - directory: "Directory." - exec: "Exec." + file: "File." + directory: "Directory." + exec: "Exec." + aws_secrets_manager: "AWS Secrets Manager." } description: "secret type" }