Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
271 changes: 161 additions & 110 deletions ansible/inventories/devnet-0/group_vars/all/all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -297,132 +297,183 @@ docker_nginx_proxy_wildcard_cert: "{{ network_server_subdomain }}"
docker_nginx_proxy_wildcard_cert_url: "http://cert.{{ network_server_subdomain }}/{{ network_server_subdomain }}-latest.tar.enc"
docker_nginx_proxy_wildcard_cert_psk: "{{ secret_cert_encryption_psk }}"

# role: otelcol_contrib (local role)
# Reuses secret_loki credentials (same vmauth backend serves both ingresses).
# OTLP egress to the prod analytics gateway (shared by Vector's logs + traces sinks).
# Ingress identity (auth username + ingress_user tag) is the current devnet name
# rather than secret_loki.username: the sops username drifts stale between devnet
# iterations, while the gateway only validates the (constant) password — so using
# the network name keeps log/trace attribution correct no matter what sops holds.
otlp_endpoint: "https://otlp.analytics.production.platform.ethpandaops.io"
otlp_deployment_env: production

otelcol_contrib_container_networks: "{{ docker_networks_shared }}"

# Vector kept alongside otelcol just to ship logs to Loki. Will be removed
# when Loki path is replaced (e.g. central aggregator or Loki OTLP support).
# Shared docker network so the clients can reach Vector's OTLP listeners and
# Vector can read the other containers' logs.
vector_container_networks: "{{ docker_networks_shared }}"
vector_config: |
# Docker container logs (clean per-container metadata straight from the Docker API)
[sources.in]
type = "docker_logs"
exclude_containers = [
"{{ vector_container_name }}",
"otelcol",
"ethereum-metrics-exporter",
"nginx-proxy",
"node_exporter",
"prometheus",
"snooper-",
]

[sinks.loki]
type = "loki"
inputs = ["in"]
out_of_order_action = "accept"
labels.forwarder = "vector"
labels.instance = "{{ inventory_hostname }}"
labels.network = "{{ ethereum_network_name }}"
labels.testnet = "{{ ethereum_network_name }}"
labels.ingress_user = "{{ secret_loki.username }}"
labels.container_name = "{{ '{{ container_name }}' }}"
{%- if ethereum_node_el is defined +%}
labels.ethereum_el = "{{ ethereum_node_el }}"
{%- endif +%}
{%- if ethereum_node_cl is defined +%}
labels.ethereum_cl = "{{ ethereum_node_cl }}"
{%- endif +%}
encoding.codec = "json"
endpoint = "{{ secret_loki.endpoint }}"
auth.strategy = "basic"
auth.user = "{{ secret_loki.username }}"
auth.password = "{{ secret_loki.password }}"
otelcol_contrib_config: |
extensions:
basicauth/client:
client_auth:
username: {{ secret_loki.username }}
password: {{ secret_loki.password }}

receivers:
filelog:
include: [/var/lib/docker/containers/*/*-json.log]
include_file_path: true
start_at: end
operators:
- type: container
format: docker
add_metadata_from_filepath: true
- type: filter
expr: '(attributes["container.name"] != nil and attributes["container.name"] matches "^(otelcol|ethereum-metrics-exporter|nginx-proxy|node_exporter|prometheus|snooper-.*)$") or body matches "github\\.com/open-telemetry/opentelemetry-collector-contrib|otelcol-contrib"'
- type: json_parser
if: 'body matches "^\\s*\\{"'
on_error: send
severity:
parse_from: attributes.level
overwrite_text: true
mapping:
fatal4: [emergency, emerg]
fatal3: [alert]
fatal2: [critical, crit]
fatal: [panic]

otlp:
protocols:
grpc: {endpoint: "[::]:4317"}
http: {endpoint: "[::]:4318"}

processors:
resource:
attributes:
- {key: deployment.environment, value: "{{ otlp_deployment_env }}", action: upsert}
- {key: forwarder, value: "otelcol", action: upsert}
- {key: network, value: "{{ ethereum_network_name }}", action: upsert}
- {key: testnet, value: "{{ ethereum_network_name }}", action: upsert}
- {key: instance, value: "{{ inventory_hostname }}", action: upsert}
- {key: ingress_user, value: "{{ secret_loki.username }}", action: upsert}
- {key: host.name, value: "{{ inventory_hostname }}", action: upsert}
{%- if ethereum_node_cl is defined +%}
- {key: ethereum_cl, value: "{{ ethereum_node_cl }}", action: upsert}
{%- endif +%}
{%- if ethereum_node_el is defined +%}
- {key: ethereum_el, value: "{{ ethereum_node_el }}", action: upsert}
{%- endif +%}
# OTLP traces pushed by the clients (beacon -> :4317 grpc, geth -> :4318 http).
# use_otlp_decoding preserves the resourceSpans envelope so the sink forwards
# natively with the client's batching intact (1 request in = 1 request out).
[sources.otlp_in]
type = "opentelemetry"
use_otlp_decoding = true

transform/service_name:
log_statements:
- context: resource
statements:
- set(attributes["service.name"], attributes["container.name"]) where attributes["container.name"] != nil
[sources.otlp_in.grpc]
address = "[::]:4317"

batch:
send_batch_size: 500
timeout: 5s

exporters:
otlphttp/staging:
endpoint: "{{ otlp_endpoint }}"
auth:
authenticator: basicauth/client

# Dual-write traces to existing Tempo via gRPC. Tempo's URL embeds a
# tenant UUID — no separate auth needed.
otlp/tempo:
endpoint: "{{ tempo_grpcs_url | regex_replace('^grpcs?://', '') }}"

service:
extensions: [basicauth/client]
pipelines:
logs:
receivers: [filelog, otlp]
processors: [resource, transform/service_name, batch]
exporters: [otlphttp/staging]
traces:
receivers: [otlp]
processors: [resource, batch]
exporters: [otlphttp/staging, otlp/tempo]
[sources.otlp_in.http]
address = "[::]:4318"

# Shape docker_logs events into an OTLP resourceLogs envelope with full metadata.
# Each input event from "in" is replaced by an object with a single
# resource_log field: the resource attributes plus one log record.
[transforms.otel_shape]
type = "remap"
inputs = ["in"]
source = '''
# Record time as a nanosecond string: start from the current time and
# override it with the event's own .timestamp when that is a timestamp.
ts_ns = to_string(to_unix_timestamp(now(), unit: "nanoseconds"))
if is_timestamp(.timestamp) {
ts_ns = to_string(to_unix_timestamp!(.timestamp, unit: "nanoseconds"))
}
# Coerce the event fields used below to strings, substituting a default
# whenever the coercion fails.
msg = string(.message) ?? ""
ctr = string(.container_name) ?? "unknown"
img = string(.image) ?? ""
strm = string(.stream) ?? "stdout"

# Extract the source log level: JSON-structured lines (.level / .severity)
# first, then logfmt (level=xxx), then a level token near the start of a
# plain-text line. Client formats vary (lighthouse truncates to 4 chars:
# DEBG/ERRO/CRIT), so the text matcher allows a union of forms. SeverityText
# keeps the source's exact text; only SeverityNumber is normalised to the
# OTel scale (https://opentelemetry.io/docs/specs/otel/logs/data-model/#field-severitynumber).
# Unrecognised lines are left unset rather than guessed.
sevtext = ""
if starts_with(msg, "{") {
j = parse_json(msg) ?? {}
if is_string(j.level) {
sevtext = string!(j.level)
} else if is_string(j.severity) {
sevtext = string!(j.severity)
}
}
if sevtext == "" {
lm = parse_regex(msg, r'(?i)\b(?:level|lvl|severity)="?(?P<l>[a-z]+)') ?? {}
if is_string(lm.l) {
sevtext = string!(lm.l)
} else {
hm = parse_regex(truncate(msg, 48), r'(?i)\b(?P<l>TRACE|DEBUG|DEBG|DBUG|NOTICE|INFO|WARNING|WARN|ERROR|ERRO|CRITICAL|CRIT|FATAL|PANIC)\b') ?? {}
if is_string(hm.l) {
sevtext = string!(hm.l)
}
}
}
# Map the upper-cased level text to a severity number; 0 is kept when no
# level was recognised. "INFOR" is not in the plain-text token list above,
# so it can only arrive via the JSON or logfmt paths.
lvl = upcase(sevtext)
sevnum = 0
if lvl == "TRACE" {
sevnum = 1
} else if lvl == "DEBUG" || lvl == "DEBG" || lvl == "DBUG" {
sevnum = 5
} else if lvl == "INFO" || lvl == "INFOR" || lvl == "NOTICE" {
sevnum = 9
} else if lvl == "WARN" || lvl == "WARNING" {
sevnum = 13
} else if lvl == "ERROR" || lvl == "ERRO" {
sevnum = 17
} else if lvl == "CRIT" || lvl == "CRITICAL" {
sevnum = 18
} else if lvl == "FATAL" || lvl == "PANIC" {
sevnum = 21
}

# Resource attributes: container identity taken from the event, plus
# deployment / network / host values rendered into the config by the
# template. ethereum_cl and ethereum_el are appended afterwards only when
# the corresponding template variable is defined.
attrs = [
{"key": "service.name", "value": {"stringValue": ctr}},
{"key": "container.name", "value": {"stringValue": ctr}},
{"key": "container.image.name", "value": {"stringValue": img}},
{"key": "deployment.environment", "value": {"stringValue": "{{ otlp_deployment_env }}"}},
{"key": "forwarder", "value": {"stringValue": "vector"}},
{"key": "ingress_user", "value": {"stringValue": "{{ ethereum_network_name }}"}},
{"key": "network", "value": {"stringValue": "{{ ethereum_network_name }}"}},
{"key": "testnet", "value": {"stringValue": "{{ ethereum_network_name }}"}},
{"key": "instance", "value": {"stringValue": "{{ inventory_hostname }}"}},
{"key": "host.name", "value": {"stringValue": "{{ inventory_hostname }}"}}
]
{%- if ethereum_node_cl is defined +%}
attrs = push(attrs, {"key": "ethereum_cl", "value": {"stringValue": "{{ ethereum_node_cl }}"}})
{%- endif +%}
{%- if ethereum_node_el is defined +%}
attrs = push(attrs, {"key": "ethereum_el", "value": {"stringValue": "{{ ethereum_node_el }}"}})
{%- endif +%}
# Replace the whole event with one resource_log entry carrying a single
# log record; the container stream name is the record's only attribute.
. = {
"resource_log": {
"resource": {"attributes": attrs},
"scopeLogs": [{
"scope": {"name": "blob-devnets-vector"},
"logRecords": [{
"timeUnixNano": ts_ns,
"severityNumber": sevnum,
"severityText": sevtext,
"body": {"stringValue": msg},
"attributes": [{"key": "stream", "value": {"stringValue": strm}}]
}]
}]
}
}
'''

# Batch many shaped log events into one OTLP envelope (reduce works on logs).
# The "array" merge strategy gathers the resource_log value of every merged
# event into one list, which finalize_envelope then reads as .resource_log.
# NOTE(review): the three limits below bound how long and how many events a
# batch accumulates before it is emitted — confirm the exact flush semantics
# against the Vector reduce transform documentation.
[transforms.batch_envelope]
type = "reduce"
inputs = ["otel_shape"]
expire_after_ms = 30000
end_every_period_ms = 5000
max_events = 500
merge_strategies.resource_log = "array"

# Turn each batched event into the final envelope: the list collected under
# .resource_log becomes the top-level resourceLogs key and every other field
# on the event is discarded.
[transforms.finalize_envelope]
type = "remap"
inputs = ["batch_envelope"]
source = '''
. = {"resourceLogs": .resource_log}
'''

# Logs sink: POSTs each finalized envelope to the /v1/logs path of the
# otlp_endpoint gateway over HTTP. Basic auth uses the network name as the
# user and the secret_loki password.
[sinks.otlp_logs]
type = "opentelemetry"
inputs = ["finalize_envelope"]
[sinks.otlp_logs.protocol]
type = "http"
uri = "{{ otlp_endpoint }}/v1/logs"
method = "post"
encoding.codec = "otlp"
auth.strategy = "basic"
auth.user = "{{ ethereum_network_name }}"
auth.password = "{{ secret_loki.password }}"
# One event here is already a full OTLP envelope (built by the reduce above).
# max_events MUST be 1 — OTLP/HTTP allows one envelope per request.
batch.max_events = 1
batch.timeout_secs = 5

# Traces sink: forwards the traces output of the otlp_in source to the
# /v1/traces path of the otlp_endpoint gateway over HTTP, with the same
# basic-auth identity as the logs sink (network name + secret_loki password).
[sinks.otlp_traces]
type = "opentelemetry"
inputs = ["otlp_in.traces"]
[sinks.otlp_traces.protocol]
type = "http"
uri = "{{ otlp_endpoint }}/v1/traces"
method = "post"
encoding.codec = "otlp"
auth.strategy = "basic"
auth.user = "{{ ethereum_network_name }}"
auth.password = "{{ secret_loki.password }}"
# use_otlp_decoding => one event already carries the client's full span batch.
batch.max_events = 1
batch.timeout_secs = 5
# Edge telemetry consolidated onto Vector (docker_logs + OTLP traces -> prod OTLP).
# otelcol-contrib is no longer used at the edge; the role removes its container.
otelcol_contrib_cleanup: true
2 changes: 1 addition & 1 deletion ansible/inventories/devnet-0/group_vars/all/images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ default_tooling_images:
nginx_proxy_acme: nginxproxy/acme-companion
nginx_proxy_cert_loader: ethpandaops/debian-docker:latest
nginx_proxy_cert_linker: nginxproxy/docker-gen
vector: timberio/vector:0.55.0-alpine
vector: timberio/vector:0.56.0-alpine
spamoor: ethpandaops/spamoor:master-latest
blobber: ethpandaops/blobber:latest
syncoor_web: docker.ethquokkaops.io/gh/ethpandaops/syncoor-web:master
Expand Down
4 changes: 2 additions & 2 deletions ansible/inventories/devnet-0/group_vars/bootnode.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ lighthouse_container_command_extra_args:
- --testnet-dir=/network-config
- --enable-partial-columns
# OTLP traces → local Vector sidecar (gRPC).
- --telemetry-collector-url=http://otelcol:4317
- --telemetry-collector-url=http://vector:4317
- --telemetry-service-name=lighthouse-bn-{{ ethereum_network_name }}-{{ inventory_hostname }}
- >-
--boot-nodes={{
Expand Down Expand Up @@ -104,7 +104,7 @@ geth_container_command_extra_args:
- --state.scheme=hash
# OTLP traces → local Vector sidecar (handles upstream auth + endpoint).
- --rpc.telemetry
- --rpc.telemetry.endpoint=http://otelcol:4318/v1/traces
- --rpc.telemetry.endpoint=http://vector:4318/v1/traces
- --rpc.telemetry.instance-id={{ ethereum_network_name }}-{{ inventory_hostname }}
- >-
--bootnodes={{
Expand Down
2 changes: 1 addition & 1 deletion ansible/inventories/devnet-0/group_vars/geth.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ geth_container_command_extra_args:
- --syncmode=full
- --bootnodes={{ ethereum_el_bootnodes | join(',') }}
- --rpc.telemetry
- --rpc.telemetry.endpoint=http://otelcol:4318/v1/traces
- --rpc.telemetry.endpoint=http://vector:4318/v1/traces
- --rpc.telemetry.sample-ratio=1 # Required until geth ships the fix for the SampleRatio default https://github.com/ethereum/go-ethereum/pull/34948
- --rpc.telemetry.instance-id={{ ethereum_network_name }}-{{ inventory_hostname }}
# Static tags attached to geth's RPC telemetry. default(false) must run before
# bool: with the filters the other way round an undefined supernode variable
# reaches bool first, so the fallback never applies.
- --rpc.telemetry.tags=execution_client={{ ethereum_node_el }},consensus_client={{ ethereum_node_cl }},supernode={{ ethereum_node_cl_supernode_enabled | default(false) | bool }},network={{ ethereum_network_name }},instance-id={{ ethereum_network_name }}-{{ inventory_hostname }}
Expand Down
2 changes: 1 addition & 1 deletion ansible/inventories/devnet-0/group_vars/lighthouse.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ lighthouse_container_command_extra_args:
- --boot-nodes={{ ethereum_cl_bootnodes | join(',') }}
- --debug-level=debug
- --enable-partial-columns
- --telemetry-collector-url=http://otelcol:4317
- --telemetry-collector-url=http://vector:4317
- --telemetry-service-name={{ ethereum_network_name }}-{{ inventory_hostname }}
lighthouse_validator_container_volumes:
- "{{ lighthouse_validator_datadir }}:/validator-data"
Expand Down