ethpandaops · samcm · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/ansible/inventories/devnet-0/group_vars/all/all.yaml b/ansible/inventories/devnet-0/group_vars/all/all.yaml
@@ -297,132 +297,183 @@ docker_nginx_proxy_wildcard_cert: "{{ network_server_subdomain }}"
 docker_nginx_proxy_wildcard_cert_url: "http://cert.{{ network_server_subdomain }}/{{ network_server_subdomain }}-latest.tar.enc"
 docker_nginx_proxy_wildcard_cert_psk: "{{ secret_cert_encryption_psk }}"
 
-# role: otelcol_contrib (local role)
-# Reuses secret_loki credentials (same vmauth backend serves both ingresses).
+# OTLP egress to the prod analytics gateway (shared by Vector's logs + traces sinks).
+# Ingress identity (auth username + ingress_user tag) is the current devnet name
+# rather than secret_loki.username: the sops username drifts stale between devnet
+# iterations, while the gateway only validates the (constant) password — so using
+# the network name keeps log/trace attribution correct no matter what sops holds.
 otlp_endpoint: "https://otlp.analytics.production.platform.ethpandaops.io"
 otlp_deployment_env: production
 
-otelcol_contrib_container_networks: "{{ docker_networks_shared }}"
-
-# Vector kept alongside otelcol just to ship logs to Loki. Will be removed
-# when Loki path is replaced (e.g. central aggregator or Loki OTLP support).
+# Shared docker network so the clients can reach Vector's OTLP listeners and
+# Vector can read the other containers' logs.
 vector_container_networks: "{{ docker_networks_shared }}"
 vector_config: |
+  # Docker container logs (clean per-container metadata straight from the Docker API)
   [sources.in]
     type = "docker_logs"
     exclude_containers = [
       "{{ vector_container_name }}",
-      "otelcol",
       "ethereum-metrics-exporter",
       "nginx-proxy",
       "node_exporter",
       "prometheus",
       "snooper-",
     ]
 
-  [sinks.loki]
-    type = "loki"
-    inputs = ["in"]
-    out_of_order_action = "accept"
-    labels.forwarder = "vector"
-    labels.instance = "{{ inventory_hostname }}"
-    labels.network = "{{ ethereum_network_name }}"
-    labels.testnet = "{{ ethereum_network_name }}"
-    labels.ingress_user = "{{ secret_loki.username }}"
-    labels.container_name = "{{ '{{ container_name }}' }}"
-  {%- if ethereum_node_el is defined +%}
-    labels.ethereum_el = "{{ ethereum_node_el }}"
-  {%- endif +%}
-  {%- if ethereum_node_cl is defined +%}
-    labels.ethereum_cl = "{{ ethereum_node_cl }}"
-  {%- endif +%}
-    encoding.codec = "json"
-    endpoint = "{{ secret_loki.endpoint }}"
-    auth.strategy = "basic"
-    auth.user = "{{ secret_loki.username }}"
-    auth.password = "{{ secret_loki.password }}"
-otelcol_contrib_config: |
-  extensions:
-    basicauth/client:
-      client_auth:
-        username: {{ secret_loki.username }}
-        password: {{ secret_loki.password }}
-
-  receivers:
-    filelog:
-      include: [/var/lib/docker/containers/*/*-json.log]
-      include_file_path: true
-      start_at: end
-      operators:
-        - type: container
-          format: docker
-          add_metadata_from_filepath: true
-        - type: filter
-          expr: '(attributes["container.name"] != nil and attributes["container.name"] matches "^(otelcol|ethereum-metrics-exporter|nginx-proxy|node_exporter|prometheus|snooper-.*)$") or body matches "github\\.com/open-telemetry/opentelemetry-collector-contrib|otelcol-contrib"'
-        - type: json_parser
-          if: 'body matches "^\\s*\\{"'
-          on_error: send
-          severity:
-            parse_from: attributes.level
-            overwrite_text: true
-            mapping:
-              fatal4: [emergency, emerg]
-              fatal3: [alert]
-              fatal2: [critical, crit]
-              fatal:  [panic]
-
-    otlp:
-      protocols:
-        grpc: {endpoint: "[::]:4317"}
-        http: {endpoint: "[::]:4318"}
-
-  processors:
-    resource:
-      attributes:
-        - {key: deployment.environment, value: "{{ otlp_deployment_env }}", action: upsert}
-        - {key: forwarder,              value: "otelcol", action: upsert}
-        - {key: network,                value: "{{ ethereum_network_name }}", action: upsert}
-        - {key: testnet,                value: "{{ ethereum_network_name }}", action: upsert}
-        - {key: instance,               value: "{{ inventory_hostname }}", action: upsert}
-        - {key: ingress_user,           value: "{{ secret_loki.username }}", action: upsert}
-        - {key: host.name,              value: "{{ inventory_hostname }}", action: upsert}
-      {%- if ethereum_node_cl is defined +%}
-        - {key: ethereum_cl,            value: "{{ ethereum_node_cl }}", action: upsert}
-      {%- endif +%}
-      {%- if ethereum_node_el is defined +%}
-        - {key: ethereum_el,            value: "{{ ethereum_node_el }}", action: upsert}
-      {%- endif +%}
+  # OTLP traces pushed by the clients (beacon -> :4317 grpc, geth -> :4318 http).
+  # use_otlp_decoding preserves the resourceSpans envelope so the sink forwards
+  # natively with the client's batching intact (1 request in = 1 request out).
+  [sources.otlp_in]
+    type = "opentelemetry"
+    use_otlp_decoding = true
 
-    transform/service_name:
-      log_statements:
-        - context: resource
-          statements:
-            - set(attributes["service.name"], attributes["container.name"]) where attributes["container.name"] != nil
+  [sources.otlp_in.grpc]
+    address = "[::]:4317"
 
-    batch:
-      send_batch_size: 500
-      timeout: 5s
-
-  exporters:
-    otlphttp/staging:
-      endpoint: "{{ otlp_endpoint }}"
-      auth:
-        authenticator: basicauth/client
-
-    # Dual-write traces to existing Tempo via gRPC. Tempo's URL embeds a
-    # tenant UUID — no separate auth needed.
-    otlp/tempo:
-      endpoint: "{{ tempo_grpcs_url | regex_replace('^grpcs?://', '') }}"
-
-  service:
-    extensions: [basicauth/client]
-    pipelines:
-      logs:
-        receivers: [filelog, otlp]
-        processors: [resource, transform/service_name, batch]
-        exporters: [otlphttp/staging]
-      traces:
-        receivers: [otlp]
-        processors: [resource, batch]
-        exporters: [otlphttp/staging, otlp/tempo]
+  [sources.otlp_in.http]
+    address = "[::]:4318"
 
+  # Shape docker_logs events into an OTLP resourceLogs envelope with full metadata.
+  [transforms.otel_shape]
+    type = "remap"
+    inputs = ["in"]
+    source = '''
+      ts_ns = to_string(to_unix_timestamp(now(), unit: "nanoseconds"))  # default: current time, overridden below when the event has a real timestamp
+      if is_timestamp(.timestamp) {
+        ts_ns = to_string(to_unix_timestamp!(.timestamp, unit: "nanoseconds"))
+      }
+      msg  = string(.message)        ?? ""  # each field falls back to its default when missing or not a string
+      ctr  = string(.container_name) ?? "unknown"
+      img  = string(.image)          ?? ""
+      strm = string(.stream)         ?? "stdout"
+
+      # Extract the source log level: JSON-structured lines (.level / .severity)
+      # first, then logfmt (level=xxx), then a level token near the start of a
+      # plain-text line. Client formats vary (lighthouse truncates to 4 chars:
+      # DEBG/ERRO/CRIT), so the text matcher allows a union of forms. SeverityText
+      # keeps the source's exact text; only SeverityNumber is normalised to the
+      # OTel scale (https://opentelemetry.io/docs/specs/otel/logs/data-model/#field-severitynumber).
+      # Unrecognised lines are left unset rather than guessed.
+      sevtext = ""
+      if starts_with(msg, "{") {
+        j = parse_json(msg) ?? {}
+        if is_string(j.level) {
+          sevtext = string!(j.level)
+        } else if is_string(j.severity) {
+          sevtext = string!(j.severity)
+        }
+      }
+      if sevtext == "" {
+        lm = parse_regex(msg, r'(?i)\b(?:level|lvl|severity)="?(?P<l>[a-z]+)') ?? {}
+        if is_string(lm.l) {
+          sevtext = string!(lm.l)
+        } else {
+          hm = parse_regex(truncate(msg, 48), r'(?i)\b(?P<l>TRACE|DEBUG|DEBG|DBUG|NOTICE|INFO|WARNING|WARN|ERROR|ERRO|CRITICAL|CRIT|FATAL|PANIC)\b') ?? {}
+          if is_string(hm.l) {
+            sevtext = string!(hm.l)
+          }
+        }
+      }
+      lvl = upcase(sevtext)
+      sevnum = 0  # 0 = unspecified (no level recognised)
+      if lvl == "TRACE" {
+        sevnum = 1  # TRACE
+      } else if lvl == "DEBUG" || lvl == "DEBG" || lvl == "DBUG" {
+        sevnum = 5  # DEBUG
+      } else if lvl == "INFO" || lvl == "INFOR" || lvl == "NOTICE" {  # NOTE(review): "INFOR" is not in the plain-text matcher above, so it can only arrive via JSON/logfmt - confirm which client emits it
+        sevnum = 9  # INFO
+      } else if lvl == "WARN" || lvl == "WARNING" {
+        sevnum = 13  # WARN
+      } else if lvl == "ERROR" || lvl == "ERRO" {
+        sevnum = 17  # ERROR
+      } else if lvl == "CRIT" || lvl == "CRITICAL" {
+        sevnum = 18  # ERROR2 (above ERROR, below FATAL)
+      } else if lvl == "FATAL" || lvl == "PANIC" {
+        sevnum = 21  # FATAL
+      }
+
+      attrs = [
+        {"key": "service.name",           "value": {"stringValue": ctr}},
+        {"key": "container.name",         "value": {"stringValue": ctr}},
+        {"key": "container.image.name",   "value": {"stringValue": img}},
+        {"key": "deployment.environment", "value": {"stringValue": "{{ otlp_deployment_env }}"}},
+        {"key": "forwarder",              "value": {"stringValue": "vector"}},
+        {"key": "ingress_user",           "value": {"stringValue": "{{ ethereum_network_name }}"}},
+        {"key": "network",                "value": {"stringValue": "{{ ethereum_network_name }}"}},
+        {"key": "testnet",                "value": {"stringValue": "{{ ethereum_network_name }}"}},
+        {"key": "instance",               "value": {"stringValue": "{{ inventory_hostname }}"}},
+        {"key": "host.name",              "value": {"stringValue": "{{ inventory_hostname }}"}}
+      ]
+  {%- if ethereum_node_cl is defined +%}
+      attrs = push(attrs, {"key": "ethereum_cl", "value": {"stringValue": "{{ ethereum_node_cl }}"}})
+  {%- endif +%}
+  {%- if ethereum_node_el is defined +%}
+      attrs = push(attrs, {"key": "ethereum_el", "value": {"stringValue": "{{ ethereum_node_el }}"}})
+  {%- endif +%}
+      . = {  # replaces the whole event, so only .resource_log is passed downstream
+        "resource_log": {
+          "resource": {"attributes": attrs},
+          "scopeLogs": [{
+            "scope": {"name": "blob-devnets-vector"},
+            "logRecords": [{
+              "timeUnixNano":   ts_ns,
+              "severityNumber": sevnum,
+              "severityText":   sevtext,
+              "body": {"stringValue": msg},
+              "attributes": [{"key": "stream", "value": {"stringValue": strm}}]
+            }]
+          }]
+        }
+      }
+    '''
+
+  # Batch many shaped log events into one OTLP envelope (reduce works on logs); no group_by is set, so all events share a single group.
+  [transforms.batch_envelope]
+    type = "reduce"
+    inputs = ["otel_shape"]
+    expire_after_ms = 30000  # 30 s; NOTE(review): presumably only relevant once the stream goes idle - confirm against the reduce docs
+    end_every_period_ms = 5000  # flush the accumulated batch every 5 s
+    max_events = 500  # cap of 500 merged events per batch
+    merge_strategies.resource_log = "array"  # collect every event's .resource_log object into one array
+
+  [transforms.finalize_envelope]  # Re-key the batched .resource_log array as the top-level OTLP "resourceLogs" field.
+    type = "remap"
+    inputs = ["batch_envelope"]
+    source = '''
+      . = {"resourceLogs": .resource_log}
+    '''
+
+  [sinks.otlp_logs]  # Ship the batched docker-log envelopes to the gateway over OTLP/HTTP.
+    type = "opentelemetry"
+    inputs = ["finalize_envelope"]
+    [sinks.otlp_logs.protocol]
+      type = "http"
+      uri = "{{ otlp_endpoint }}/v1/logs"
+      method = "post"
+      encoding.codec = "otlp"
+      auth.strategy = "basic"
+      auth.user = "{{ ethereum_network_name }}"  # same value as the ingress_user resource attribute set in otel_shape
+      auth.password = "{{ secret_loki.password }}"
+      # Each event reaching this sink is already a complete OTLP envelope (assembled by batch_envelope + finalize_envelope).
+      # max_events MUST be 1 — OTLP/HTTP allows one envelope per request.
+      batch.max_events = 1
+      batch.timeout_secs = 5
+
+  [sinks.otlp_traces]  # Forward client traces received on otlp_in to the gateway over OTLP/HTTP.
+    type = "opentelemetry"
+    inputs = ["otlp_in.traces"]  # NOTE(review): no transform in between, so the resource attributes otel_shape adds to logs (network, instance, ingress_user, ...) are not added to traces - confirm that is intended
+    [sinks.otlp_traces.protocol]
+      type = "http"
+      uri = "{{ otlp_endpoint }}/v1/traces"
+      method = "post"
+      encoding.codec = "otlp"
+      auth.strategy = "basic"
+      auth.user = "{{ ethereum_network_name }}"  # devnet name is used as the ingress identity
+      auth.password = "{{ secret_loki.password }}"
+      # use_otlp_decoding => one event already carries the client's full span batch.
+      batch.max_events = 1
+      batch.timeout_secs = 5
+# Edge telemetry consolidated onto Vector (docker_logs + OTLP traces -> prod OTLP).
+# otelcol-contrib is no longer used at the edge; the role removes its container.
+otelcol_contrib_cleanup: true
diff --git a/ansible/inventories/devnet-0/group_vars/all/images.yaml b/ansible/inventories/devnet-0/group_vars/all/images.yaml
@@ -56,7 +56,7 @@ default_tooling_images:
   nginx_proxy_acme: nginxproxy/acme-companion
   nginx_proxy_cert_loader: ethpandaops/debian-docker:latest
   nginx_proxy_cert_linker: nginxproxy/docker-gen
-  vector: timberio/vector:0.55.0-alpine
+  vector: timberio/vector:0.56.0-alpine
   spamoor: ethpandaops/spamoor:master-latest
   blobber: ethpandaops/blobber:latest
   syncoor_web: docker.ethquokkaops.io/gh/ethpandaops/syncoor-web:master

diff --git a/ansible/inventories/devnet-0/group_vars/bootnode.yaml b/ansible/inventories/devnet-0/group_vars/bootnode.yaml
@@ -61,7 +61,7 @@ lighthouse_container_command_extra_args:
   - --testnet-dir=/network-config
   - --enable-partial-columns
-  # OTLP traces → local otelcol sidecar (gRPC).
+  # OTLP traces → local Vector sidecar (gRPC).
-  - --telemetry-collector-url=http://otelcol:4317
+  - --telemetry-collector-url=http://vector:4317
   - --telemetry-service-name=lighthouse-bn-{{ ethereum_network_name }}-{{ inventory_hostname }}
   - >-
     --boot-nodes={{
@@ -104,7 +104,7 @@ geth_container_command_extra_args:
   - --state.scheme=hash
-  # OTLP traces → local otelcol sidecar (handles upstream auth + endpoint).
+  # OTLP traces → local Vector sidecar (handles upstream auth + endpoint).
   - --rpc.telemetry
-  - --rpc.telemetry.endpoint=http://otelcol:4318/v1/traces
+  - --rpc.telemetry.endpoint=http://vector:4318/v1/traces
   - --rpc.telemetry.instance-id={{ ethereum_network_name }}-{{ inventory_hostname }}
   - >-
     --bootnodes={{

diff --git a/ansible/inventories/devnet-0/group_vars/geth.yaml b/ansible/inventories/devnet-0/group_vars/geth.yaml
@@ -28,7 +28,7 @@ geth_container_command_extra_args:
   - --syncmode=full
   - --bootnodes={{ ethereum_el_bootnodes | join(',') }}
   - --rpc.telemetry
-  - --rpc.telemetry.endpoint=http://otelcol:4318/v1/traces
+  - --rpc.telemetry.endpoint=http://vector:4318/v1/traces
   - --rpc.telemetry.sample-ratio=1 # Required until geth ships the fix for the SampleRatio default https://github.com/ethereum/go-ethereum/pull/34948
   - --rpc.telemetry.instance-id={{ ethereum_network_name }}-{{ inventory_hostname }}
   - --rpc.telemetry.tags=execution_client={{ ethereum_node_el }},consensus_client={{ ethereum_node_cl }},supernode={{ ethereum_node_cl_supernode_enabled | bool | default(false) }},network={{ ethereum_network_name }},instance-id={{ ethereum_network_name }}-{{ inventory_hostname }}

diff --git a/ansible/inventories/devnet-0/group_vars/lighthouse.yaml b/ansible/inventories/devnet-0/group_vars/lighthouse.yaml
@@ -46,7 +46,7 @@ lighthouse_container_command_extra_args:
   - --boot-nodes={{ ethereum_cl_bootnodes | join(',') }}
   - --debug-level=debug
   - --enable-partial-columns
-  - --telemetry-collector-url=http://otelcol:4317
+  - --telemetry-collector-url=http://vector:4317
   - --telemetry-service-name={{ ethereum_network_name }}-{{ inventory_hostname }}
 lighthouse_validator_container_volumes:
   - "{{ lighthouse_validator_datadir }}:/validator-data"