From 7f2405985203c1b12cb46783899844daedbe73e4 Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Fri, 8 May 2026 12:53:08 +0200 Subject: [PATCH 1/7] Fix n8n metric mappings and add full v2 metric coverage - Drop fabricated metric names that n8n never emitted; map only what is empirically present. - Add the n8n 2.x metric families: workflow.execution.duration histogram, audit.workflow.*, embed.login.*, token.exchange.*, process.pss.bytes, runner.task.requested, and the workflow_statistics gauges. - Add worker-only families (node.started, node.finished, queue.job.dequeued, runner.task.requested) by introducing a worker-scrape instance. - Stop gating the OpenMetrics scrape on /healthz/readiness; emit n8n.readiness.check unconditionally so metrics still flow when the readiness endpoint is unhealthy. - Replace the custom Dockerfile with a direct n8nio/n8n image reference and parameterise the version via hatch.toml so the test matrix can run against both 1.118.1 and 2.19.5. - Allocate free host ports via datadog_checks.dev.utils.find_free_ports and forward them through docker_run env_vars to avoid port collisions on re-runs. 
--- n8n/README.md | 56 +- n8n/assets/configuration/spec.yaml | 2 +- n8n/datadog_checks/n8n/check.py | 66 ++- n8n/datadog_checks/n8n/data/conf.yaml.example | 2 +- n8n/datadog_checks/n8n/metrics.py | 80 +-- n8n/hatch.toml | 7 +- n8n/metadata.csv | 98 ++-- n8n/tests/common.py | 194 ++++--- n8n/tests/conftest.py | 109 +++- n8n/tests/docker/Dockerfile | 14 - n8n/tests/docker/README.md | 3 +- n8n/tests/docker/docker-compose.yaml | 75 ++- n8n/tests/docker/sample_workflow.json | 59 +++ n8n/tests/docker/sample_workflow_failing.json | 50 ++ n8n/tests/fixtures/n8n.txt | 498 +++++++++--------- n8n/tests/fixtures/n8n_custom.txt | 497 +++++++++-------- n8n/tests/test_e2e.py | 28 +- n8n/tests/test_integration.py | 58 ++ n8n/tests/test_unit.py | 128 +++-- 19 files changed, 1207 insertions(+), 817 deletions(-) delete mode 100644 n8n/tests/docker/Dockerfile create mode 100644 n8n/tests/docker/sample_workflow.json create mode 100644 n8n/tests/docker/sample_workflow_failing.json create mode 100644 n8n/tests/test_integration.py diff --git a/n8n/README.md b/n8n/README.md index 18eb081e7890a..76339740046a4 100644 --- a/n8n/README.md +++ b/n8n/README.md @@ -5,12 +5,12 @@ This check monitors [n8n][1] through the Datadog Agent. Collect n8n metrics including: -- Cache metrics: Hit and miss statistics. -- Message event bus metrics: Event-related metrics. -- Workflow metrics: Can include workflow ID labels. -- Node metrics: Can include node type labels. -- Credential metrics: Can include credential type labels. -- Queue metrics +- Cache metrics: hit, miss, and update counts. +- Workflow metrics: started, success, failed counters; in n8n 2.x, an execution-duration histogram. +- Node metrics: per-node started and finished counters emitted by worker processes in queue mode. +- Queue metrics: queue depth, enqueued/dequeued/completed/failed counters, and scaling-mode worker gauges. +- HTTP metrics: request duration histograms tagged with status code. +- Process and Node.js runtime metrics. 
## Setup @@ -40,6 +40,10 @@ N8N_METRICS_INCLUDE_CACHE_METRICS=true N8N_METRICS_INCLUDE_MESSAGE_EVENT_BUS_METRICS=true N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL=true N8N_METRICS_INCLUDE_API_ENDPOINTS=true +N8N_METRICS_INCLUDE_QUEUE_METRICS=true + +# Optional: n8n 2.x adds workflow_statistics gauges (workflows, users, executions, ...) — opt in +N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS=true # Optional: Customize the metric prefix (default is 'n8n_') N8N_METRICS_PREFIX=n8n_ @@ -47,6 +51,46 @@ N8N_METRICS_PREFIX=n8n_ For more details, see the n8n documentation on [enabling Prometheus metrics][10]. +Set `openmetrics_endpoint` in `conf.yaml` to the full `/metrics` URL of your n8n process, for example `http://localhost:5678/metrics`. + +#### Event-driven counters + +Some n8n counters are registered dynamically the first time the corresponding event fires. For example, `n8n.workflow.started.count`, `n8n.workflow.success.count`, `n8n.workflow.failed.count`, and the queue and node event counters do not appear until at least one workflow has been executed. This is expected behavior and is not a sign of a misconfigured integration. + +#### Queue mode and workers + +In queue mode, n8n runs separate worker processes that execute jobs picked up from a Redis-backed queue. Each worker exposes its own `/metrics` endpoint and emits a different subset of metrics than the main process. Worker-observed metrics include `n8n.queue.job.dequeued.count`, `n8n.node.started.count`, `n8n.node.finished.count`, and (n8n 2.x) `n8n.runner.task.requested.count`. Main-only metrics include `n8n.instance.role.leader` and the `n8n.scaling.mode.queue.jobs.*` family. + +To expose worker metrics, set `QUEUE_HEALTH_CHECK_ACTIVE=true` and set `QUEUE_HEALTH_CHECK_PORT` to the port the worker should serve `/metrics` on, on each worker. 
**In n8n 2.x, port `5679` is reserved for the task runner broker, so pick a different port (for example `5680`).** + +For full coverage in queue deployments, configure one Datadog instance per n8n process exposing `/metrics`, including main and worker processes: + +```yaml +instances: + - openmetrics_endpoint: http://n8n-main:5678/metrics + - openmetrics_endpoint: http://n8n-worker:5680/metrics +``` + +#### Version-specific metrics + +Several metric families were introduced in n8n 2.x and are not emitted on n8n 1.x: + +- `n8n.workflow.execution.duration.seconds.*` (histogram) +- `n8n.audit.workflow.activated.count`, `n8n.audit.workflow.executed.count` +- `n8n.embed.login.requests.count` (tagged with `result:success`/`failure`), `n8n.embed.login.failures.count` (tagged with `reason`) +- `n8n.token.exchange.requests.count` (tagged with `result:success`/`failure`), `n8n.token.exchange.failures.count` (tagged with `reason`), `n8n.token.exchange.identity.linked.count`, `n8n.token.exchange.jit.provisioning.count` +- `n8n.process.pss.bytes` (Linux only) +- `n8n.runner.task.requested.count` (worker-only) +- The `n8n.{production,manual,production.root}.executions`, `n8n.users.total`, `n8n.enabled.users`, `n8n.workflows.total`, and `n8n.credentials.total` family — only emitted when `N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS=true` is set. + +The failures-only counters (`*.failures.count`) and the libuv `n8n.nodejs.active.requests` gauge only emit samples once the corresponding event fires (an auth failure, an in-flight libuv request); a healthy idle deployment may not produce any data points for them. + +The `metadata.csv` description for each affected metric calls out its version requirement. + +#### Tag cardinality + +When `N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL=true`, http and workflow execution histograms are tagged with `workflow_id` (and similar labels for nodes). On deployments with many distinct workflows or nodes, this can produce high-cardinality metrics. 
Drop the label via `exclude_labels` or omit `N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL` to keep tag cardinality bounded. + #### Configure the Datadog Agent 1. Edit the `n8n.d/conf.yaml` file, in the `conf.d/` folder at the root of your Agent's configuration directory to start collecting your n8n performance data. See the [sample n8n.d/conf.yaml][4] for all available configuration options. diff --git a/n8n/assets/configuration/spec.yaml b/n8n/assets/configuration/spec.yaml index f828a10ec05c0..cea34bff83932 100644 --- a/n8n/assets/configuration/spec.yaml +++ b/n8n/assets/configuration/spec.yaml @@ -12,7 +12,7 @@ files: openmetrics_endpoint.required: true openmetrics_endpoint.hidden: false openmetrics_endpoint.display_priority: 1 - openmetrics_endpoint.value.example: http://localhost:5678 + openmetrics_endpoint.value.example: http://localhost:5678/metrics openmetrics_endpoint.description: | Endpoint exposing the n8n's metrics in the OpenMetrics format. For more information, refer to: https://docs.n8n.io/hosting/logging-monitoring/monitoring/ diff --git a/n8n/datadog_checks/n8n/check.py b/n8n/datadog_checks/n8n/check.py index 00c41569b83d5..012a70e20bc7a 100644 --- a/n8n/datadog_checks/n8n/check.py +++ b/n8n/datadog_checks/n8n/check.py @@ -2,58 +2,52 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse + +from requests.exceptions import RequestException from datadog_checks.base import OpenMetricsBaseCheckV2 from datadog_checks.n8n.metrics import METRIC_MAP, RENAME_LABELS_MAP from .config_models import ConfigMixin -DEFAULT_READY_ENDPOINT = '/healthz/readiness' +DEFAULT_READY_PATH = '/healthz/readiness' class N8nCheck(OpenMetricsBaseCheckV2, ConfigMixin): __NAMESPACE__ = 'n8n' DEFAULT_METRIC_LIMIT = 0 - def __init__(self, name, init_config, instances=None): - super(N8nCheck, self).__init__( - name, - init_config, - instances, - ) - self.openmetrics_endpoint = 
self.instance["openmetrics_endpoint"] - self.tags = self.instance.get('tags', []) - self._ready_endpoint = DEFAULT_READY_ENDPOINT - - def get_default_config(self): + def get_default_config(self) -> dict: return { 'metrics': [METRIC_MAP], 'rename_labels': RENAME_LABELS_MAP, 'raw_metric_prefix': 'n8n_', } - def _check_n8n_readiness(self): - endpoint = urljoin(self.openmetrics_endpoint, self._ready_endpoint) - response = self.http.get(endpoint) - - # Determine metric value and status_code tag - if response.status_code is None: - self.log.warning("The readiness endpoint did not return a status code") - metric_value = 0 - metric_tags = self.tags + ['status_code:null'] - elif response.status_code == 200: - # Ready - submit 1 - metric_value = 1 - metric_tags = self.tags + [f'status_code:{response.status_code}'] - else: - # Not ready - submit 0 - metric_value = 0 - metric_tags = self.tags + [f'status_code:{response.status_code}'] - - # Submit metric with appropriate value and status_code tag - self.gauge('readiness.check', metric_value, tags=metric_tags) - - def check(self, instance): - super().check(instance) + def _readiness_endpoint(self) -> str: + parsed = urlparse(self.config.openmetrics_endpoint) + base = f'{parsed.scheme}://{parsed.netloc}' + return urljoin(base, DEFAULT_READY_PATH) + + def _check_n8n_readiness(self) -> None: + endpoint = self._readiness_endpoint() + tags = list(self.config.tags or ()) + + try: + response = self.http.get(endpoint) + except RequestException as e: + self.log.warning("Could not reach n8n readiness endpoint %s: %s", endpoint, e) + self.gauge('readiness.check', 0, tags=tags + ['status_code:none']) + return + + is_ready = response.status_code == 200 + self.gauge( + 'readiness.check', + 1 if is_ready else 0, + tags=tags + [f'status_code:{response.status_code}'], + ) + + def check(self, instance: dict) -> None: self._check_n8n_readiness() + super().check(instance) diff --git a/n8n/datadog_checks/n8n/data/conf.yaml.example 
b/n8n/datadog_checks/n8n/data/conf.yaml.example index e80f23c8c08c1..5f96c4acb66fe 100644 --- a/n8n/datadog_checks/n8n/data/conf.yaml.example +++ b/n8n/datadog_checks/n8n/data/conf.yaml.example @@ -18,7 +18,7 @@ instances: ## https://docs.n8n.io/hosting/logging-monitoring/monitoring/ ## https://docs.n8n.io/hosting/configuration/environment-variables/endpoints/ # - - openmetrics_endpoint: http://localhost:5678 + - openmetrics_endpoint: http://localhost:5678/metrics ## @param raw_metric_prefix - string - optional - default: n8n_ ## The prefix prepended to all metrics from n8n. diff --git a/n8n/datadog_checks/n8n/metrics.py b/n8n/datadog_checks/n8n/metrics.py index 5e29ba629340c..7a3be922b6f09 100644 --- a/n8n/datadog_checks/n8n/metrics.py +++ b/n8n/datadog_checks/n8n/metrics.py @@ -2,36 +2,49 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) -# Metrics mapping without prefix - use raw_metric_prefix config to strip prefixes like 'n8n_', 'n8n_my_team_', etc. -# Namespace will be applied by the check -# Note: OpenMetrics automatically appends .count to counter metrics, so don't add it here +# Metrics emitted by n8n's /metrics endpoint, verified live against n8n@1.118.1 +# and n8n@2.19.5 with the test environment in `tests/docker/`. +# +# The OpenMetrics base check strips `_total` from counter names before lookup +# and appends `.count` on submission, so counter keys here are written without +# the `_total` suffix (e.g. `cache_hits_total` -> key `cache_hits`). +# +# Many counters are dynamically registered from EventBus events (event +# `n8n.x.y.z` becomes counter `x_y_z_total`, e.g. `n8n.queue.job.enqueued` -> `queue_job_enqueued_total`) and only appear once +# the corresponding event fires at runtime. In queue mode, worker processes +# emit `node_started_total`, `node_finished_total`, `queue_job_dequeued_total`, +# and (n8n 2.x+) `runner_task_requested_total`. +# +# Several families were introduced in n8n 2.x (see the README "Version-specific +# metrics" section). 
The `workflow_statistics_*` and SSO/embed token-exchange +# families require additional flags (`N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS`, +# token-exchange counters always register but only emit on auth events). METRIC_MAP = { 'active_workflow_count': 'active.workflow.count', - 'api_request_duration_seconds': 'api.request.duration.seconds', - 'api_requests': 'api.requests', - 'cache_errors': 'cache.errors', + 'audit_workflow_activated': 'audit.workflow.activated', # n8n 2.x+ + 'audit_workflow_executed': 'audit.workflow.executed', # n8n 2.x+ 'cache_hits': 'cache.hits', - 'cache_latency_seconds': 'cache.latency.seconds', 'cache_misses': 'cache.misses', - 'cache_operations': 'cache.operations', - 'eventbus_connections_total': 'eventbus.connections.total', - 'eventbus_events_failed': 'eventbus.events.failed', - 'eventbus_events_processed': 'eventbus.events.processed', - 'eventbus_events': 'eventbus.events', - 'eventbus_queue_size': 'eventbus.queue.size', + 'cache_updates': 'cache.updates', + 'credentials': 'credentials.total', # n8n 2.x+, requires N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS + 'embed_login_failures': 'embed.login.failures', # n8n 2.x+ + 'embed_login_requests': 'embed.login.requests', # n8n 2.x+ + 'enabled_users': 'enabled.users', # n8n 2.x+, requires N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS 'http_request_duration_seconds': 'http.request.duration.seconds', 'instance_role_leader': 'instance.role.leader', 'last_activity': { 'name': 'last.activity', 'type': 'time_elapsed', }, + 'manual_executions': 'manual.executions', # n8n 2.x+, requires N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS + 'node_finished': 'node.finished', + 'node_started': 'node.started', 'nodejs_active_handles': 'nodejs.active.handles', 'nodejs_active_handles_total': 'nodejs.active.handles.total', 'nodejs_active_requests': 'nodejs.active.requests', 'nodejs_active_requests_total': 'nodejs.active.requests.total', 'nodejs_active_resources': 'nodejs.active.resources', 'nodejs_active_resources_total': 
'nodejs.active.resources.total', - 'nodejs_event_loop_lag_seconds': 'nodejs.event.loop.lag.seconds', 'nodejs_eventloop_lag_max_seconds': 'nodejs.eventloop.lag.max.seconds', 'nodejs_eventloop_lag_mean_seconds': 'nodejs.eventloop.lag.mean.seconds', 'nodejs_eventloop_lag_min_seconds': 'nodejs.eventloop.lag.min.seconds', @@ -47,47 +60,44 @@ 'nodejs_heap_space_size_available_bytes': 'nodejs.heap.space.size.available.bytes', 'nodejs_heap_space_size_total_bytes': 'nodejs.heap.space.size.total.bytes', 'nodejs_heap_space_size_used_bytes': 'nodejs.heap.space.size.used.bytes', - 'nodejs_heap_total_bytes': 'nodejs.heap.total.bytes', - 'nodejs_heap_used_bytes': 'nodejs.heap.used.bytes', + 'nodejs_version_info': {'type': 'metadata', 'label': 'version', 'name': 'nodejs.version'}, + 'process_cpu_seconds': 'process.cpu.seconds', 'process_cpu_system_seconds': 'process.cpu.system.seconds', 'process_cpu_user_seconds': 'process.cpu.user.seconds', 'process_heap_bytes': 'process.heap.bytes', 'process_max_fds': 'process.max.fds', 'process_open_fds': 'process.open.fds', + 'process_pss_bytes': 'process.pss.bytes', # n8n 2.x+ 'process_resident_memory_bytes': 'process.resident.memory.bytes', 'process_start_time_seconds': { 'name': 'process.uptime.seconds', 'type': 'time_elapsed', }, 'process_virtual_memory_bytes': 'process.virtual.memory.bytes', - 'queue_job_active_total': 'queue.job.active.total', - 'queue_job_attempts': 'queue.job.attempts', + 'production_executions': 'production.executions', # n8n 2.x+, requires N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS + 'production_root_executions': 'production.root.executions', # n8n 2.x+, requires flag 'queue_job_completed': 'queue.job.completed', - 'queue_job_delayed_total': 'queue.job.delayed.total', 'queue_job_dequeued': 'queue.job.dequeued', 'queue_job_enqueued': 'queue.job.enqueued', 'queue_job_failed': 'queue.job.failed', - 'queue_job_waiting_duration_seconds': 'queue.job.waiting.duration.seconds', - 'queue_job_waiting_total': 
'queue.job.waiting.total', - 'queue_jobs_duration_seconds': 'queue.jobs.duration.seconds', - 'queue_jobs': 'queue.jobs', - 'workflow_executions_active': 'workflow.executions.active', - 'workflow_executions_duration_seconds': 'workflow.executions.duration.seconds', - 'workflow_executions': 'workflow.executions', + 'runner_task_requested': 'runner.task.requested', # n8n 2.x+ + 'scaling_mode_queue_jobs_active': 'scaling.mode.queue.jobs.active', + 'scaling_mode_queue_jobs_completed': 'scaling.mode.queue.jobs.completed', + 'scaling_mode_queue_jobs_failed': 'scaling.mode.queue.jobs.failed', + 'scaling_mode_queue_jobs_waiting': 'scaling.mode.queue.jobs.waiting', + 'token_exchange_failures': 'token.exchange.failures', # n8n 2.x+ + 'token_exchange_identity_linked': 'token.exchange.identity.linked', # n8n 2.x+ + 'token_exchange_jit_provisioning': 'token.exchange.jit.provisioning', # n8n 2.x+ + 'token_exchange_requests': 'token.exchange.requests', # n8n 2.x+ + 'users': 'users.total', # n8n 2.x+, requires N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS + 'version_info': {'type': 'metadata', 'label': 'version', 'name': 'version'}, + 'workflow_execution_duration_seconds': 'workflow.execution.duration.seconds', # n8n 2.x+ 'workflow_failed': 'workflow.failed', 'workflow_started': 'workflow.started', 'workflow_success': 'workflow.success', - 'process_cpu_seconds': 'process.cpu.seconds', - 'version_info': 'version.info', - 'nodejs_version_info': 'nodejs.version.info', + 'workflows': 'workflows.total', # n8n 2.x+, requires N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS } -N8N_VERSION = {'version_info': {'type': 'metadata', 'label': 'version', 'name': 'version'}} -NODEJS_VERSION = {'nodejs_version_info': {'type': 'metadata', 'label': 'version', 'name': 'nodejs.version'}} - -METRIC_MAP.update(N8N_VERSION) -METRIC_MAP.update(NODEJS_VERSION) - RENAME_LABELS_MAP = { 'name': 'n8n_name', 'namespace': 'n8n_namespace', diff --git a/n8n/hatch.toml b/n8n/hatch.toml index 15f12fe355887..945448e0ac48b 100644 
--- a/n8n/hatch.toml +++ b/n8n/hatch.toml @@ -3,9 +3,10 @@ [[envs.default.matrix]] python = ["3.13"] -version = ["1.118.1"] +version = ["1", "2"] [envs.default.overrides] matrix.version.env-vars = [ - { key = "N8N_VERSION", value = "1.118.1", if = ["1.118.1"] }, -] \ No newline at end of file + { key = "N8N_VERSION", value = "1.118.1", if = ["1"] }, + { key = "N8N_VERSION", value = "2.19.5", if = ["2"] }, +] diff --git a/n8n/metadata.csv b/n8n/metadata.csv index 29f8c23c7483e..4ba6daa38c79e 100644 --- a/n8n/metadata.csv +++ b/n8n/metadata.csv @@ -1,32 +1,28 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags n8n.active.workflow.count,gauge,,,,Total number of active workflows.,0,n8n,,, -n8n.api.request.duration.seconds.bucket,count,,,,Histogram bucket for API request duration in seconds,0,n8n,,, -n8n.api.request.duration.seconds.count,count,,,,The count of API request duration in seconds,0,n8n,,, -n8n.api.request.duration.seconds.sum,count,,,,The sum of API request duration in seconds,0,n8n,,, -n8n.api.requests.count,count,,,,Total API requests,0,n8n,,, -n8n.cache.errors.count,count,,,,Cache errors,0,n8n,,, -n8n.cache.hits.count,count,,,,Cache hits,0,n8n,,, -n8n.cache.latency.seconds.bucket,count,,,,Histogram bucket for cache operation latency in seconds,0,n8n,,, -n8n.cache.latency.seconds.count,count,,,,The count of cache operation latency in seconds,0,n8n,,, -n8n.cache.latency.seconds.sum,count,,,,The sum of cache operation latency in seconds,0,n8n,,, -n8n.cache.misses.count,count,,,,Cache misses,0,n8n,,, -n8n.cache.operations.count,count,,,,Total cache operations,0,n8n,,, -n8n.eventbus.connections.total,gauge,,,,Active event bus backend connections,0,n8n,,, -n8n.eventbus.events.count,count,,,,Total events published on the event bus,0,n8n,,, -n8n.eventbus.events.failed.count,count,,,,Total failed event processing,0,n8n,,, -n8n.eventbus.events.processed.count,count,,,,Total processed 
events,0,n8n,,, -n8n.eventbus.queue.size,gauge,,,,Current event queue size,0,n8n,,, -n8n.http.request.duration.seconds.count,count,,,,The count of the http responses duration labeled with: status_code,0,n8n,,, -n8n.http.request.duration.seconds.sum,count,,,,The sum of the http responses duration labeled with: status_code,0,n8n,,, +n8n.audit.workflow.activated.count,count,,,,Total number of audited workflow activations. Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.executed.count,count,,,,Total number of audited workflow executions. Available in n8n 2.x and later.,0,n8n,,, +n8n.cache.hits.count,count,,,,Total number of cache hits.,0,n8n,,, +n8n.cache.misses.count,count,,,,Total number of cache misses.,0,n8n,,, +n8n.cache.updates.count,count,,,,Total number of cache updates.,0,n8n,,, +n8n.credentials.total,gauge,,,,Total number of credentials. Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, +n8n.embed.login.failures.count,count,,,,Total number of embed login failures broken down by reason. Available in n8n 2.x and later. Only emits samples after the first failure.,0,n8n,,, +n8n.embed.login.requests.count,count,,,,Total number of embed login requests (tagged with `result:success`/`result:failure`). Available in n8n 2.x and later.,0,n8n,,, +n8n.enabled.users,gauge,,,,Total number of enabled users. 
Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, +n8n.http.request.duration.seconds.bucket,count,,,,Histogram bucket for HTTP request duration in seconds labeled with status_code.,0,n8n,,, +n8n.http.request.duration.seconds.count,count,,,,The count of HTTP request duration samples.,0,n8n,,, +n8n.http.request.duration.seconds.sum,count,,,,The sum of HTTP request duration in seconds.,0,n8n,,, n8n.instance.role.leader,gauge,,,,Whether this main instance is the leader (1) or not (0).,0,n8n,,, n8n.last.activity,gauge,,second,,Time elapsed since the last instance activity (backend request).,0,n8n,,, +n8n.manual.executions,gauge,,,,Total number of manual workflow executions. Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, +n8n.node.finished.count,count,,,,Total number of node executions that finished. Emitted by worker processes in queue mode (n8n.node.finished event).,0,n8n,,, +n8n.node.started.count,count,,,,Total number of node executions that started. Emitted by worker processes in queue mode (n8n.node.started event).,0,n8n,,, n8n.nodejs.active.handles,gauge,,,,Number of active libuv handles grouped by handle type. Every handle type is C++ class name.,0,n8n,,, n8n.nodejs.active.handles.total,gauge,,,,Total number of active handles.,0,n8n,,, -n8n.nodejs.active.requests,gauge,,,,Number of active libuv requests grouped by request type. Every request type is C++ class name.,0,n8n,,, +n8n.nodejs.active.requests,gauge,,,,Number of active libuv requests grouped by request type. 
Only emits samples for request types currently in flight at scrape time (from prom-client's default collector via process._getActiveRequests).,0,n8n,,, n8n.nodejs.active.requests.total,gauge,,,,Total number of active requests.,0,n8n,,, -n8n.nodejs.active.resources,gauge,,,,"Number of active resources that are currently keeping the event loop alive, grouped by async resource type.",0,n8n,,, +n8n.nodejs.active.resources,gauge,,,,Number of active resources keeping the event loop alive grouped by async resource type.,0,n8n,,, n8n.nodejs.active.resources.total,gauge,,,,Total number of active resources.,0,n8n,,, -n8n.nodejs.event.loop.lag.seconds,gauge,,,,Event loop lag in seconds,0,n8n,,, n8n.nodejs.eventloop.lag.max.seconds,gauge,,,,The maximum recorded event loop delay.,0,n8n,,, n8n.nodejs.eventloop.lag.mean.seconds,gauge,,,,The mean of the recorded event loop delays.,0,n8n,,, n8n.nodejs.eventloop.lag.min.seconds,gauge,,,,The minimum recorded event loop delay.,0,n8n,,, @@ -36,47 +32,45 @@ n8n.nodejs.eventloop.lag.p99.seconds,gauge,,,,The 99th percentile of the recorde n8n.nodejs.eventloop.lag.seconds,gauge,,,,Lag of event loop in seconds.,0,n8n,,, n8n.nodejs.eventloop.lag.stddev.seconds,gauge,,,,The standard deviation of the recorded event loop delays.,0,n8n,,, n8n.nodejs.external.memory.bytes,gauge,,,,Node.js external memory size in bytes.,0,n8n,,, -n8n.nodejs.gc.duration.seconds.bucket,count,,,,Histogram bucket for garbage collection duration by kind,0,n8n,,, -n8n.nodejs.gc.duration.seconds.count,count,,,,The count of garbage collection duration by kind,0,n8n,,, -n8n.nodejs.gc.duration.seconds.sum,count,,,,The sum of garbage collection duration by kind,0,n8n,,, +n8n.nodejs.gc.duration.seconds.bucket,count,,,,Histogram bucket for garbage collection duration by kind.,0,n8n,,, +n8n.nodejs.gc.duration.seconds.count,count,,,,The count of garbage collection duration samples.,0,n8n,,, +n8n.nodejs.gc.duration.seconds.sum,count,,,,The sum of garbage collection duration in 
seconds.,0,n8n,,, n8n.nodejs.heap.size.total.bytes,gauge,,,,Process heap size from Node.js in bytes.,0,n8n,,, n8n.nodejs.heap.size.used.bytes,gauge,,,,Process heap size used from Node.js in bytes.,0,n8n,,, n8n.nodejs.heap.space.size.available.bytes,gauge,,,,Process heap space size available from Node.js in bytes.,0,n8n,,, n8n.nodejs.heap.space.size.total.bytes,gauge,,,,Process heap space size total from Node.js in bytes.,0,n8n,,, n8n.nodejs.heap.space.size.used.bytes,gauge,,,,Process heap space size used from Node.js in bytes.,0,n8n,,, -n8n.nodejs.heap.total.bytes,gauge,,,,Total heap size allocated in bytes,0,n8n,,, -n8n.nodejs.heap.used.bytes,gauge,,,,Heap memory used in bytes,0,n8n,,, n8n.process.cpu.seconds.count,count,,,,Total user and system CPU time spent in seconds.,0,n8n,,, n8n.process.cpu.system.seconds.count,count,,,,Total system CPU time spent in seconds.,0,n8n,,, n8n.process.cpu.user.seconds.count,count,,,,Total user CPU time spent in seconds.,0,n8n,,, n8n.process.heap.bytes,gauge,,,,Process heap size in bytes.,0,n8n,,, n8n.process.max.fds,gauge,,,,Maximum number of open file descriptors.,0,n8n,,, n8n.process.open.fds,gauge,,,,Number of open file descriptors.,0,n8n,,, +n8n.process.pss.bytes,gauge,,,,Proportional set size of the process in bytes. 
Available in n8n 2.x and later on Linux.,0,n8n,,, n8n.process.resident.memory.bytes,gauge,,,,Resident memory size in bytes.,0,n8n,,, -n8n.process.start.time.seconds,gauge,,,,Start time of the process since unix epoch in seconds.,0,n8n,,, -n8n.process.uptime.seconds,gauge,,,,Process uptime in seconds.,0,n8n,,, +n8n.process.uptime.seconds,gauge,,second,,Process uptime in seconds.,0,n8n,,, n8n.process.virtual.memory.bytes,gauge,,,,Virtual memory size in bytes.,0,n8n,,, -n8n.queue.job.active.total,gauge,,,,Number of jobs currently being processed,0,n8n,,, -n8n.queue.job.attempts.count,count,,,,Total number of job attempts,0,n8n,,, -n8n.queue.job.completed.count,count,,,,Number of jobs completed successfully,0,n8n,,, -n8n.queue.job.delayed.total,gauge,,,,Number of jobs scheduled to run later,0,n8n,,, -n8n.queue.job.dequeued.count,count,,,,Number of jobs dequeued (picked up from queue),0,n8n,,, -n8n.queue.job.enqueued.count,count,,,,Number of jobs added to the queue,0,n8n,,, -n8n.queue.job.failed.count,count,,,,Number of jobs that have failed,0,n8n,,, -n8n.queue.job.waiting.duration.seconds.bucket,count,,,,Histogram bucket for duration jobs spend waiting before being processed,0,n8n,,, -n8n.queue.job.waiting.duration.seconds.count,count,,,,The count of duration jobs spend waiting before being processed,0,n8n,,, -n8n.queue.job.waiting.duration.seconds.sum,count,,,,The sum of duration jobs spend waiting before being processed,0,n8n,,, -n8n.queue.job.waiting.total,gauge,,,,Number of jobs currently waiting in the queue,0,n8n,,, -n8n.queue.jobs.count,count,,,,Total number of queue jobs,0,n8n,,, -n8n.queue.jobs.duration.seconds.bucket,count,,,,Histogram bucket for job duration in seconds,0,n8n,,, -n8n.queue.jobs.duration.seconds.count,count,,,,The count of job duration in seconds,0,n8n,,, -n8n.queue.jobs.duration.seconds.sum,count,,,,The sum of job duration in seconds,0,n8n,,, -n8n.readiness.check,gauge,,,,Readiness check status (1 if ready with status code 200 otherwise 0) 
with status code tag,0,n8n,,,status_code -n8n.workflow.executions.active,gauge,,,,Number of active workflow executions,0,n8n,,, -n8n.workflow.executions.count,count,,,,Total number of workflow executions,0,n8n,,, -n8n.workflow.executions.duration.seconds.bucket,count,,,,Histogram bucket for workflow execution duration in seconds,0,n8n,,, -n8n.workflow.executions.duration.seconds.count,count,,,,The count of workflow execution duration in seconds,0,n8n,,, -n8n.workflow.executions.duration.seconds.sum,count,,,,The sum of workflow execution duration in seconds,0,n8n,,, -n8n.workflow.failed.count,count,,,,Total number of workflows that failed,0,n8n,,, -n8n.workflow.started.count,count,,,,Total number of workflows started,0,n8n,,, -n8n.workflow.success.count,count,,,,Total number of workflows completed successfully,0,n8n,,, +n8n.production.executions,gauge,,,,Total number of production workflow executions. Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, +n8n.production.root.executions,gauge,,,,Total number of production root workflow executions (excludes sub-workflows). Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, +n8n.queue.job.completed.count,count,,,,Number of jobs completed successfully (n8n.queue.job.completed event).,0,n8n,,, +n8n.queue.job.dequeued.count,count,,,,Number of jobs dequeued by workers (n8n.queue.job.dequeued event). Emitted by worker processes in queue mode.,0,n8n,,, +n8n.queue.job.enqueued.count,count,,,,Number of jobs added to the queue (n8n.queue.job.enqueued event).,0,n8n,,, +n8n.queue.job.failed.count,count,,,,Number of jobs that have failed (n8n.queue.job.failed event).,0,n8n,,, +n8n.readiness.check,gauge,,,,Readiness check status (1 if ready with status code 200 otherwise 0) with status code tag.,0,n8n,,,status_code +n8n.runner.task.requested.count,count,,,,Total number of runner tasks requested by worker processes. 
Available in n8n 2.x and later.,0,n8n,,, +n8n.scaling.mode.queue.jobs.active,gauge,,,,Current number of jobs being processed across all workers in scaling mode.,0,n8n,,, +n8n.scaling.mode.queue.jobs.completed.count,count,,,,Total number of jobs completed across all workers in scaling mode since instance start.,0,n8n,,, +n8n.scaling.mode.queue.jobs.failed.count,count,,,,Total number of jobs failed across all workers in scaling mode since instance start.,0,n8n,,, +n8n.scaling.mode.queue.jobs.waiting,gauge,,,,Current number of enqueued jobs waiting for pickup in scaling mode.,0,n8n,,, +n8n.token.exchange.failures.count,count,,,,Total number of token exchange failures broken down by reason. Available in n8n 2.x and later. Only emits samples after the first failure.,0,n8n,,, +n8n.token.exchange.identity.linked.count,count,,,,Total number of identities linked to existing users via token exchange. Available in n8n 2.x and later.,0,n8n,,, +n8n.token.exchange.jit.provisioning.count,count,,,,Total number of users JIT-provisioned via token exchange. Available in n8n 2.x and later.,0,n8n,,, +n8n.token.exchange.requests.count,count,,,,Total number of token exchange requests. Available in n8n 2.x and later.,0,n8n,,, +n8n.users.total,gauge,,,,Total number of users. Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, +n8n.workflow.execution.duration.seconds.bucket,count,,,,Histogram bucket for workflow execution duration in seconds. Available in n8n 2.x and later.,0,n8n,,, +n8n.workflow.execution.duration.seconds.count,count,,,,The count of workflow execution duration samples. Available in n8n 2.x and later.,0,n8n,,, +n8n.workflow.execution.duration.seconds.sum,count,,,,The sum of workflow execution duration in seconds. 
Available in n8n 2.x and later.,0,n8n,,, +n8n.workflow.failed.count,count,,,,Total number of workflows that failed (n8n.workflow.failed event).,0,n8n,,, +n8n.workflow.started.count,count,,,,Total number of workflows started (n8n.workflow.started event).,0,n8n,,, +n8n.workflow.success.count,count,,,,Total number of workflows completed successfully (n8n.workflow.success event).,0,n8n,,, +n8n.workflows.total,gauge,,,,Total number of workflows. Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, diff --git a/n8n/tests/common.py b/n8n/tests/common.py index 34e3fb84ead38..5403b42a3317c 100644 --- a/n8n/tests/common.py +++ b/n8n/tests/common.py @@ -3,103 +3,127 @@ # Licensed under a 3-clause BSD style license (see LICENSE) import os +from datadog_checks.base.stubs.aggregator import AggregatorStub from datadog_checks.dev import get_docker_hostname +from datadog_checks.dev.utils import find_free_ports, get_metadata_metrics HERE = os.path.dirname(os.path.abspath(__file__)) COMPOSE_FILE = os.path.join(HERE, 'docker', 'docker-compose.yaml') HOST = get_docker_hostname() -PORT = 5678 +# Allocate free host ports once per session. The values are forwarded to docker compose via +# the ``env_vars`` argument of ``docker_run`` (see ``conftest.py``) so re-runs don't collide +# with stale containers or other locally-bound services. The in-container ports stay fixed. +MAIN_PORT, WORKER_PORT = find_free_ports('127.0.0.1', 2) -def get_fixture_path(filename): + +def get_compose_env_vars() -> dict[str, str]: + """Variables consumed by docker-compose.yaml's ``${...}`` placeholders.""" + return { + 'N8N_MAIN_HOST_PORT': str(MAIN_PORT), + 'N8N_WORKER_HOST_PORT': str(WORKER_PORT), + } + + +N8N_VERSION = os.environ.get('N8N_VERSION', '1.118.1') +N8N_MAJOR = int(N8N_VERSION.split('.', 1)[0]) + +# Submitted by the check itself, not by the OpenMetrics scrape. 
+CHECK_LEVEL_METRIC_NAMES = frozenset({'n8n.readiness.check'}) + +# Metric families introduced in n8n 2.x — verified live against n8n@1.118.1 and n8n@2.19.5. +V2_ONLY_METRIC_NAMES = frozenset( + { + 'n8n.audit.workflow.activated.count', + 'n8n.audit.workflow.executed.count', + 'n8n.credentials.total', + 'n8n.embed.login.failures.count', + 'n8n.embed.login.requests.count', + 'n8n.enabled.users', + 'n8n.manual.executions', + 'n8n.process.pss.bytes', + 'n8n.production.executions', + 'n8n.production.root.executions', + 'n8n.runner.task.requested.count', + 'n8n.token.exchange.failures.count', + 'n8n.token.exchange.identity.linked.count', + 'n8n.token.exchange.jit.provisioning.count', + 'n8n.token.exchange.requests.count', + 'n8n.users.total', + 'n8n.workflow.execution.duration.seconds.bucket', + 'n8n.workflow.execution.duration.seconds.count', + 'n8n.workflow.execution.duration.seconds.sum', + 'n8n.workflows.total', + } +) + +# Metrics that are mapped and present in metadata but only emit samples after a specific +# event fires (auth failure, libuv request mid-flight). The unit fixture has synthetic +# samples for them; live integration/e2e runs cannot guarantee samples and exclude them +# from the symmetric metadata assertion. +RARE_EVENT_METRIC_NAMES = frozenset( + { + 'n8n.embed.login.failures.count', + 'n8n.token.exchange.failures.count', + # prom-client's per-type libuv request gauge: only has samples while a libuv request is in flight + # at scrape time, so live containers can produce or omit it depending on timing. 
+ 'n8n.nodejs.active.requests', + } +) + + +def get_fixture_path(filename: str) -> str: return os.path.join(HERE, 'fixtures', filename) -OPENMETRICS_URL = f'http://{HOST}:{PORT}' -INSTANCE = { - 'openmetrics_endpoint': f'{OPENMETRICS_URL}/metrics', +def get_metadata_metrics_for_version(major: int = N8N_MAJOR, *, exclude_rare: bool = False) -> dict: + """Return the metadata.csv subset that the given n8n major version is expected to emit.""" + metadata = get_metadata_metrics() + if major < 2: + for name in V2_ONLY_METRIC_NAMES: + metadata.pop(name, None) + if exclude_rare: + for name in RARE_EVENT_METRIC_NAMES: + metadata.pop(name, None) + return metadata + + +def get_openmetrics_metadata_metrics(major: int = N8N_MAJOR, *, exclude_rare: bool = False) -> dict: + """Version-aware metadata subset minus metrics submitted by the check itself.""" + metadata = get_metadata_metrics_for_version(major, exclude_rare=exclude_rare) + for name in CHECK_LEVEL_METRIC_NAMES: + metadata.pop(name, None) + return metadata + + +def get_all_metadata_metrics(major: int = N8N_MAJOR, *, exclude_rare: bool = False) -> dict: + """Version-aware metadata subset including the readiness gauge submitted by the check.""" + return get_metadata_metrics_for_version(major, exclude_rare=exclude_rare) + + +def drop_rare_event_metrics(aggregator: AggregatorStub): + """Strip rare-event metrics from the aggregator before a symmetric metadata assertion. + + These metrics are mapped and present in metadata.csv but only emit samples opportunistically + (auth failures, libuv requests in flight). Live containers may submit them or not depending on + timing, which makes ``check_symmetric_inclusion=True`` flaky in either direction. Dropping them + from the aggregator (and from the metadata subset via ``exclude_rare=True``) keeps the + symmetric check stable while still verifying the rest of the surface end-to-end. 
+ """ + for name in RARE_EVENT_METRIC_NAMES: + aggregator._metrics.pop(name, None) + + +MAIN_INSTANCE = { + 'openmetrics_endpoint': f'http://{HOST}:{MAIN_PORT}/metrics', + 'tags': ['n8n_process:main'], } +WORKER_INSTANCE = { + 'openmetrics_endpoint': f'http://{HOST}:{WORKER_PORT}/metrics', + 'tags': ['n8n_process:worker'], +} +INSTANCE = MAIN_INSTANCE # back-compat default for unit tests E2E_METADATA = { 'docker_volumes': ['/var/run/docker.sock:/var/run/docker.sock:ro'], } - -TEST_METRICS = [ - 'n8n.active.workflow.count', - 'n8n.api.request.duration.seconds.bucket', - 'n8n.api.request.duration.seconds.count', - 'n8n.api.request.duration.seconds.sum', - 'n8n.api.requests.count', - 'n8n.cache.errors.count', - 'n8n.cache.hits.count', - 'n8n.cache.latency.seconds.bucket', - 'n8n.cache.latency.seconds.count', - 'n8n.cache.latency.seconds.sum', - 'n8n.cache.misses.count', - 'n8n.cache.operations.count', - 'n8n.eventbus.connections.total', - 'n8n.eventbus.events.failed.count', - 'n8n.eventbus.events.processed.count', - 'n8n.eventbus.events.count', - 'n8n.eventbus.queue.size', - 'n8n.instance.role.leader', - 'n8n.last.activity', - 'n8n.nodejs.active.handles', - 'n8n.nodejs.active.handles.total', - 'n8n.nodejs.active.requests.total', - 'n8n.nodejs.active.resources', - 'n8n.nodejs.active.resources.total', - 'n8n.nodejs.event.loop.lag.seconds', - 'n8n.nodejs.eventloop.lag.max.seconds', - 'n8n.nodejs.eventloop.lag.mean.seconds', - 'n8n.nodejs.eventloop.lag.min.seconds', - 'n8n.nodejs.eventloop.lag.p50.seconds', - 'n8n.nodejs.eventloop.lag.p90.seconds', - 'n8n.nodejs.eventloop.lag.p99.seconds', - 'n8n.nodejs.eventloop.lag.seconds', - 'n8n.nodejs.eventloop.lag.stddev.seconds', - 'n8n.nodejs.external.memory.bytes', - 'n8n.nodejs.gc.duration.seconds.bucket', - 'n8n.nodejs.gc.duration.seconds.count', - 'n8n.nodejs.gc.duration.seconds.sum', - 'n8n.nodejs.heap.size.total.bytes', - 'n8n.nodejs.heap.size.used.bytes', - 'n8n.nodejs.heap.space.size.available.bytes', - 
'n8n.nodejs.heap.space.size.total.bytes', - 'n8n.nodejs.heap.space.size.used.bytes', - 'n8n.nodejs.heap.total.bytes', - 'n8n.nodejs.heap.used.bytes', - 'n8n.process.cpu.system.seconds.count', - 'n8n.process.cpu.user.seconds.count', - 'n8n.process.heap.bytes', - 'n8n.process.max.fds', - 'n8n.process.open.fds', - 'n8n.process.resident.memory.bytes', - 'n8n.process.uptime.seconds', - 'n8n.process.virtual.memory.bytes', - 'n8n.queue.job.active.total', - 'n8n.queue.job.attempts.count', - 'n8n.queue.job.completed.count', - 'n8n.queue.job.delayed.total', - 'n8n.queue.job.dequeued.count', - 'n8n.queue.job.enqueued.count', - 'n8n.queue.job.failed.count', - 'n8n.queue.job.waiting.duration.seconds.bucket', - 'n8n.queue.job.waiting.duration.seconds.count', - 'n8n.queue.job.waiting.duration.seconds.sum', - 'n8n.queue.job.waiting.total', - 'n8n.queue.jobs.duration.seconds.bucket', - 'n8n.queue.jobs.duration.seconds.count', - 'n8n.queue.jobs.duration.seconds.sum', - 'n8n.queue.jobs.count', - 'n8n.readiness.check', - 'n8n.workflow.executions.active', - 'n8n.workflow.executions.duration.seconds.bucket', - 'n8n.workflow.executions.duration.seconds.count', - 'n8n.workflow.executions.duration.seconds.sum', - 'n8n.workflow.executions.count', - 'n8n.workflow.failed.count', - 'n8n.workflow.started.count', - 'n8n.workflow.success.count', - 'n8n.process.cpu.seconds.count', -] diff --git a/n8n/tests/conftest.py b/n8n/tests/conftest.py index c6face31f7d4c..4f560ece01fa9 100644 --- a/n8n/tests/conftest.py +++ b/n8n/tests/conftest.py @@ -3,27 +3,122 @@ # Licensed under a 3-clause BSD style license (see LICENSE) import copy +import subprocess +import time +from typing import Any, Iterator import pytest +import requests from datadog_checks.dev import docker_run from datadog_checks.dev.conditions import CheckEndpoints from . 
import common +WORKFLOW_OK_PATH = '/workflows/sample_workflow.json' +WORKFLOW_FAIL_PATH = '/workflows/sample_workflow_failing.json' +WORKFLOW_OK_ID = 'testWorkflowOk' +WORKFLOW_FAIL_ID = 'testWorkflowFail' + +WEBHOOK_OK_PATH = '/webhook/test' +WEBHOOK_FAIL_PATH = '/webhook/fail' + +CONTAINER = 'n8n-test' + + +def _docker_exec(*cmd: str) -> str: + return subprocess.check_output(['docker', 'exec', CONTAINER, *cmd], stderr=subprocess.STDOUT).decode() + + +def _wait_for_n8n(timeout: int = 90): + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + try: + if requests.get(f'http://{common.HOST}:{common.MAIN_PORT}/healthz', timeout=2).status_code == 200: + return + except requests.RequestException: + pass + time.sleep(2) + raise RuntimeError('n8n did not become healthy in time') + + +def _activate_imported_workflows(): + """Import sample workflows by stable id, activate them, restart n8n so webhooks register.""" + _docker_exec('n8n', 'import:workflow', f'--input={WORKFLOW_OK_PATH}') + _docker_exec('n8n', 'import:workflow', f'--input={WORKFLOW_FAIL_PATH}') + + for wf_id in (WORKFLOW_OK_ID, WORKFLOW_FAIL_ID): + _docker_exec('n8n', 'update:workflow', f'--id={wf_id}', '--active=true') + + subprocess.check_call( + ['docker', 'compose', '-f', common.COMPOSE_FILE, 'restart', 'n8n'], + stderr=subprocess.STDOUT, + ) + _wait_for_n8n() + + +def _generate_workflow_traffic(iterations: int = 5): + """Trigger workflows + API endpoints so workflow event and HTTP histogram metrics fire. + + Failures are not silently swallowed — at least the OK webhook must respond, otherwise + the test fixture is broken and downstream metric assertions can't be trusted. 
+ """ + base_url = f'http://{common.HOST}:{common.MAIN_PORT}' + api_paths = ('/healthz', '/healthz/readiness', '/rest/login') + ok_responses = 0 + for _ in range(iterations): + try: + ok = requests.get(f'{base_url}{WEBHOOK_OK_PATH}', timeout=5) + if ok.status_code < 500: + ok_responses += 1 + except requests.RequestException: + pass + # Webhook fail is *expected* to error out — that's the point of triggering it. + for path in (WEBHOOK_FAIL_PATH, *api_paths): + try: + requests.get(f'{base_url}{path}', timeout=5) + except requests.RequestException: + pass + if ok_responses == 0: + raise RuntimeError('Test webhook returned no successful responses; workflow registration failed') + + +def _wait_for_workflow_metric(timeout: int = 30): + """Poll /metrics until at least one workflow_started_total sample is non-zero.""" + deadline = time.monotonic() + timeout + metrics_url = common.MAIN_INSTANCE['openmetrics_endpoint'] + while time.monotonic() < deadline: + try: + payload = requests.get(metrics_url, timeout=3).text + for line in payload.splitlines(): + if line.startswith('n8n_workflow_started_total') and not line.endswith(' 0'): + return + except requests.RequestException: + pass + time.sleep(2) + raise RuntimeError('workflow_started_total never went non-zero') + @pytest.fixture(scope='session') -def dd_environment(): - compose_file = common.COMPOSE_FILE +def dd_environment() -> Iterator[dict[str, Any]]: conditions = [ - CheckEndpoints(common.INSTANCE["openmetrics_endpoint"]), + CheckEndpoints(common.MAIN_INSTANCE['openmetrics_endpoint']), + CheckEndpoints(common.WORKER_INSTANCE['openmetrics_endpoint']), ] - with docker_run(compose_file, conditions=conditions): + with docker_run(common.COMPOSE_FILE, conditions=conditions, env_vars=common.get_compose_env_vars()): + _activate_imported_workflows() + _generate_workflow_traffic() + _wait_for_workflow_metric() yield { - 'instances': [common.INSTANCE], + 'instances': [common.MAIN_INSTANCE, common.WORKER_INSTANCE], } 
@pytest.fixture -def instance(): - return copy.deepcopy(common.INSTANCE) +def instance() -> dict[str, Any]: + return copy.deepcopy(common.MAIN_INSTANCE) + + +@pytest.fixture +def worker_instance() -> dict[str, Any]: + return copy.deepcopy(common.WORKER_INSTANCE) diff --git a/n8n/tests/docker/Dockerfile b/n8n/tests/docker/Dockerfile deleted file mode 100644 index d74b7ccd9c162..0000000000000 --- a/n8n/tests/docker/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -ARG N8N_VERSION=1.118.1 -FROM n8nio/n8n:${N8N_VERSION} - -# Set environment variables to enable metrics and logging -ENV N8N_METRICS=true \ - N8N_LOG_LEVEL=debug \ - N8N_METRICS_INCLUDE_DEFAULT_METRICS=true \ - N8N_METRICS_INCLUDE_CACHE_METRICS=true \ - N8N_METRICS_INCLUDE_MESSAGE_EVENT_BUS_METRICS=true \ - N8N_HOST=0.0.0.0 \ - N8N_PORT=5678 - -# Expose the n8n port -EXPOSE 5678 diff --git a/n8n/tests/docker/README.md b/n8n/tests/docker/README.md index bb1d23cc34ce1..ac2ded112e06f 100644 --- a/n8n/tests/docker/README.md +++ b/n8n/tests/docker/README.md @@ -82,7 +82,8 @@ This setup is designed for integration testing. The n8n instance will: ## Notes -- The container uses the latest official n8n Docker image +- The container uses the official `n8nio/n8n` image at the version selected via the `N8N_VERSION` environment variable (forwarded by `hatch.toml`'s test matrix). The default in `docker-compose.yaml` is `1.118.1`. +- Queue mode is enabled with a Redis container and a separate `n8n-worker` service that exposes its own `/metrics` endpoint on host port `5680` (the default `5679` collides with the n8n 2.x task runner broker). 
- Data is persisted in a Docker volume named `n8n_data` - The health check waits up to 30 seconds for n8n to start before marking it as healthy diff --git a/n8n/tests/docker/docker-compose.yaml b/n8n/tests/docker/docker-compose.yaml index fb8da72559b78..554114d2819a2 100644 --- a/n8n/tests/docker/docker-compose.yaml +++ b/n8n/tests/docker/docker-compose.yaml @@ -1,33 +1,50 @@ services: + redis: + image: redis:7-alpine + container_name: n8n-test-redis + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 5 + n8n: - build: - context: . - dockerfile: Dockerfile + image: n8nio/n8n:${N8N_VERSION:-1.118.1} container_name: n8n-test ports: - - "5678:5678" + - "${N8N_MAIN_HOST_PORT:-5678}:5678" environment: - # Enable metrics endpoint - - N8N_METRICS=true - - N8N_METRICS_INCLUDE_DEFAULT_METRICS=true - - N8N_METRICS_INCLUDE_CACHE_METRICS=true - - N8N_METRICS_INCLUDE_MESSAGE_EVENT_BUS_METRICS=true - - N8N_METRICS_INCLUDE_API_ENDPOINTS=true - - N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL=true - # Logging configuration + - EXECUTIONS_MODE=queue + - QUEUE_BULL_REDIS_HOST=redis + - QUEUE_BULL_REDIS_PORT=6379 - N8N_LOG_LEVEL=debug - N8N_LOG_OUTPUT=console - # Basic configuration - N8N_HOST=0.0.0.0 - N8N_PORT=5678 - N8N_PROTOCOL=http - # Authentication (optional for testing) - N8N_BASIC_AUTH_ACTIVE=true - N8N_BASIC_AUTH_USER=admin - N8N_BASIC_AUTH_PASSWORD=admin + - N8N_DIAGNOSTICS_ENABLED=false + - N8N_VERSION_NOTIFICATIONS_ENABLED=false + - N8N_TEMPLATES_ENABLED=false + - N8N_RUNNERS_ENABLED=false + - N8N_METRICS=true + - N8N_METRICS_INCLUDE_DEFAULT_METRICS=true + - N8N_METRICS_INCLUDE_CACHE_METRICS=true + - N8N_METRICS_INCLUDE_MESSAGE_EVENT_BUS_METRICS=true + - N8N_METRICS_INCLUDE_API_ENDPOINTS=true + - N8N_METRICS_INCLUDE_QUEUE_METRICS=true + - N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL=true + - N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS=true volumes: - n8n_data:/home/node/.n8n - ${N8N_LOG_FOLDER:-./logs}:/var/log/n8n + - 
./sample_workflow.json:/workflows/sample_workflow.json:ro + - ./sample_workflow_failing.json:/workflows/sample_workflow_failing.json:ro + depends_on: + redis: + condition: service_healthy healthcheck: test: ["CMD", "wget", "-q", "--spider", "http://localhost:5678/healthz"] interval: 10s @@ -35,7 +52,35 @@ services: retries: 5 start_period: 30s + n8n-worker: + image: n8nio/n8n:${N8N_VERSION:-1.118.1} + container_name: n8n-test-worker + command: ["worker"] + ports: + - "${N8N_WORKER_HOST_PORT:-5680}:5680" + environment: + - EXECUTIONS_MODE=queue + - QUEUE_BULL_REDIS_HOST=redis + - QUEUE_BULL_REDIS_PORT=6379 + - N8N_LOG_LEVEL=info + - N8N_LOG_OUTPUT=console + - N8N_RUNNERS_ENABLED=false + - N8N_METRICS=true + - N8N_METRICS_INCLUDE_DEFAULT_METRICS=true + - N8N_METRICS_INCLUDE_CACHE_METRICS=true + - N8N_METRICS_INCLUDE_MESSAGE_EVENT_BUS_METRICS=true + - N8N_METRICS_INCLUDE_API_ENDPOINTS=true + - N8N_METRICS_INCLUDE_QUEUE_METRICS=true + - N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL=true + - N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS=true + - QUEUE_HEALTH_CHECK_ACTIVE=true + - QUEUE_HEALTH_CHECK_PORT=5680 + volumes: + - n8n_data:/home/node/.n8n + depends_on: + n8n: + condition: service_healthy + volumes: n8n_data: driver: local - diff --git a/n8n/tests/docker/sample_workflow.json b/n8n/tests/docker/sample_workflow.json new file mode 100644 index 0000000000000..94400565cfd9e --- /dev/null +++ b/n8n/tests/docker/sample_workflow.json @@ -0,0 +1,59 @@ +{ + "id": "testWorkflowOk", + "versionId": "00000000-0000-0000-0000-000000000001", + "name": "Test Workflow", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "test", + "responseMode": "lastNode", + "options": {} + }, + "id": "11111111-1111-1111-1111-111111111111", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "test-webhook-aaaa-bbbb-cccc-111111111111" + }, + { + "parameters": { + "assignments": { + "assignments": [ + { + "id": "1", + "name": 
"ok", + "value": "true", + "type": "string" + } + ] + }, + "options": {} + }, + "id": "22222222-2222-2222-2222-222222222222", + "name": "Set", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [460, 300] + } + ], + "connections": { + "Webhook": { + "main": [ + [ + { + "node": "Set", + "type": "main", + "index": 0 + } + ] + ] + } + }, + "active": false, + "settings": { + "executionOrder": "v1" + }, + "pinData": {} +} diff --git a/n8n/tests/docker/sample_workflow_failing.json b/n8n/tests/docker/sample_workflow_failing.json new file mode 100644 index 0000000000000..159f08bfc8843 --- /dev/null +++ b/n8n/tests/docker/sample_workflow_failing.json @@ -0,0 +1,50 @@ +{ + "id": "testWorkflowFail", + "versionId": "00000000-0000-0000-0000-000000000002", + "name": "Failing Test Workflow", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "fail", + "responseMode": "lastNode", + "options": {} + }, + "id": "33333333-3333-3333-3333-333333333333", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "test-fail-aaaa-bbbb-cccc-333333333333" + }, + { + "parameters": { + "language": "javaScript", + "jsCode": "throw new Error('intentional failure for metrics tests');" + }, + "id": "44444444-4444-4444-4444-444444444444", + "name": "Code", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [460, 300] + } + ], + "connections": { + "Webhook": { + "main": [ + [ + { + "node": "Code", + "type": "main", + "index": 0 + } + ] + ] + } + }, + "active": false, + "settings": { + "executionOrder": "v1" + }, + "pinData": {} +} diff --git a/n8n/tests/fixtures/n8n.txt b/n8n/tests/fixtures/n8n.txt index c670f02d7fe46..0a47cf518a9c0 100644 --- a/n8n/tests/fixtures/n8n.txt +++ b/n8n/tests/fixtures/n8n.txt @@ -1,34 +1,34 @@ # HELP n8n_process_cpu_user_seconds_total Total user CPU time spent in seconds. 
# TYPE n8n_process_cpu_user_seconds_total counter -n8n_process_cpu_user_seconds_total 8.298932999999998 +n8n_process_cpu_user_seconds_total 0.921656 # HELP n8n_process_cpu_system_seconds_total Total system CPU time spent in seconds. # TYPE n8n_process_cpu_system_seconds_total counter -n8n_process_cpu_system_seconds_total 3.1041119999999998 +n8n_process_cpu_system_seconds_total 0.157367 # HELP n8n_process_cpu_seconds_total Total user and system CPU time spent in seconds. # TYPE n8n_process_cpu_seconds_total counter -n8n_process_cpu_seconds_total 11.403044999999999 +n8n_process_cpu_seconds_total 1.0790229999999998 # HELP n8n_process_start_time_seconds Start time of the process since unix epoch in seconds. # TYPE n8n_process_start_time_seconds gauge -n8n_process_start_time_seconds 1761656578 +n8n_process_start_time_seconds 1778234580 # HELP n8n_process_resident_memory_bytes Resident memory size in bytes. # TYPE n8n_process_resident_memory_bytes gauge -n8n_process_resident_memory_bytes 245043200 +n8n_process_resident_memory_bytes 267681792 # HELP n8n_process_virtual_memory_bytes Virtual memory size in bytes. # TYPE n8n_process_virtual_memory_bytes gauge -n8n_process_virtual_memory_bytes 33656197120 +n8n_process_virtual_memory_bytes 18517532672 # HELP n8n_process_heap_bytes Process heap size in bytes. # TYPE n8n_process_heap_bytes gauge -n8n_process_heap_bytes 277200896 +n8n_process_heap_bytes 840728576 # HELP n8n_process_open_fds Number of open file descriptors. # TYPE n8n_process_open_fds gauge -n8n_process_open_fds 44 +n8n_process_open_fds 45 # HELP n8n_process_max_fds Maximum number of open file descriptors. # TYPE n8n_process_max_fds gauge @@ -36,59 +36,62 @@ n8n_process_max_fds 1048576 # HELP n8n_nodejs_eventloop_lag_seconds Lag of event loop in seconds. 
# TYPE n8n_nodejs_eventloop_lag_seconds gauge -n8n_nodejs_eventloop_lag_seconds 0.002765567 +n8n_nodejs_eventloop_lag_seconds 0.008676917 # HELP n8n_nodejs_eventloop_lag_min_seconds The minimum recorded event loop delay. # TYPE n8n_nodejs_eventloop_lag_min_seconds gauge -n8n_nodejs_eventloop_lag_min_seconds 0.010018816 +n8n_nodejs_eventloop_lag_min_seconds 0.006340608 # HELP n8n_nodejs_eventloop_lag_max_seconds The maximum recorded event loop delay. # TYPE n8n_nodejs_eventloop_lag_max_seconds gauge -n8n_nodejs_eventloop_lag_max_seconds 0.011239423 +n8n_nodejs_eventloop_lag_max_seconds 0.030228479 # HELP n8n_nodejs_eventloop_lag_mean_seconds The mean of the recorded event loop delays. # TYPE n8n_nodejs_eventloop_lag_mean_seconds gauge -n8n_nodejs_eventloop_lag_mean_seconds 0.010092521938958708 +n8n_nodejs_eventloop_lag_mean_seconds 0.012079332927643785 # HELP n8n_nodejs_eventloop_lag_stddev_seconds The standard deviation of the recorded event loop delays. # TYPE n8n_nodejs_eventloop_lag_stddev_seconds gauge -n8n_nodejs_eventloop_lag_stddev_seconds 0.00016945350643679045 +n8n_nodejs_eventloop_lag_stddev_seconds 0.0011467288819057616 # HELP n8n_nodejs_eventloop_lag_p50_seconds The 50th percentile of the recorded event loop delays. # TYPE n8n_nodejs_eventloop_lag_p50_seconds gauge -n8n_nodejs_eventloop_lag_p50_seconds 0.010067967 +n8n_nodejs_eventloop_lag_p50_seconds 0.012001279 # HELP n8n_nodejs_eventloop_lag_p90_seconds The 90th percentile of the recorded event loop delays. # TYPE n8n_nodejs_eventloop_lag_p90_seconds gauge -n8n_nodejs_eventloop_lag_p90_seconds 0.010067967 +n8n_nodejs_eventloop_lag_p90_seconds 0.013254655 # HELP n8n_nodejs_eventloop_lag_p99_seconds The 99th percentile of the recorded event loop delays. 
# TYPE n8n_nodejs_eventloop_lag_p99_seconds gauge -n8n_nodejs_eventloop_lag_p99_seconds 0.011124735 +n8n_nodejs_eventloop_lag_p99_seconds 0.014426111 # HELP n8n_nodejs_active_resources Number of active resources that are currently keeping the event loop alive, grouped by async resource type. # TYPE n8n_nodejs_active_resources gauge -n8n_nodejs_active_resources{type="PipeWrap"} 2 -n8n_nodejs_active_resources{type="TCPServerWrap"} 1 -n8n_nodejs_active_resources{type="TCPSocketWrap"} 1 -n8n_nodejs_active_resources{type="Timeout"} 13 +n8n_nodejs_active_resources{type="PipeWrap"} 5 +n8n_nodejs_active_resources{type="TCPServerWrap"} 2 +n8n_nodejs_active_resources{type="TCPSocketWrap"} 9 +n8n_nodejs_active_resources{type="ProcessWrap"} 1 +n8n_nodejs_active_resources{type="Timeout"} 20 n8n_nodejs_active_resources{type="Immediate"} 1 # HELP n8n_nodejs_active_resources_total Total number of active resources. # TYPE n8n_nodejs_active_resources_total gauge -n8n_nodejs_active_resources_total 18 +n8n_nodejs_active_resources_total 38 # HELP n8n_nodejs_active_handles Number of active libuv handles grouped by handle type. Every handle type is C++ class name. # TYPE n8n_nodejs_active_handles gauge -n8n_nodejs_active_handles{type="Socket"} 3 -n8n_nodejs_active_handles{type="Server"} 1 +n8n_nodejs_active_handles{type="Socket"} 14 +n8n_nodejs_active_handles{type="Server"} 2 +n8n_nodejs_active_handles{type="ChildProcess"} 1 # HELP n8n_nodejs_active_handles_total Total number of active handles. # TYPE n8n_nodejs_active_handles_total gauge -n8n_nodejs_active_handles_total 4 +n8n_nodejs_active_handles_total 17 # HELP n8n_nodejs_active_requests Number of active libuv requests grouped by request type. Every request type is C++ class name. # TYPE n8n_nodejs_active_requests gauge +n8n_nodejs_active_requests{type="FSReqCallback"} 1 # HELP n8n_nodejs_active_requests_total Total number of active requests. 
# TYPE n8n_nodejs_active_requests_total gauge @@ -96,81 +99,87 @@ n8n_nodejs_active_requests_total 0 # HELP n8n_nodejs_heap_size_total_bytes Process heap size from Node.js in bytes. # TYPE n8n_nodejs_heap_size_total_bytes gauge -n8n_nodejs_heap_size_total_bytes 142774272 +n8n_nodejs_heap_size_total_bytes 146391040 # HELP n8n_nodejs_heap_size_used_bytes Process heap size used from Node.js in bytes. # TYPE n8n_nodejs_heap_size_used_bytes gauge -n8n_nodejs_heap_size_used_bytes 136342632 +n8n_nodejs_heap_size_used_bytes 136336448 # HELP n8n_nodejs_external_memory_bytes Node.js external memory size in bytes. # TYPE n8n_nodejs_external_memory_bytes gauge -n8n_nodejs_external_memory_bytes 20824585 +n8n_nodejs_external_memory_bytes 20993559 # HELP n8n_nodejs_heap_space_size_total_bytes Process heap space size total from Node.js in bytes. # TYPE n8n_nodejs_heap_space_size_total_bytes gauge n8n_nodejs_heap_space_size_total_bytes{space="read_only"} 0 -n8n_nodejs_heap_space_size_total_bytes{space="new"} 1048576 -n8n_nodejs_heap_space_size_total_bytes{space="old"} 122208256 -n8n_nodejs_heap_space_size_total_bytes{space="code"} 4718592 +n8n_nodejs_heap_space_size_total_bytes{space="new"} 2097152 +n8n_nodejs_heap_space_size_total_bytes{space="old"} 116920320 +n8n_nodejs_heap_space_size_total_bytes{space="code"} 5505024 n8n_nodejs_heap_space_size_total_bytes{space="shared"} 0 -n8n_nodejs_heap_space_size_total_bytes{space="trusted"} 7643136 +n8n_nodejs_heap_space_size_total_bytes{space="trusted"} 11624448 +n8n_nodejs_heap_space_size_total_bytes{space="shared_trusted"} 0 n8n_nodejs_heap_space_size_total_bytes{space="new_large_object"} 0 -n8n_nodejs_heap_space_size_total_bytes{space="large_object"} 7000064 -n8n_nodejs_heap_space_size_total_bytes{space="code_large_object"} 155648 +n8n_nodejs_heap_space_size_total_bytes{space="large_object"} 9875456 +n8n_nodejs_heap_space_size_total_bytes{space="code_large_object"} 368640 
n8n_nodejs_heap_space_size_total_bytes{space="shared_large_object"} 0 +n8n_nodejs_heap_space_size_total_bytes{space="shared_trusted_large_object"} 0 n8n_nodejs_heap_space_size_total_bytes{space="trusted_large_object"} 0 # HELP n8n_nodejs_heap_space_size_used_bytes Process heap space size used from Node.js in bytes. # TYPE n8n_nodejs_heap_space_size_used_bytes gauge n8n_nodejs_heap_space_size_used_bytes{space="read_only"} 0 -n8n_nodejs_heap_space_size_used_bytes{space="new"} 652896 -n8n_nodejs_heap_space_size_used_bytes{space="old"} 119347344 -n8n_nodejs_heap_space_size_used_bytes{space="code"} 4183424 +n8n_nodejs_heap_space_size_used_bytes{space="new"} 382808 +n8n_nodejs_heap_space_size_used_bytes{space="old"} 111099512 +n8n_nodejs_heap_space_size_used_bytes{space="code"} 4853344 n8n_nodejs_heap_space_size_used_bytes{space="shared"} 0 -n8n_nodejs_heap_space_size_used_bytes{space="trusted"} 5187192 +n8n_nodejs_heap_space_size_used_bytes{space="trusted"} 9839592 +n8n_nodejs_heap_space_size_used_bytes{space="shared_trusted"} 0 n8n_nodejs_heap_space_size_used_bytes{space="new_large_object"} 0 -n8n_nodejs_heap_space_size_used_bytes{space="large_object"} 6837144 -n8n_nodejs_heap_space_size_used_bytes{space="code_large_object"} 138432 +n8n_nodejs_heap_space_size_used_bytes{space="large_object"} 9806288 +n8n_nodejs_heap_space_size_used_bytes{space="code_large_object"} 361728 n8n_nodejs_heap_space_size_used_bytes{space="shared_large_object"} 0 +n8n_nodejs_heap_space_size_used_bytes{space="shared_trusted_large_object"} 0 n8n_nodejs_heap_space_size_used_bytes{space="trusted_large_object"} 0 # HELP n8n_nodejs_heap_space_size_available_bytes Process heap space size available from Node.js in bytes. 
# TYPE n8n_nodejs_heap_space_size_available_bytes gauge n8n_nodejs_heap_space_size_available_bytes{space="read_only"} 0 -n8n_nodejs_heap_space_size_available_bytes{space="new"} 378016 -n8n_nodejs_heap_space_size_available_bytes{space="old"} 430568 -n8n_nodejs_heap_space_size_available_bytes{space="code"} 239680 +n8n_nodejs_heap_space_size_available_bytes{space="new"} 665704 +n8n_nodejs_heap_space_size_available_bytes{space="old"} 5484264 +n8n_nodejs_heap_space_size_available_bytes{space="code"} 651008 n8n_nodejs_heap_space_size_available_bytes{space="shared"} 0 -n8n_nodejs_heap_space_size_available_bytes{space="trusted"} 2323072 +n8n_nodejs_heap_space_size_available_bytes{space="trusted"} 1771032 +n8n_nodejs_heap_space_size_available_bytes{space="shared_trusted"} 0 n8n_nodejs_heap_space_size_available_bytes{space="new_large_object"} 1048576 n8n_nodejs_heap_space_size_available_bytes{space="large_object"} 0 n8n_nodejs_heap_space_size_available_bytes{space="code_large_object"} 0 n8n_nodejs_heap_space_size_available_bytes{space="shared_large_object"} 0 +n8n_nodejs_heap_space_size_available_bytes{space="shared_trusted_large_object"} 0 n8n_nodejs_heap_space_size_available_bytes{space="trusted_large_object"} 0 # HELP n8n_nodejs_version_info Node.js version info. # TYPE n8n_nodejs_version_info gauge -n8n_nodejs_version_info{version="v22.18.0",major="22",minor="18",patch="0"} 1 +n8n_nodejs_version_info{version="v24.14.1",major="24",minor="14",patch="1"} 1 # HELP n8n_nodejs_gc_duration_seconds Garbage collection duration by kind, one of major, minor, incremental or weakcb. 
# TYPE n8n_nodejs_gc_duration_seconds histogram -n8n_nodejs_gc_duration_seconds_bucket{le="0.001",kind="minor"} 128 -n8n_nodejs_gc_duration_seconds_bucket{le="0.01",kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_bucket{le="0.1",kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_bucket{le="1",kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_bucket{le="2",kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_bucket{le="5",kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_sum{kind="minor"} 0.09924478498101237 -n8n_nodejs_gc_duration_seconds_count{kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_bucket{le="0.001",kind="incremental"} 1 +n8n_nodejs_gc_duration_seconds_bucket{le="0.001",kind="minor"} 0 +n8n_nodejs_gc_duration_seconds_bucket{le="0.01",kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_bucket{le="0.1",kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_bucket{le="1",kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_bucket{le="2",kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_bucket{le="5",kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_sum{kind="minor"} 0.004925500000128522 +n8n_nodejs_gc_duration_seconds_count{kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_bucket{le="0.001",kind="incremental"} 0 n8n_nodejs_gc_duration_seconds_bucket{le="0.01",kind="incremental"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="0.1",kind="incremental"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="1",kind="incremental"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="2",kind="incremental"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="5",kind="incremental"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="incremental"} 2 -n8n_nodejs_gc_duration_seconds_sum{kind="incremental"} 0.0022786640077829363 +n8n_nodejs_gc_duration_seconds_sum{kind="incremental"} 0.005939041999867186 n8n_nodejs_gc_duration_seconds_count{kind="incremental"} 2 
n8n_nodejs_gc_duration_seconds_bucket{le="0.001",kind="major"} 0 n8n_nodejs_gc_duration_seconds_bucket{le="0.01",kind="major"} 0 @@ -179,231 +188,212 @@ n8n_nodejs_gc_duration_seconds_bucket{le="1",kind="major"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="2",kind="major"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="5",kind="major"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="major"} 2 -n8n_nodejs_gc_duration_seconds_sum{kind="major"} 0.1028408939987421 +n8n_nodejs_gc_duration_seconds_sum{kind="major"} 0.032123332999879496 n8n_nodejs_gc_duration_seconds_count{kind="major"} 2 +# HELP n8n_process_pss_bytes Proportional Set Size of the process in bytes. +# TYPE n8n_process_pss_bytes gauge +n8n_process_pss_bytes 220097536 + # HELP n8n_version_info n8n version info. # TYPE n8n_version_info gauge -n8n_version_info{version="v1.117.2",major="1",minor="117",patch="2"} 1 +n8n_version_info{version="v2.19.5",major="2",minor="19",patch="5"} 1 # HELP n8n_instance_role_leader Whether this main instance is the leader (1) or not (0). # TYPE n8n_instance_role_leader gauge n8n_instance_role_leader 1 +# HELP n8n_cache_hits_total Total number of cache hits. +# TYPE n8n_cache_hits_total counter +n8n_cache_hits_total 53 + +# HELP n8n_cache_misses_total Total number of cache misses. +# TYPE n8n_cache_misses_total counter +n8n_cache_misses_total 15 + +# HELP n8n_cache_updates_total Total number of cache updates. 
+# TYPE n8n_cache_updates_total counter +n8n_cache_updates_total 1 + # HELP n8n_http_request_duration_seconds duration histogram of http responses labeled with: status_code # TYPE n8n_http_request_duration_seconds histogram +n8n_http_request_duration_seconds_bucket{le="0.003"} 5 +n8n_http_request_duration_seconds_bucket{le="0.03"} 5 +n8n_http_request_duration_seconds_bucket{le="0.1"} 5 +n8n_http_request_duration_seconds_bucket{le="0.3"} 5 +n8n_http_request_duration_seconds_bucket{le="1.5"} 5 +n8n_http_request_duration_seconds_bucket{le="10"} 5 +n8n_http_request_duration_seconds_bucket{le="+Inf"} 5 +n8n_http_request_duration_seconds_sum 0.0018007910000000002 +n8n_http_request_duration_seconds_count 5 # HELP n8n_last_activity last instance activity (backend request) in Unix time (seconds). # TYPE n8n_last_activity gauge -n8n_last_activity 1761656582 +n8n_last_activity 1778234587 + +# HELP n8n_scaling_mode_queue_jobs_waiting Current number of enqueued jobs waiting for pickup in scaling mode. +# TYPE n8n_scaling_mode_queue_jobs_waiting gauge +n8n_scaling_mode_queue_jobs_waiting 0 + +# HELP n8n_scaling_mode_queue_jobs_active Current number of jobs being processed across all workers in scaling mode. +# TYPE n8n_scaling_mode_queue_jobs_active gauge +n8n_scaling_mode_queue_jobs_active 0 + +# HELP n8n_scaling_mode_queue_jobs_completed Total number of jobs completed across all workers in scaling mode since instance start. +# TYPE n8n_scaling_mode_queue_jobs_completed counter +n8n_scaling_mode_queue_jobs_completed 8 + +# HELP n8n_scaling_mode_queue_jobs_failed Total number of jobs failed across all workers in scaling mode since instance start. +# TYPE n8n_scaling_mode_queue_jobs_failed counter +n8n_scaling_mode_queue_jobs_failed 0 + +# HELP n8n_workflow_execution_duration_seconds Workflow execution duration in seconds. 
+# TYPE n8n_workflow_execution_duration_seconds histogram +n8n_workflow_execution_duration_seconds_bucket{le="0.005",status="success",mode="webhook",workflow_id="testWorkflowOk"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.01",status="success",mode="webhook",workflow_id="testWorkflowOk"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.025",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="0.05",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="0.1",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="0.25",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="0.5",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="1",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="2.5",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="5",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="10",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="30",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="60",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="120",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="300",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="600",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 
+n8n_workflow_execution_duration_seconds_bucket{le="+Inf",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_sum{status="success",mode="webhook",workflow_id="testWorkflowOk"} 0.027999999999999997 +n8n_workflow_execution_duration_seconds_count{status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="0.005",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.01",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.025",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.05",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.1",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.25",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.5",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="1",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="2.5",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="5",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="10",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="30",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="60",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 
+n8n_workflow_execution_duration_seconds_bucket{le="120",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="300",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="600",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="+Inf",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_sum{status="failed",mode="webhook",workflow_id="testWorkflowFail"} 0.405 +n8n_workflow_execution_duration_seconds_count{status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 # HELP n8n_active_workflow_count Total number of active workflows. # TYPE n8n_active_workflow_count gauge -n8n_active_workflow_count{workflow_id="wf_8a3b2c1d"} 0 -n8n_active_workflow_count{workflow_id="wf_7f4e9a2b"} 0 -n8n_active_workflow_count{workflow_id="wf_5d6c8e1f"} 0 - -# HELP n8n_nodejs_event_loop_lag_seconds Event loop lag in seconds -# TYPE n8n_nodejs_event_loop_lag_seconds gauge -n8n_nodejs_event_loop_lag_seconds 0.0035 - -# HELP n8n_nodejs_heap_total_bytes Total heap size allocated in bytes -# TYPE n8n_nodejs_heap_total_bytes gauge -n8n_nodejs_heap_total_bytes 73400320 - -# HELP n8n_nodejs_heap_used_bytes Heap memory used in bytes -# TYPE n8n_nodejs_heap_used_bytes gauge -n8n_nodejs_heap_used_bytes 51200000 - -# HELP n8n_workflow_executions_total Total number of workflow executions -# TYPE n8n_workflow_executions_total counter -n8n_workflow_executions_total{status="success",workflow_id="wf_8a3b2c1d"} 45 -n8n_workflow_executions_total{status="success",workflow_id="wf_7f4e9a2b"} 38 -n8n_workflow_executions_total{status="success",workflow_id="wf_5d6c8e1f"} 45 -n8n_workflow_executions_total{status="error",workflow_id="wf_8a3b2c1d"} 3 -n8n_workflow_executions_total{status="error",workflow_id="wf_5d6c8e1f"} 4 - -# HELP n8n_workflow_executions_duration_seconds 
Workflow execution duration in seconds -# TYPE n8n_workflow_executions_duration_seconds histogram -n8n_workflow_executions_duration_seconds_bucket{le="0.1",workflow_id="wf_8a3b2c1d"} 5 -n8n_workflow_executions_duration_seconds_bucket{le="1",workflow_id="wf_8a3b2c1d"} 18 -n8n_workflow_executions_duration_seconds_bucket{le="+Inf",workflow_id="wf_8a3b2c1d"} 48 -n8n_workflow_executions_duration_seconds_sum{workflow_id="wf_8a3b2c1d"} 14.3 -n8n_workflow_executions_duration_seconds_count{workflow_id="wf_8a3b2c1d"} 48 -n8n_workflow_executions_duration_seconds_bucket{le="0.1",workflow_id="wf_7f4e9a2b"} 4 -n8n_workflow_executions_duration_seconds_bucket{le="1",workflow_id="wf_7f4e9a2b"} 15 -n8n_workflow_executions_duration_seconds_bucket{le="+Inf",workflow_id="wf_7f4e9a2b"} 38 -n8n_workflow_executions_duration_seconds_sum{workflow_id="wf_7f4e9a2b"} 11.2 -n8n_workflow_executions_duration_seconds_count{workflow_id="wf_7f4e9a2b"} 38 -n8n_workflow_executions_duration_seconds_bucket{le="0.1",workflow_id="wf_5d6c8e1f"} 3 -n8n_workflow_executions_duration_seconds_bucket{le="1",workflow_id="wf_5d6c8e1f"} 12 -n8n_workflow_executions_duration_seconds_bucket{le="+Inf",workflow_id="wf_5d6c8e1f"} 49 -n8n_workflow_executions_duration_seconds_sum{workflow_id="wf_5d6c8e1f"} 12.7 -n8n_workflow_executions_duration_seconds_count{workflow_id="wf_5d6c8e1f"} 49 - -# HELP n8n_workflow_started_total Total number of workflows started -# TYPE n8n_workflow_started_total counter -n8n_workflow_started_total 25634 -n8n_workflow_started_total{workflow_id="12",workflow_name="CRM Sync"} 8142 -n8n_workflow_started_total{workflow_id="25",workflow_name="Webhook Intake"} 14290 -n8n_workflow_started_total{workflow_id="33",workflow_name="Slack Alerts"} 2202 +n8n_active_workflow_count 2 -# HELP n8n_workflow_success_total Total number of workflows completed successfully -# TYPE n8n_workflow_success_total counter -n8n_workflow_success_total 25209 -n8n_workflow_success_total{workflow_id="12",workflow_name="CRM Sync"} 
8059 -n8n_workflow_success_total{workflow_id="25",workflow_name="Webhook Intake"} 14135 -n8n_workflow_success_total{workflow_id="33",workflow_name="Slack Alerts"} 2015 +# HELP n8n_production_executions Total number of production workflow executions (success + error). +# TYPE n8n_production_executions gauge +n8n_production_executions 8 -# HELP n8n_workflow_failed_total Total number of workflows that failed -# TYPE n8n_workflow_failed_total counter -n8n_workflow_failed_total 425 -n8n_workflow_failed_total{workflow_id="12",workflow_name="CRM Sync"} 83 -n8n_workflow_failed_total{workflow_id="25",workflow_name="Webhook Intake"} 155 -n8n_workflow_failed_total{workflow_id="33",workflow_name="Slack Alerts"} 187 - - -# HELP n8n_queue_jobs_total Total number of queue jobs -# TYPE n8n_queue_jobs_total counter -n8n_queue_jobs_total{state="waiting"} 3 -n8n_queue_jobs_total{state="active"} 2 -n8n_queue_jobs_total{state="completed"} 148 -n8n_queue_jobs_total{state="failed"} 5 - -# HELP n8n_queue_jobs_duration_seconds Job duration in seconds -# TYPE n8n_queue_jobs_duration_seconds histogram -n8n_queue_jobs_duration_seconds_bucket{le="0.1"} 22 -n8n_queue_jobs_duration_seconds_bucket{le="1"} 84 -n8n_queue_jobs_duration_seconds_bucket{le="+Inf"} 150 -n8n_queue_jobs_duration_seconds_sum 44.8 -n8n_queue_jobs_duration_seconds_count 150 - -# HELP n8n_queue_job_waiting_total Number of jobs currently waiting in the queue -# TYPE n8n_queue_job_waiting_total gauge -n8n_queue_job_waiting_total{queue="default"} 3 - -# HELP n8n_queue_job_active_total Number of jobs currently being processed -# TYPE n8n_queue_job_active_total gauge -n8n_queue_job_active_total{queue="default"} 2 - -# HELP n8n_queue_job_completed_total Number of jobs completed successfully -# TYPE n8n_queue_job_completed_total counter -n8n_queue_job_completed_total{queue="default"} 15892 +# HELP n8n_production_root_executions Total number of production root workflow executions (excludes sub-workflows). 
+# TYPE n8n_production_root_executions gauge +n8n_production_root_executions 8 -# HELP n8n_queue_job_failed_total Number of jobs that have failed -# TYPE n8n_queue_job_failed_total counter -n8n_queue_job_failed_total{queue="default"} 47 +# HELP n8n_manual_executions Total number of manual workflow executions (success + error). +# TYPE n8n_manual_executions gauge +n8n_manual_executions 0 -# HELP n8n_queue_job_dequeued_total Number of jobs dequeued (picked up from queue) -# TYPE n8n_queue_job_dequeued_total counter -n8n_queue_job_dequeued_total{queue="default"} 15939 +# HELP n8n_enabled_users Total number of enabled users. +# TYPE n8n_enabled_users gauge +n8n_enabled_users 1 + +# HELP n8n_users Total number of users. +# TYPE n8n_users gauge +n8n_users 1 + +# HELP n8n_workflows Total number of workflows. +# TYPE n8n_workflows gauge +n8n_workflows 2 + +# HELP n8n_credentials Total number of credentials. +# TYPE n8n_credentials gauge +n8n_credentials 0 + +# HELP n8n_token_exchange_requests_total Total number of token exchange requests. +# TYPE n8n_token_exchange_requests_total counter +n8n_token_exchange_requests_total{result="success"} 0 +n8n_token_exchange_requests_total{result="failure"} 0 + +# HELP n8n_token_exchange_failures_total Total number of token exchange failures broken down by reason. +# TYPE n8n_token_exchange_failures_total counter +n8n_token_exchange_failures_total{reason="invalid_token"} 0 -# HELP n8n_queue_job_enqueued_total Number of jobs added to the queue +# HELP n8n_embed_login_requests_total Total number of embed login requests. +# TYPE n8n_embed_login_requests_total counter +n8n_embed_login_requests_total{result="success"} 0 +n8n_embed_login_requests_total{result="failure"} 0 + +# HELP n8n_embed_login_failures_total Total number of embed login failures broken down by reason. 
+# TYPE n8n_embed_login_failures_total counter +n8n_embed_login_failures_total{reason="unauthorized"} 0 + +# HELP n8n_token_exchange_jit_provisioning_total Total number of users JIT-provisioned via token exchange. +# TYPE n8n_token_exchange_jit_provisioning_total counter +n8n_token_exchange_jit_provisioning_total 0 + +# HELP n8n_token_exchange_identity_linked_total Total number of external identities linked to existing users via token exchange. +# TYPE n8n_token_exchange_identity_linked_total counter +n8n_token_exchange_identity_linked_total 0 + +# HELP n8n_audit_workflow_activated_total Total number of n8n.audit.workflow.activated events. +# TYPE n8n_audit_workflow_activated_total counter +n8n_audit_workflow_activated_total{workflow_id="testWorkflowOk"} 1 +n8n_audit_workflow_activated_total{workflow_id="testWorkflowFail"} 1 + +# HELP n8n_queue_job_enqueued_total Total number of n8n.queue.job.enqueued events. # TYPE n8n_queue_job_enqueued_total counter -n8n_queue_job_enqueued_total{queue="default"} 15670 - -# HELP n8n_queue_job_delayed_total Number of jobs scheduled to run later -# TYPE n8n_queue_job_delayed_total gauge -n8n_queue_job_delayed_total{queue="default"} 5 - -# HELP n8n_queue_job_waiting_duration_seconds Duration jobs spend waiting before being processed -# TYPE n8n_queue_job_waiting_duration_seconds histogram -n8n_queue_job_waiting_duration_seconds_bucket{queue="default",le="0.1"} 50 -n8n_queue_job_waiting_duration_seconds_bucket{queue="default",le="1"} 241 -n8n_queue_job_waiting_duration_seconds_bucket{queue="default",le="5"} 820 -n8n_queue_job_waiting_duration_seconds_bucket{queue="default",le="10"} 1105 -n8n_queue_job_waiting_duration_seconds_bucket{queue="default",le="30"} 1240 -n8n_queue_job_waiting_duration_seconds_bucket{queue="default",le="+Inf"} 1253 -n8n_queue_job_waiting_duration_seconds_sum{queue="default"} 450.32 -n8n_queue_job_waiting_duration_seconds_count{queue="default"} 1253 - -# HELP n8n_api_requests_total Total API requests -# TYPE 
n8n_api_requests_total counter -n8n_api_requests_total{method="GET",endpoint="/workflows"} 240 -n8n_api_requests_total{method="POST",endpoint="/executions"} 75 - -# HELP n8n_api_request_duration_seconds API request duration in seconds -# TYPE n8n_api_request_duration_seconds histogram -n8n_api_request_duration_seconds_bucket{le="0.1"} 90 -n8n_api_request_duration_seconds_bucket{le="1"} 120 -n8n_api_request_duration_seconds_bucket{le="+Inf"} 125 -n8n_api_request_duration_seconds_sum 15.3 -n8n_api_request_duration_seconds_count 125 - -# HELP n8n_cache_operations_total Total cache operations -# TYPE n8n_cache_operations_total counter -n8n_cache_operations_total{operation="get"} 1250 -n8n_cache_operations_total{operation="set"} 320 -n8n_cache_operations_total{operation="delete"} 10 - -# HELP n8n_cache_hits_total Cache hits -# TYPE n8n_cache_hits_total counter -n8n_cache_hits_total 1080 +n8n_queue_job_enqueued_total 8 -# HELP n8n_cache_misses_total Cache misses -# TYPE n8n_cache_misses_total counter -n8n_cache_misses_total 170 - -# HELP n8n_cache_errors_total Cache errors -# TYPE n8n_cache_errors_total counter -n8n_cache_errors_total 0 - -# HELP n8n_cache_latency_seconds Cache operation latency in seconds -# TYPE n8n_cache_latency_seconds histogram -n8n_cache_latency_seconds_bucket{le="0.001"} 90 -n8n_cache_latency_seconds_bucket{le="0.01"} 240 -n8n_cache_latency_seconds_bucket{le="+Inf"} 260 -n8n_cache_latency_seconds_sum 1.42 -n8n_cache_latency_seconds_count 260 - -# HELP n8n_eventbus_events_total Total events published on the event bus -# TYPE n8n_eventbus_events_total counter -n8n_eventbus_events_total{event_type="workflowStarted"} 140 -n8n_eventbus_events_total{event_type="workflowCompleted"} 135 -n8n_eventbus_events_total{event_type="workflowFailed"} 5 - -# HELP n8n_eventbus_events_processed_total Total processed events -# TYPE n8n_eventbus_events_processed_total counter -n8n_eventbus_events_processed_total 138 - -# HELP n8n_eventbus_events_failed_total Total 
failed event processing -# TYPE n8n_eventbus_events_failed_total counter -n8n_eventbus_events_failed_total 2 - -# HELP n8n_eventbus_queue_size Current event queue size -# TYPE n8n_eventbus_queue_size gauge -n8n_eventbus_queue_size 1 - -# HELP n8n_eventbus_connections_total Active event bus backend connections -# TYPE n8n_eventbus_connections_total gauge -n8n_eventbus_connections_total 1 - -# HELP n8n_workflow_executions_active Number of active workflow executions -# TYPE n8n_workflow_executions_active gauge -n8n_workflow_executions_active 3 - -# HELP n8n_queue_job_attempts_total Total number of job attempts -# TYPE n8n_queue_job_attempts_total counter -n8n_queue_job_attempts_total{result="success"} 435 -n8n_queue_job_attempts_total{result="failed"} 12 - -# HELP n8n_workflow_started_total Total number of workflows started +# HELP n8n_workflow_started_total Total number of n8n.workflow.started events. # TYPE n8n_workflow_started_total counter -n8n_workflow_started_total 25634 -n8n_workflow_started_total{workflow_id="12",workflow_name="CRM Sync"} 8142 -n8n_workflow_started_total{workflow_id="25",workflow_name="Webhook Intake"} 14290 -n8n_workflow_started_total{workflow_id="33",workflow_name="Slack Alerts"} 2202 +n8n_workflow_started_total{workflow_id="testWorkflowOk"} 4 +n8n_workflow_started_total{workflow_id="testWorkflowFail"} 4 + +# HELP n8n_audit_workflow_executed_total Total number of n8n.audit.workflow.executed events. +# TYPE n8n_audit_workflow_executed_total counter +n8n_audit_workflow_executed_total{workflow_id="testWorkflowOk"} 4 +n8n_audit_workflow_executed_total{workflow_id="testWorkflowFail"} 4 -# HELP n8n_workflow_success_total Total number of workflows completed successfully +# HELP n8n_workflow_success_total Total number of n8n.workflow.success events. 
# TYPE n8n_workflow_success_total counter -n8n_workflow_success_total 25209 -n8n_workflow_success_total{workflow_id="12",workflow_name="CRM Sync"} 8059 -n8n_workflow_success_total{workflow_id="25",workflow_name="Webhook Intake"} 14135 -n8n_workflow_success_total{workflow_id="33",workflow_name="Slack Alerts"} 2015 +n8n_workflow_success_total{workflow_id="testWorkflowOk"} 4 -# HELP n8n_workflow_failed_total Total number of workflows that failed +# HELP n8n_queue_job_completed_total Total number of n8n.queue.job.completed events. +# TYPE n8n_queue_job_completed_total counter +n8n_queue_job_completed_total 4 + +# HELP n8n_workflow_failed_total Total number of n8n.workflow.failed events. # TYPE n8n_workflow_failed_total counter -n8n_workflow_failed_total 425 -n8n_workflow_failed_total{workflow_id="12",workflow_name="CRM Sync"} 83 -n8n_workflow_failed_total{workflow_id="25",workflow_name="Webhook Intake"} 155 -n8n_workflow_failed_total{workflow_id="33",workflow_name="Slack Alerts"} 187 \ No newline at end of file +n8n_workflow_failed_total{workflow_id="testWorkflowFail"} 4 + +# HELP n8n_queue_job_failed_total Total number of n8n.queue.job.failed events. +# TYPE n8n_queue_job_failed_total counter +n8n_queue_job_failed_total 4 +# HELP n8n_queue_job_dequeued_total Total number of n8n.queue.job.dequeued events. +# TYPE n8n_queue_job_dequeued_total counter +n8n_queue_job_dequeued_total 8 + +# HELP n8n_node_started_total Total number of n8n.node.started events. +# TYPE n8n_node_started_total counter +n8n_node_started_total{workflow_id="testWorkflowOk"} 8 +n8n_node_started_total{workflow_id="testWorkflowFail"} 8 + +# HELP n8n_node_finished_total Total number of n8n.node.finished events. +# TYPE n8n_node_finished_total counter +n8n_node_finished_total{workflow_id="testWorkflowOk"} 8 +n8n_node_finished_total{workflow_id="testWorkflowFail"} 8 + +# HELP n8n_runner_task_requested_total Total number of n8n.runner.task.requested events. 
+# TYPE n8n_runner_task_requested_total counter +n8n_runner_task_requested_total 4 diff --git a/n8n/tests/fixtures/n8n_custom.txt b/n8n/tests/fixtures/n8n_custom.txt index d06fa2589b0ba..70820dfff85c2 100644 --- a/n8n/tests/fixtures/n8n_custom.txt +++ b/n8n/tests/fixtures/n8n_custom.txt @@ -1,34 +1,34 @@ # HELP test_process_cpu_user_seconds_total Total user CPU time spent in seconds. # TYPE test_process_cpu_user_seconds_total counter -test_process_cpu_user_seconds_total 8.298932999999998 +test_process_cpu_user_seconds_total 0.921656 # HELP test_process_cpu_system_seconds_total Total system CPU time spent in seconds. # TYPE test_process_cpu_system_seconds_total counter -test_process_cpu_system_seconds_total 3.1041119999999998 +test_process_cpu_system_seconds_total 0.157367 # HELP test_process_cpu_seconds_total Total user and system CPU time spent in seconds. # TYPE test_process_cpu_seconds_total counter -test_process_cpu_seconds_total 11.403044999999999 +test_process_cpu_seconds_total 1.0790229999999998 # HELP test_process_start_time_seconds Start time of the process since unix epoch in seconds. # TYPE test_process_start_time_seconds gauge -test_process_start_time_seconds 1761656578 +test_process_start_time_seconds 1778234580 # HELP test_process_resident_memory_bytes Resident memory size in bytes. # TYPE test_process_resident_memory_bytes gauge -test_process_resident_memory_bytes 245043200 +test_process_resident_memory_bytes 267681792 # HELP test_process_virtual_memory_bytes Virtual memory size in bytes. # TYPE test_process_virtual_memory_bytes gauge -test_process_virtual_memory_bytes 33656197120 +test_process_virtual_memory_bytes 18517532672 # HELP test_process_heap_bytes Process heap size in bytes. # TYPE test_process_heap_bytes gauge -test_process_heap_bytes 277200896 +test_process_heap_bytes 840728576 # HELP test_process_open_fds Number of open file descriptors. 
# TYPE test_process_open_fds gauge -test_process_open_fds 44 +test_process_open_fds 45 # HELP test_process_max_fds Maximum number of open file descriptors. # TYPE test_process_max_fds gauge @@ -36,59 +36,62 @@ test_process_max_fds 1048576 # HELP test_nodejs_eventloop_lag_seconds Lag of event loop in seconds. # TYPE test_nodejs_eventloop_lag_seconds gauge -test_nodejs_eventloop_lag_seconds 0.002765567 +test_nodejs_eventloop_lag_seconds 0.008676917 # HELP test_nodejs_eventloop_lag_min_seconds The minimum recorded event loop delay. # TYPE test_nodejs_eventloop_lag_min_seconds gauge -test_nodejs_eventloop_lag_min_seconds 0.010018816 +test_nodejs_eventloop_lag_min_seconds 0.006340608 # HELP test_nodejs_eventloop_lag_max_seconds The maximum recorded event loop delay. # TYPE test_nodejs_eventloop_lag_max_seconds gauge -test_nodejs_eventloop_lag_max_seconds 0.011239423 +test_nodejs_eventloop_lag_max_seconds 0.030228479 # HELP test_nodejs_eventloop_lag_mean_seconds The mean of the recorded event loop delays. # TYPE test_nodejs_eventloop_lag_mean_seconds gauge -test_nodejs_eventloop_lag_mean_seconds 0.010092521938958708 +test_nodejs_eventloop_lag_mean_seconds 0.012079332927643785 # HELP test_nodejs_eventloop_lag_stddev_seconds The standard deviation of the recorded event loop delays. # TYPE test_nodejs_eventloop_lag_stddev_seconds gauge -test_nodejs_eventloop_lag_stddev_seconds 0.00016945350643679045 +test_nodejs_eventloop_lag_stddev_seconds 0.0011467288819057616 # HELP test_nodejs_eventloop_lag_p50_seconds The 50th percentile of the recorded event loop delays. # TYPE test_nodejs_eventloop_lag_p50_seconds gauge -test_nodejs_eventloop_lag_p50_seconds 0.010067967 +test_nodejs_eventloop_lag_p50_seconds 0.012001279 # HELP test_nodejs_eventloop_lag_p90_seconds The 90th percentile of the recorded event loop delays. 
# TYPE test_nodejs_eventloop_lag_p90_seconds gauge -test_nodejs_eventloop_lag_p90_seconds 0.010067967 +test_nodejs_eventloop_lag_p90_seconds 0.013254655 # HELP test_nodejs_eventloop_lag_p99_seconds The 99th percentile of the recorded event loop delays. # TYPE test_nodejs_eventloop_lag_p99_seconds gauge -test_nodejs_eventloop_lag_p99_seconds 0.011124735 +test_nodejs_eventloop_lag_p99_seconds 0.014426111 # HELP test_nodejs_active_resources Number of active resources that are currently keeping the event loop alive, grouped by async resource type. # TYPE test_nodejs_active_resources gauge -test_nodejs_active_resources{type="PipeWrap"} 2 -test_nodejs_active_resources{type="TCPServerWrap"} 1 -test_nodejs_active_resources{type="TCPSocketWrap"} 1 -test_nodejs_active_resources{type="Timeout"} 13 +test_nodejs_active_resources{type="PipeWrap"} 5 +test_nodejs_active_resources{type="TCPServerWrap"} 2 +test_nodejs_active_resources{type="TCPSocketWrap"} 9 +test_nodejs_active_resources{type="ProcessWrap"} 1 +test_nodejs_active_resources{type="Timeout"} 20 test_nodejs_active_resources{type="Immediate"} 1 # HELP test_nodejs_active_resources_total Total number of active resources. # TYPE test_nodejs_active_resources_total gauge -test_nodejs_active_resources_total 18 +test_nodejs_active_resources_total 38 # HELP test_nodejs_active_handles Number of active libuv handles grouped by handle type. Every handle type is C++ class name. # TYPE test_nodejs_active_handles gauge -test_nodejs_active_handles{type="Socket"} 3 -test_nodejs_active_handles{type="Server"} 1 +test_nodejs_active_handles{type="Socket"} 14 +test_nodejs_active_handles{type="Server"} 2 +test_nodejs_active_handles{type="ChildProcess"} 1 # HELP test_nodejs_active_handles_total Total number of active handles. # TYPE test_nodejs_active_handles_total gauge -test_nodejs_active_handles_total 4 +test_nodejs_active_handles_total 17 # HELP test_nodejs_active_requests Number of active libuv requests grouped by request type. 
Every request type is C++ class name. # TYPE test_nodejs_active_requests gauge +test_nodejs_active_requests{type="FSReqCallback"} 1 # HELP test_nodejs_active_requests_total Total number of active requests. # TYPE test_nodejs_active_requests_total gauge @@ -96,81 +99,87 @@ test_nodejs_active_requests_total 0 # HELP test_nodejs_heap_size_total_bytes Process heap size from Node.js in bytes. # TYPE test_nodejs_heap_size_total_bytes gauge -test_nodejs_heap_size_total_bytes 142774272 +test_nodejs_heap_size_total_bytes 146391040 # HELP test_nodejs_heap_size_used_bytes Process heap size used from Node.js in bytes. # TYPE test_nodejs_heap_size_used_bytes gauge -test_nodejs_heap_size_used_bytes 136342632 +test_nodejs_heap_size_used_bytes 136336448 # HELP test_nodejs_external_memory_bytes Node.js external memory size in bytes. # TYPE test_nodejs_external_memory_bytes gauge -test_nodejs_external_memory_bytes 20824585 +test_nodejs_external_memory_bytes 20993559 # HELP test_nodejs_heap_space_size_total_bytes Process heap space size total from Node.js in bytes. 
# TYPE test_nodejs_heap_space_size_total_bytes gauge test_nodejs_heap_space_size_total_bytes{space="read_only"} 0 -test_nodejs_heap_space_size_total_bytes{space="new"} 1048576 -test_nodejs_heap_space_size_total_bytes{space="old"} 122208256 -test_nodejs_heap_space_size_total_bytes{space="code"} 4718592 +test_nodejs_heap_space_size_total_bytes{space="new"} 2097152 +test_nodejs_heap_space_size_total_bytes{space="old"} 116920320 +test_nodejs_heap_space_size_total_bytes{space="code"} 5505024 test_nodejs_heap_space_size_total_bytes{space="shared"} 0 -test_nodejs_heap_space_size_total_bytes{space="trusted"} 7643136 +test_nodejs_heap_space_size_total_bytes{space="trusted"} 11624448 +test_nodejs_heap_space_size_total_bytes{space="shared_trusted"} 0 test_nodejs_heap_space_size_total_bytes{space="new_large_object"} 0 -test_nodejs_heap_space_size_total_bytes{space="large_object"} 7000064 -test_nodejs_heap_space_size_total_bytes{space="code_large_object"} 155648 +test_nodejs_heap_space_size_total_bytes{space="large_object"} 9875456 +test_nodejs_heap_space_size_total_bytes{space="code_large_object"} 368640 test_nodejs_heap_space_size_total_bytes{space="shared_large_object"} 0 +test_nodejs_heap_space_size_total_bytes{space="shared_trusted_large_object"} 0 test_nodejs_heap_space_size_total_bytes{space="trusted_large_object"} 0 # HELP test_nodejs_heap_space_size_used_bytes Process heap space size used from Node.js in bytes. 
# TYPE test_nodejs_heap_space_size_used_bytes gauge test_nodejs_heap_space_size_used_bytes{space="read_only"} 0 -test_nodejs_heap_space_size_used_bytes{space="new"} 652896 -test_nodejs_heap_space_size_used_bytes{space="old"} 119347344 -test_nodejs_heap_space_size_used_bytes{space="code"} 4183424 +test_nodejs_heap_space_size_used_bytes{space="new"} 382808 +test_nodejs_heap_space_size_used_bytes{space="old"} 111099512 +test_nodejs_heap_space_size_used_bytes{space="code"} 4853344 test_nodejs_heap_space_size_used_bytes{space="shared"} 0 -test_nodejs_heap_space_size_used_bytes{space="trusted"} 5187192 +test_nodejs_heap_space_size_used_bytes{space="trusted"} 9839592 +test_nodejs_heap_space_size_used_bytes{space="shared_trusted"} 0 test_nodejs_heap_space_size_used_bytes{space="new_large_object"} 0 -test_nodejs_heap_space_size_used_bytes{space="large_object"} 6837144 -test_nodejs_heap_space_size_used_bytes{space="code_large_object"} 138432 +test_nodejs_heap_space_size_used_bytes{space="large_object"} 9806288 +test_nodejs_heap_space_size_used_bytes{space="code_large_object"} 361728 test_nodejs_heap_space_size_used_bytes{space="shared_large_object"} 0 +test_nodejs_heap_space_size_used_bytes{space="shared_trusted_large_object"} 0 test_nodejs_heap_space_size_used_bytes{space="trusted_large_object"} 0 # HELP test_nodejs_heap_space_size_available_bytes Process heap space size available from Node.js in bytes. 
# TYPE test_nodejs_heap_space_size_available_bytes gauge test_nodejs_heap_space_size_available_bytes{space="read_only"} 0 -test_nodejs_heap_space_size_available_bytes{space="new"} 378016 -test_nodejs_heap_space_size_available_bytes{space="old"} 430568 -test_nodejs_heap_space_size_available_bytes{space="code"} 239680 +test_nodejs_heap_space_size_available_bytes{space="new"} 665704 +test_nodejs_heap_space_size_available_bytes{space="old"} 5484264 +test_nodejs_heap_space_size_available_bytes{space="code"} 651008 test_nodejs_heap_space_size_available_bytes{space="shared"} 0 -test_nodejs_heap_space_size_available_bytes{space="trusted"} 2323072 +test_nodejs_heap_space_size_available_bytes{space="trusted"} 1771032 +test_nodejs_heap_space_size_available_bytes{space="shared_trusted"} 0 test_nodejs_heap_space_size_available_bytes{space="new_large_object"} 1048576 test_nodejs_heap_space_size_available_bytes{space="large_object"} 0 test_nodejs_heap_space_size_available_bytes{space="code_large_object"} 0 test_nodejs_heap_space_size_available_bytes{space="shared_large_object"} 0 +test_nodejs_heap_space_size_available_bytes{space="shared_trusted_large_object"} 0 test_nodejs_heap_space_size_available_bytes{space="trusted_large_object"} 0 # HELP test_nodejs_version_info Node.js version info. # TYPE test_nodejs_version_info gauge -test_nodejs_version_info{version="v22.18.0",major="22",minor="18",patch="0"} 1 +test_nodejs_version_info{version="v24.14.1",major="24",minor="14",patch="1"} 1 # HELP test_nodejs_gc_duration_seconds Garbage collection duration by kind, one of major, minor, incremental or weakcb. 
# TYPE test_nodejs_gc_duration_seconds histogram -test_nodejs_gc_duration_seconds_bucket{le="0.001",kind="minor"} 128 -test_nodejs_gc_duration_seconds_bucket{le="0.01",kind="minor"} 132 -test_nodejs_gc_duration_seconds_bucket{le="0.1",kind="minor"} 132 -test_nodejs_gc_duration_seconds_bucket{le="1",kind="minor"} 132 -test_nodejs_gc_duration_seconds_bucket{le="2",kind="minor"} 132 -test_nodejs_gc_duration_seconds_bucket{le="5",kind="minor"} 132 -test_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="minor"} 132 -test_nodejs_gc_duration_seconds_sum{kind="minor"} 0.09924478498101237 -test_nodejs_gc_duration_seconds_count{kind="minor"} 132 -test_nodejs_gc_duration_seconds_bucket{le="0.001",kind="incremental"} 1 +test_nodejs_gc_duration_seconds_bucket{le="0.001",kind="minor"} 0 +test_nodejs_gc_duration_seconds_bucket{le="0.01",kind="minor"} 2 +test_nodejs_gc_duration_seconds_bucket{le="0.1",kind="minor"} 2 +test_nodejs_gc_duration_seconds_bucket{le="1",kind="minor"} 2 +test_nodejs_gc_duration_seconds_bucket{le="2",kind="minor"} 2 +test_nodejs_gc_duration_seconds_bucket{le="5",kind="minor"} 2 +test_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="minor"} 2 +test_nodejs_gc_duration_seconds_sum{kind="minor"} 0.004925500000128522 +test_nodejs_gc_duration_seconds_count{kind="minor"} 2 +test_nodejs_gc_duration_seconds_bucket{le="0.001",kind="incremental"} 0 test_nodejs_gc_duration_seconds_bucket{le="0.01",kind="incremental"} 2 test_nodejs_gc_duration_seconds_bucket{le="0.1",kind="incremental"} 2 test_nodejs_gc_duration_seconds_bucket{le="1",kind="incremental"} 2 test_nodejs_gc_duration_seconds_bucket{le="2",kind="incremental"} 2 test_nodejs_gc_duration_seconds_bucket{le="5",kind="incremental"} 2 test_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="incremental"} 2 -test_nodejs_gc_duration_seconds_sum{kind="incremental"} 0.0022786640077829363 +test_nodejs_gc_duration_seconds_sum{kind="incremental"} 0.005939041999867186 
test_nodejs_gc_duration_seconds_count{kind="incremental"} 2 test_nodejs_gc_duration_seconds_bucket{le="0.001",kind="major"} 0 test_nodejs_gc_duration_seconds_bucket{le="0.01",kind="major"} 0 @@ -179,232 +188,212 @@ test_nodejs_gc_duration_seconds_bucket{le="1",kind="major"} 2 test_nodejs_gc_duration_seconds_bucket{le="2",kind="major"} 2 test_nodejs_gc_duration_seconds_bucket{le="5",kind="major"} 2 test_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="major"} 2 -test_nodejs_gc_duration_seconds_sum{kind="major"} 0.1028408939987421 +test_nodejs_gc_duration_seconds_sum{kind="major"} 0.032123332999879496 test_nodejs_gc_duration_seconds_count{kind="major"} 2 +# HELP test_process_pss_bytes Proportional Set Size of the process in bytes. +# TYPE test_process_pss_bytes gauge +test_process_pss_bytes 220097536 + # HELP test_version_info n8n version info. # TYPE test_version_info gauge -test_version_info{version="v1.117.2",major="1",minor="117",patch="2"} 1 +test_version_info{version="v2.19.5",major="2",minor="19",patch="5"} 1 # HELP test_instance_role_leader Whether this main instance is the leader (1) or not (0). # TYPE test_instance_role_leader gauge test_instance_role_leader 1 +# HELP test_cache_hits_total Total number of cache hits. +# TYPE test_cache_hits_total counter +test_cache_hits_total 53 + +# HELP test_cache_misses_total Total number of cache misses. +# TYPE test_cache_misses_total counter +test_cache_misses_total 15 + +# HELP test_cache_updates_total Total number of cache updates. 
+# TYPE test_cache_updates_total counter +test_cache_updates_total 1 + # HELP test_http_request_duration_seconds duration histogram of http responses labeled with: status_code # TYPE test_http_request_duration_seconds histogram +test_http_request_duration_seconds_bucket{le="0.003"} 5 +test_http_request_duration_seconds_bucket{le="0.03"} 5 +test_http_request_duration_seconds_bucket{le="0.1"} 5 +test_http_request_duration_seconds_bucket{le="0.3"} 5 +test_http_request_duration_seconds_bucket{le="1.5"} 5 +test_http_request_duration_seconds_bucket{le="10"} 5 +test_http_request_duration_seconds_bucket{le="+Inf"} 5 +test_http_request_duration_seconds_sum 0.0018007910000000002 +test_http_request_duration_seconds_count 5 # HELP test_last_activity last instance activity (backend request) in Unix time (seconds). # TYPE test_last_activity gauge -test_last_activity 1761656582 +test_last_activity 1778234587 + +# HELP test_scaling_mode_queue_jobs_waiting Current number of enqueued jobs waiting for pickup in scaling mode. +# TYPE test_scaling_mode_queue_jobs_waiting gauge +test_scaling_mode_queue_jobs_waiting 0 + +# HELP test_scaling_mode_queue_jobs_active Current number of jobs being processed across all workers in scaling mode. +# TYPE test_scaling_mode_queue_jobs_active gauge +test_scaling_mode_queue_jobs_active 0 + +# HELP test_scaling_mode_queue_jobs_completed Total number of jobs completed across all workers in scaling mode since instance start. +# TYPE test_scaling_mode_queue_jobs_completed counter +test_scaling_mode_queue_jobs_completed 8 + +# HELP test_scaling_mode_queue_jobs_failed Total number of jobs failed across all workers in scaling mode since instance start. +# TYPE test_scaling_mode_queue_jobs_failed counter +test_scaling_mode_queue_jobs_failed 0 + +# HELP test_workflow_execution_duration_seconds Workflow execution duration in seconds. 
+# TYPE test_workflow_execution_duration_seconds histogram +test_workflow_execution_duration_seconds_bucket{le="0.005",status="success",mode="webhook",workflow_id="testWorkflowOk"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.01",status="success",mode="webhook",workflow_id="testWorkflowOk"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.025",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="0.05",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="0.1",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="0.25",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="0.5",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="1",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="2.5",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="5",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="10",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="30",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="60",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="120",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="300",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 
+test_workflow_execution_duration_seconds_bucket{le="600",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="+Inf",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_sum{status="success",mode="webhook",workflow_id="testWorkflowOk"} 0.027999999999999997 +test_workflow_execution_duration_seconds_count{status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="0.005",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.01",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.025",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.05",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.1",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.25",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.5",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="1",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="2.5",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="5",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="10",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="30",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 
+test_workflow_execution_duration_seconds_bucket{le="60",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="120",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="300",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="600",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="+Inf",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_sum{status="failed",mode="webhook",workflow_id="testWorkflowFail"} 0.405 +test_workflow_execution_duration_seconds_count{status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 # HELP test_active_workflow_count Total number of active workflows. # TYPE test_active_workflow_count gauge -test_active_workflow_count{workflow_id="wf_8a3b2c1d"} 0 -test_active_workflow_count{workflow_id="wf_7f4e9a2b"} 0 -test_active_workflow_count{workflow_id="wf_5d6c8e1f"} 0 - -# HELP test_nodejs_event_loop_lag_seconds Event loop lag in seconds -# TYPE test_nodejs_event_loop_lag_seconds gauge -test_nodejs_event_loop_lag_seconds 0.0035 - -# HELP test_nodejs_heap_total_bytes Total heap size allocated in bytes -# TYPE test_nodejs_heap_total_bytes gauge -test_nodejs_heap_total_bytes 73400320 - -# HELP test_nodejs_heap_used_bytes Heap memory used in bytes -# TYPE test_nodejs_heap_used_bytes gauge -test_nodejs_heap_used_bytes 51200000 - -# HELP test_workflow_executions_total Total number of workflow executions -# TYPE test_workflow_executions_total counter -test_workflow_executions_total{status="success",workflow_id="wf_8a3b2c1d"} 45 -test_workflow_executions_total{status="success",workflow_id="wf_7f4e9a2b"} 38 -test_workflow_executions_total{status="success",workflow_id="wf_5d6c8e1f"} 45 
-test_workflow_executions_total{status="error",workflow_id="wf_8a3b2c1d"} 3 -test_workflow_executions_total{status="error",workflow_id="wf_5d6c8e1f"} 4 - -# HELP test_workflow_executions_duration_seconds Workflow execution duration in seconds -# TYPE test_workflow_executions_duration_seconds histogram -test_workflow_executions_duration_seconds_bucket{le="0.1",workflow_id="wf_8a3b2c1d"} 5 -test_workflow_executions_duration_seconds_bucket{le="1",workflow_id="wf_8a3b2c1d"} 18 -test_workflow_executions_duration_seconds_bucket{le="+Inf",workflow_id="wf_8a3b2c1d"} 48 -test_workflow_executions_duration_seconds_sum{workflow_id="wf_8a3b2c1d"} 14.3 -test_workflow_executions_duration_seconds_count{workflow_id="wf_8a3b2c1d"} 48 -test_workflow_executions_duration_seconds_bucket{le="0.1",workflow_id="wf_7f4e9a2b"} 4 -test_workflow_executions_duration_seconds_bucket{le="1",workflow_id="wf_7f4e9a2b"} 15 -test_workflow_executions_duration_seconds_bucket{le="+Inf",workflow_id="wf_7f4e9a2b"} 38 -test_workflow_executions_duration_seconds_sum{workflow_id="wf_7f4e9a2b"} 11.2 -test_workflow_executions_duration_seconds_count{workflow_id="wf_7f4e9a2b"} 38 -test_workflow_executions_duration_seconds_bucket{le="0.1",workflow_id="wf_5d6c8e1f"} 3 -test_workflow_executions_duration_seconds_bucket{le="1",workflow_id="wf_5d6c8e1f"} 12 -test_workflow_executions_duration_seconds_bucket{le="+Inf",workflow_id="wf_5d6c8e1f"} 49 -test_workflow_executions_duration_seconds_sum{workflow_id="wf_5d6c8e1f"} 12.7 -test_workflow_executions_duration_seconds_count{workflow_id="wf_5d6c8e1f"} 49 - -# HELP test_workflow_started_total Total number of workflows started -# TYPE test_workflow_started_total counter -test_workflow_started_total 25634 -test_workflow_started_total{workflow_id="12",workflow_name="CRM Sync"} 8142 -test_workflow_started_total{workflow_id="25",workflow_name="Webhook Intake"} 14290 -test_workflow_started_total{workflow_id="33",workflow_name="Slack Alerts"} 2202 +test_active_workflow_count 2 -# 
HELP test_workflow_success_total Total number of workflows completed successfully -# TYPE test_workflow_success_total counter -test_workflow_success_total 25209 -test_workflow_success_total{workflow_id="12",workflow_name="CRM Sync"} 8059 -test_workflow_success_total{workflow_id="25",workflow_name="Webhook Intake"} 14135 -test_workflow_success_total{workflow_id="33",workflow_name="Slack Alerts"} 2015 +# HELP test_production_executions Total number of production workflow executions (success + error). +# TYPE test_production_executions gauge +test_production_executions 8 -# HELP test_workflow_failed_total Total number of workflows that failed -# TYPE test_workflow_failed_total counter -test_workflow_failed_total 425 -test_workflow_failed_total{workflow_id="12",workflow_name="CRM Sync"} 83 -test_workflow_failed_total{workflow_id="25",workflow_name="Webhook Intake"} 155 -test_workflow_failed_total{workflow_id="33",workflow_name="Slack Alerts"} 187 - - -# HELP test_queue_jobs_total Total number of queue jobs -# TYPE test_queue_jobs_total counter -test_queue_jobs_total{state="waiting"} 3 -test_queue_jobs_total{state="active"} 2 -test_queue_jobs_total{state="completed"} 148 -test_queue_jobs_total{state="failed"} 5 - -# HELP test_queue_jobs_duration_seconds Job duration in seconds -# TYPE test_queue_jobs_duration_seconds histogram -test_queue_jobs_duration_seconds_bucket{le="0.1"} 22 -test_queue_jobs_duration_seconds_bucket{le="1"} 84 -test_queue_jobs_duration_seconds_bucket{le="+Inf"} 150 -test_queue_jobs_duration_seconds_sum 44.8 -test_queue_jobs_duration_seconds_count 150 - -# HELP test_queue_job_waiting_total Number of jobs currently waiting in the queue -# TYPE test_queue_job_waiting_total gauge -test_queue_job_waiting_total{queue="default"} 3 - -# HELP test_queue_job_active_total Number of jobs currently being processed -# TYPE test_queue_job_active_total gauge -test_queue_job_active_total{queue="default"} 2 - -# HELP test_queue_job_completed_total Number of jobs 
completed successfully -# TYPE test_queue_job_completed_total counter -test_queue_job_completed_total{queue="default"} 15892 +# HELP test_production_root_executions Total number of production root workflow executions (excludes sub-workflows). +# TYPE test_production_root_executions gauge +test_production_root_executions 8 -# HELP test_queue_job_failed_total Number of jobs that have failed -# TYPE test_queue_job_failed_total counter -test_queue_job_failed_total{queue="default"} 47 +# HELP test_manual_executions Total number of manual workflow executions (success + error). +# TYPE test_manual_executions gauge +test_manual_executions 0 -# HELP test_queue_job_dequeued_total Number of jobs dequeued (picked up from queue) -# TYPE test_queue_job_dequeued_total counter -test_queue_job_dequeued_total{queue="default"} 15939 +# HELP test_enabled_users Total number of enabled users. +# TYPE test_enabled_users gauge +test_enabled_users 1 + +# HELP test_users Total number of users. +# TYPE test_users gauge +test_users 1 + +# HELP test_workflows Total number of workflows. +# TYPE test_workflows gauge +test_workflows 2 + +# HELP test_credentials Total number of credentials. +# TYPE test_credentials gauge +test_credentials 0 + +# HELP test_token_exchange_requests_total Total number of token exchange requests. +# TYPE test_token_exchange_requests_total counter +test_token_exchange_requests_total{result="success"} 0 +test_token_exchange_requests_total{result="failure"} 0 -# HELP test_queue_job_enqueued_total Number of jobs added to the queue +# HELP test_token_exchange_failures_total Total number of token exchange failures broken down by reason. +# TYPE test_token_exchange_failures_total counter +test_token_exchange_failures_total{reason="invalid_token"} 0 + +# HELP test_embed_login_requests_total Total number of embed login requests. 
+# TYPE test_embed_login_requests_total counter +test_embed_login_requests_total{result="success"} 0 +test_embed_login_requests_total{result="failure"} 0 + +# HELP test_embed_login_failures_total Total number of embed login failures broken down by reason. +# TYPE test_embed_login_failures_total counter +test_embed_login_failures_total{reason="unauthorized"} 0 + +# HELP test_token_exchange_jit_provisioning_total Total number of users JIT-provisioned via token exchange. +# TYPE test_token_exchange_jit_provisioning_total counter +test_token_exchange_jit_provisioning_total 0 + +# HELP test_token_exchange_identity_linked_total Total number of external identities linked to existing users via token exchange. +# TYPE test_token_exchange_identity_linked_total counter +test_token_exchange_identity_linked_total 0 + +# HELP test_audit_workflow_activated_total Total number of n8n.audit.workflow.activated events. +# TYPE test_audit_workflow_activated_total counter +test_audit_workflow_activated_total{workflow_id="testWorkflowOk"} 1 +test_audit_workflow_activated_total{workflow_id="testWorkflowFail"} 1 + +# HELP test_queue_job_enqueued_total Total number of n8n.queue.job.enqueued events. 
# TYPE test_queue_job_enqueued_total counter -test_queue_job_enqueued_total{queue="default"} 15670 - -# HELP test_queue_job_delayed_total Number of jobs scheduled to run later -# TYPE test_queue_job_delayed_total gauge -test_queue_job_delayed_total{queue="default"} 5 - -# HELP test_queue_job_waiting_duration_seconds Duration jobs spend waiting before being processed -# TYPE test_queue_job_waiting_duration_seconds histogram -test_queue_job_waiting_duration_seconds_bucket{queue="default",le="0.1"} 50 -test_queue_job_waiting_duration_seconds_bucket{queue="default",le="1"} 241 -test_queue_job_waiting_duration_seconds_bucket{queue="default",le="5"} 820 -test_queue_job_waiting_duration_seconds_bucket{queue="default",le="10"} 1105 -test_queue_job_waiting_duration_seconds_bucket{queue="default",le="30"} 1240 -test_queue_job_waiting_duration_seconds_bucket{queue="default",le="+Inf"} 1253 -test_queue_job_waiting_duration_seconds_sum{queue="default"} 450.32 -test_queue_job_waiting_duration_seconds_count{queue="default"} 1253 - -# HELP test_api_requests_total Total API requests -# TYPE test_api_requests_total counter -test_api_requests_total{method="GET",endpoint="/workflows"} 240 -test_api_requests_total{method="POST",endpoint="/executions"} 75 - -# HELP test_api_request_duration_seconds API request duration in seconds -# TYPE test_api_request_duration_seconds histogram -test_api_request_duration_seconds_bucket{le="0.1"} 90 -test_api_request_duration_seconds_bucket{le="1"} 120 -test_api_request_duration_seconds_bucket{le="+Inf"} 125 -test_api_request_duration_seconds_sum 15.3 -test_api_request_duration_seconds_count 125 - -# HELP test_cache_operations_total Total cache operations -# TYPE test_cache_operations_total counter -test_cache_operations_total{operation="get"} 1250 -test_cache_operations_total{operation="set"} 320 -test_cache_operations_total{operation="delete"} 10 - -# HELP test_cache_hits_total Cache hits -# TYPE test_cache_hits_total counter -test_cache_hits_total 
1080 +test_queue_job_enqueued_total 8 -# HELP test_cache_misses_total Cache misses -# TYPE test_cache_misses_total counter -test_cache_misses_total 170 - -# HELP test_cache_errors_total Cache errors -# TYPE test_cache_errors_total counter -test_cache_errors_total 0 - -# HELP test_cache_latency_seconds Cache operation latency in seconds -# TYPE test_cache_latency_seconds histogram -test_cache_latency_seconds_bucket{le="0.001"} 90 -test_cache_latency_seconds_bucket{le="0.01"} 240 -test_cache_latency_seconds_bucket{le="+Inf"} 260 -test_cache_latency_seconds_sum 1.42 -test_cache_latency_seconds_count 260 - -# HELP test_eventbus_events_total Total events published on the event bus -# TYPE test_eventbus_events_total counter -test_eventbus_events_total{event_type="workflowStarted"} 140 -test_eventbus_events_total{event_type="workflowCompleted"} 135 -test_eventbus_events_total{event_type="workflowFailed"} 5 - -# HELP test_eventbus_events_processed_total Total processed events -# TYPE test_eventbus_events_processed_total counter -test_eventbus_events_processed_total 138 - -# HELP test_eventbus_events_failed_total Total failed event processing -# TYPE test_eventbus_events_failed_total counter -test_eventbus_events_failed_total 2 - -# HELP test_eventbus_queue_size Current event queue size -# TYPE test_eventbus_queue_size gauge -test_eventbus_queue_size 1 - -# HELP test_eventbus_connections_total Active event bus backend connections -# TYPE test_eventbus_connections_total gauge -test_eventbus_connections_total 1 - -# HELP test_workflow_executions_active Number of active workflow executions -# TYPE test_workflow_executions_active gauge -test_workflow_executions_active 3 - -# HELP test_queue_job_attempts_total Total number of job attempts -# TYPE test_queue_job_attempts_total counter -test_queue_job_attempts_total{result="success"} 435 -test_queue_job_attempts_total{result="failed"} 12 - -# HELP test_workflow_started_total Total number of workflows started +# HELP 
test_workflow_started_total Total number of n8n.workflow.started events. # TYPE test_workflow_started_total counter -test_workflow_started_total 25634 -test_workflow_started_total{workflow_id="12",workflow_name="CRM Sync"} 8142 -test_workflow_started_total{workflow_id="25",workflow_name="Webhook Intake"} 14290 -test_workflow_started_total{workflow_id="33",workflow_name="Slack Alerts"} 2202 +test_workflow_started_total{workflow_id="testWorkflowOk"} 4 +test_workflow_started_total{workflow_id="testWorkflowFail"} 4 -# HELP test_workflow_success_total Total number of workflows completed successfully +# HELP test_audit_workflow_executed_total Total number of n8n.audit.workflow.executed events. +# TYPE test_audit_workflow_executed_total counter +test_audit_workflow_executed_total{workflow_id="testWorkflowOk"} 4 +test_audit_workflow_executed_total{workflow_id="testWorkflowFail"} 4 + +# HELP test_workflow_success_total Total number of n8n.workflow.success events. # TYPE test_workflow_success_total counter -test_workflow_success_total 25209 -test_workflow_success_total{workflow_id="12",workflow_name="CRM Sync"} 8059 -test_workflow_success_total{workflow_id="25",workflow_name="Webhook Intake"} 14135 -test_workflow_success_total{workflow_id="33",workflow_name="Slack Alerts"} 2015 +test_workflow_success_total{workflow_id="testWorkflowOk"} 4 + +# HELP test_queue_job_completed_total Total number of n8n.queue.job.completed events. +# TYPE test_queue_job_completed_total counter +test_queue_job_completed_total 4 -# HELP test_workflow_failed_total Total number of workflows that failed +# HELP test_workflow_failed_total Total number of n8n.workflow.failed events. 
# TYPE test_workflow_failed_total counter -test_workflow_failed_total 425 -test_workflow_failed_total{workflow_id="12",workflow_name="CRM Sync"} 83 -test_workflow_failed_total{workflow_id="25",workflow_name="Webhook Intake"} 155 -test_workflow_failed_total{workflow_id="33",workflow_name="Slack Alerts"} 187 +test_workflow_failed_total{workflow_id="testWorkflowFail"} 4 + +# HELP test_queue_job_failed_total Total number of n8n.queue.job.failed events. +# TYPE test_queue_job_failed_total counter +test_queue_job_failed_total 4 +# HELP test_queue_job_dequeued_total Total number of n8n.queue.job.dequeued events. +# TYPE test_queue_job_dequeued_total counter +test_queue_job_dequeued_total 8 + +# HELP test_node_started_total Total number of n8n.node.started events. +# TYPE test_node_started_total counter +test_node_started_total{workflow_id="testWorkflowOk"} 8 +test_node_started_total{workflow_id="testWorkflowFail"} 8 + +# HELP test_node_finished_total Total number of n8n.node.finished events. +# TYPE test_node_finished_total counter +test_node_finished_total{workflow_id="testWorkflowOk"} 8 +test_node_finished_total{workflow_id="testWorkflowFail"} 8 +# HELP test_runner_task_requested_total Total number of n8n.runner.task.requested events. +# TYPE test_runner_task_requested_total counter +test_runner_task_requested_total 4 diff --git a/n8n/tests/test_e2e.py b/n8n/tests/test_e2e.py index 2571135ebce6a..f35b19bcbc786 100644 --- a/n8n/tests/test_e2e.py +++ b/n8n/tests/test_e2e.py @@ -1,13 +1,33 @@ # (C) Datadog, Inc. 2026-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) +from typing import Any, Callable + +import pytest + from datadog_checks.dev.utils import assert_service_checks +from . 
import common + + +@pytest.mark.e2e +def test_check_n8n_e2e( + dd_agent_check: Callable[..., Any], + instance: dict[str, Any], + worker_instance: dict[str, Any], +): + config = {'init_config': {}, 'instances': [instance, worker_instance]} -def test_check_n8n_e2e(dd_agent_check, instance): - aggregator = dd_agent_check(instance, rate=True) + aggregator = dd_agent_check(config, rate=True) - # Assert the readiness check metric is present with status_code tag - aggregator.assert_metric('n8n.readiness.check', value=1, tags=["status_code:200"], at_least=1) + aggregator.assert_metric('n8n.readiness.check', value=1, tags=['status_code:200', 'n8n_process:main'], at_least=1) + # Worker also exposes /healthz/readiness via QUEUE_HEALTH_CHECK_ACTIVE on its own port. + aggregator.assert_metric('n8n.readiness.check', value=1, tags=['status_code:200', 'n8n_process:worker'], at_least=1) + common.drop_rare_event_metrics(aggregator) + aggregator.assert_metrics_using_metadata( + common.get_all_metadata_metrics(exclude_rare=True), + check_submission_type=True, + check_symmetric_inclusion=True, + ) assert_service_checks(aggregator) diff --git a/n8n/tests/test_integration.py b/n8n/tests/test_integration.py new file mode 100644 index 0000000000000..965df76703ebd --- /dev/null +++ b/n8n/tests/test_integration.py @@ -0,0 +1,58 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +from typing import Any, Callable + +import pytest + +from datadog_checks.base.stubs.aggregator import AggregatorStub +from datadog_checks.n8n import N8nCheck + +from . import common + +pytestmark = [pytest.mark.usefixtures('dd_environment'), pytest.mark.integration] + + +def _run_check_twice(instance: dict[str, Any], dd_run_check: Callable[[N8nCheck], Any]) -> N8nCheck: + check = N8nCheck('n8n', {}, [instance]) + # First run primes any one-shot/cached metrics; the second exercises the steady state. 
+ dd_run_check(check) + dd_run_check(check) + return check + + +@pytest.fixture +def warmed_main( + instance: dict[str, Any], + dd_run_check: Callable[[N8nCheck], Any], + aggregator: AggregatorStub, +) -> N8nCheck: + return _run_check_twice(instance, dd_run_check) + + +@pytest.fixture +def warmed_both( + instance: dict[str, Any], + worker_instance: dict[str, Any], + dd_run_check: Callable[[N8nCheck], Any], + aggregator: AggregatorStub, +) -> AggregatorStub: + """Run the check against both the main and worker /metrics endpoints into one aggregator.""" + _run_check_twice(instance, dd_run_check) + _run_check_twice(worker_instance, dd_run_check) + return aggregator + + +def test_all_metadata_metrics_emitted(warmed_both: AggregatorStub): + """Across main and worker, every metadata metric for this n8n version is emitted.""" + common.drop_rare_event_metrics(warmed_both) + warmed_both.assert_metrics_using_metadata( + common.get_all_metadata_metrics(exclude_rare=True), + check_submission_type=True, + check_symmetric_inclusion=True, + ) + + +def test_readiness_check_metric(warmed_main: N8nCheck, aggregator: AggregatorStub): + aggregator.assert_metric('n8n.readiness.check', value=1, tags=['status_code:200', 'n8n_process:main'], at_least=1) diff --git a/n8n/tests/test_unit.py b/n8n/tests/test_unit.py index cc3b314428044..0e3ac92985289 100644 --- a/n8n/tests/test_unit.py +++ b/n8n/tests/test_unit.py @@ -2,91 +2,121 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) +from typing import Any, Callable from unittest import mock -from datadog_checks.dev.utils import get_metadata_metrics +import pytest +from requests.exceptions import ConnectionError + +from datadog_checks.base.stubs.aggregator import AggregatorStub +from datadog_checks.base.stubs.datadog_agent import DatadogAgentStub from datadog_checks.n8n import N8nCheck from . 
import common -def test_unit_metrics(dd_run_check, instance, aggregator, mock_http_response): +def test_check_emits_metrics_as_in_metadata( + dd_run_check: Callable[[N8nCheck], Any], + aggregator: AggregatorStub, + mock_http_response: Callable[..., Any], +): mock_http_response(file_path=common.get_fixture_path('n8n.txt')) + instance: dict[str, Any] = {'openmetrics_endpoint': 'http://localhost:5678/metrics'} check = N8nCheck('n8n', {}, [instance]) - dd_run_check(check) + with mock.patch.object(N8nCheck, '_check_n8n_readiness', return_value=None): + dd_run_check(check) - for metric in common.TEST_METRICS: - aggregator.assert_metric(metric) - aggregator.assert_all_metrics_covered() - aggregator.assert_metrics_using_metadata(get_metadata_metrics()) + aggregator.assert_metrics_using_metadata( + common.get_openmetrics_metadata_metrics(major=2), + check_submission_type=True, + check_symmetric_inclusion=True, + ) -def test_metrics_custom_prefx(dd_run_check, aggregator, mock_http_response): +def test_metrics_custom_prefix( + dd_run_check: Callable[[N8nCheck], Any], + aggregator: AggregatorStub, + mock_http_response: Callable[..., Any], +): mock_http_response(file_path=common.get_fixture_path('n8n_custom.txt')) - instance = { + instance: dict[str, Any] = { 'openmetrics_endpoint': 'http://localhost:5678/metrics', 'raw_metric_prefix': 'test_', } check = N8nCheck('n8n', {}, [instance]) - dd_run_check(check) + with mock.patch.object(N8nCheck, '_check_n8n_readiness', return_value=None): + dd_run_check(check) - for metric in common.TEST_METRICS: - aggregator.assert_metric(metric) - aggregator.assert_all_metrics_covered() - aggregator.assert_metrics_using_metadata(get_metadata_metrics()) + aggregator.assert_metrics_using_metadata( + common.get_openmetrics_metadata_metrics(major=2), + check_submission_type=True, + check_symmetric_inclusion=True, + ) -def test_readiness_check_ready(aggregator, instance): +@pytest.fixture +def initialized_check(instance: dict[str, Any]) -> N8nCheck: + 
check = N8nCheck('n8n', {}, [instance]) + check.load_configuration_models() + return check + + +@pytest.mark.parametrize( + 'status_code, expected_value', + [ + pytest.param(200, 1, id='ready'), + pytest.param(503, 0, id='not_ready'), + ], +) +def test_readiness_check( + aggregator: AggregatorStub, + initialized_check: N8nCheck, + status_code: int, + expected_value: int, +): with mock.patch( 'requests.Session.get', - return_value=mock.Mock(ok=True, status_code=200), + return_value=mock.Mock(ok=expected_value == 1, status_code=status_code), ): - check = N8nCheck('n8n', {}, [instance]) - check._check_n8n_readiness() + initialized_check._check_n8n_readiness() - # Assert metric value is 1 (ready) with status_code:200 tag - aggregator.assert_metric('n8n.readiness.check', value=1, tags=['status_code:200']) + aggregator.assert_metric( + 'n8n.readiness.check', + value=expected_value, + tags=['n8n_process:main', f'status_code:{status_code}'], + ) -def test_readiness_check_not_ready(aggregator, instance): - with mock.patch( - 'requests.Session.get', - return_value=mock.Mock(ok=False, status_code=503), - ): - check = N8nCheck('n8n', {}, [instance]) - check._check_n8n_readiness() +def test_readiness_check_unreachable(aggregator: AggregatorStub, initialized_check: N8nCheck): + with mock.patch('requests.Session.get', side_effect=ConnectionError('boom')): + initialized_check._check_n8n_readiness() - # Assert metric value is 0 (not ready) with status_code:503 tag - aggregator.assert_metric('n8n.readiness.check', value=0, tags=['status_code:503']) + aggregator.assert_metric('n8n.readiness.check', value=0, tags=['n8n_process:main', 'status_code:none']) -def test_readiness_check_no_status_code(aggregator, instance): - with mock.patch( - 'requests.Session.get', - return_value=mock.Mock(ok=False, status_code=None), - ): - check = N8nCheck('n8n', {}, [instance]) - check._check_n8n_readiness() - - # Assert metric value is 0 (not ready) with status_code:null tag - 
aggregator.assert_metric('n8n.readiness.check', value=0, tags=['status_code:null']) +def test_readiness_uses_endpoint_host_not_metrics_path(initialized_check: N8nCheck): + """The readiness endpoint must be derived from the host, not appended to /metrics.""" + expected = f'http://{common.HOST}:{common.MAIN_PORT}/healthz/readiness' + assert initialized_check._readiness_endpoint() == expected -def test_version_metadata(datadog_agent, dd_run_check, mock_http_response, instance): - """ - Test version metadata collection from Prometheus metrics - """ +def test_version_metadata( + datadog_agent: DatadogAgentStub, + dd_run_check: Callable[[N8nCheck], Any], + mock_http_response: Callable[..., Any], + instance: dict[str, Any], +): mock_http_response(file_path=common.get_fixture_path('n8n.txt')) check = N8nCheck('n8n', {}, [instance]) check.check_id = 'n8n_test' - dd_run_check(check) - # Version from fixture: n8n_version_info{version="v1.117.2",major="1",minor="117",patch="2"} 1 + with mock.patch.object(N8nCheck, '_check_n8n_readiness', return_value=None): + dd_run_check(check) version_metadata = { 'version.scheme': 'semver', - 'version.major': '1', - 'version.minor': '117', - 'version.patch': '2', - 'version.raw': 'v1.117.2', + 'version.major': '2', + 'version.minor': '19', + 'version.patch': '5', + 'version.raw': 'v2.19.5', } datadog_agent.assert_metadata('n8n_test', version_metadata) From 299129596f29b067b84fc8bf9e12b0191a44c1cd Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Fri, 8 May 2026 12:54:26 +0200 Subject: [PATCH 2/7] Add changelog for PR #23635 --- n8n/changelog.d/23635.added | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 n8n/changelog.d/23635.added diff --git a/n8n/changelog.d/23635.added b/n8n/changelog.d/23635.added new file mode 100644 index 0000000000000..9ab2409ea457f --- /dev/null +++ b/n8n/changelog.d/23635.added @@ -0,0 +1,6 @@ +- Map every n8n 2.x metric family verified live against n8n 2.19.5: ``workflow.execution.duration.seconds.*`` 
histogram, ``audit.workflow.activated``/``executed``, ``embed.login.requests``/``failures``, ``token.exchange.requests``/``failures``/``identity.linked``/``jit.provisioning``, ``process.pss.bytes``, ``runner.task.requested``, and the ``workflow_statistics`` family (``production.executions``, ``production.root.executions``, ``manual.executions``, ``users.total``, ``enabled.users``, ``workflows.total``, ``credentials.total``). +- Restore valid metrics that the integration was previously dropping: ``queue.job.dequeued``, ``nodejs.active.requests``. +- Add worker-only families ``node.started``, ``node.finished``, ``queue.job.dequeued``, ``runner.task.requested`` and document scraping the n8n worker process as a separate Datadog instance. +- Remove the gating of OpenMetrics scraping on ``/healthz/readiness`` — ``n8n.readiness.check`` is still submitted, but metrics keep flowing when readiness reports degraded so SRE-relevant signals (queue depth, process state) are not lost during incidents. +- Document version-specific metric availability and the n8n env flags that gate them (``N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS``, ``N8N_METRICS_INCLUDE_WORKFLOW_EXECUTION_DURATION``, ``N8N_METRICS_INCLUDE_QUEUE_METRICS``). +- Use the actual ``/metrics`` URL in the ``openmetrics_endpoint`` example in ``conf.yaml.example``/``spec.yaml`` (was previously the host root, which silently mismatched the scrape path the check uses). 
\ No newline at end of file From 12f31227faa81031cd86f7a25cfde95826e47e0e Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Fri, 8 May 2026 14:27:57 +0200 Subject: [PATCH 3/7] Refine n8n metric coverage and e2e setup --- n8n/README.md | 15 ++++---- n8n/datadog_checks/n8n/metrics.py | 10 ++++-- n8n/metadata.csv | 6 +++- n8n/tests/common.py | 60 ++++++++++++++++--------------- n8n/tests/conftest.py | 37 +++++++------------ n8n/tests/fixtures/n8n.txt | 16 +++++++++ n8n/tests/fixtures/n8n_custom.txt | 16 +++++++++ n8n/tests/test_e2e.py | 6 +--- 8 files changed, 96 insertions(+), 70 deletions(-) diff --git a/n8n/README.md b/n8n/README.md index 76339740046a4..dfccf49c70c6f 100644 --- a/n8n/README.md +++ b/n8n/README.md @@ -2,7 +2,7 @@ ## Overview -This check monitors [n8n][1] through the Datadog Agent. +This check monitors [n8n][1] through the Datadog Agent. Collect n8n metrics including: - Cache metrics: hit, miss, and update counts. @@ -42,7 +42,7 @@ N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL=true N8N_METRICS_INCLUDE_API_ENDPOINTS=true N8N_METRICS_INCLUDE_QUEUE_METRICS=true -# Optional: n8n 2.x adds workflow_statistics gauges (workflows, users, executions, ...) — opt in +# Optional: n8n 2.x adds workflow_statistics gauges (workflows, users, executions, ...) - opt in N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS=true # Optional: Customize the metric prefix (default is 'n8n_') @@ -59,7 +59,7 @@ Some n8n counters are registered dynamically the first time the corresponding ev #### Queue mode and workers -In queue mode, n8n runs separate worker processes that execute jobs picked up from a Redis-backed queue. Each worker exposes its own `/metrics` endpoint and emits a different subset of metrics than the main process. Worker-observed metrics include `n8n.queue.job.dequeued.count`, `n8n.node.started.count`, `n8n.node.finished.count`, and (n8n 2.x) `n8n.runner.task.requested.count`. Main-only metrics include `n8n.instance.role.leader` and the `n8n.scaling.mode.queue.jobs.*` family. 
+In queue mode, n8n runs separate worker processes that execute jobs picked up from a Redis-backed queue. Each worker exposes its own `/metrics` endpoint and emits a different subset of metrics than the main process. Worker-observed metrics include `n8n.queue.job.dequeued.count`, `n8n.node.started.count`, `n8n.node.finished.count`, and `n8n.runner.task.requested.count`. Main-only metrics include `n8n.instance.role.leader` and the `n8n.scaling.mode.queue.jobs.*` family. To expose worker metrics, set `QUEUE_HEALTH_CHECK_ACTIVE=true` and `QUEUE_HEALTH_CHECK_PORT=` on each worker. **In n8n 2.x, port `5679` is reserved for the task runner broker, so pick a different port (for example `5680`).** @@ -76,16 +76,13 @@ instances: Several metric families were introduced in n8n 2.x and are not emitted on n8n 1.x: - `n8n.workflow.execution.duration.seconds.*` (histogram) -- `n8n.audit.workflow.activated.count`, `n8n.audit.workflow.executed.count` +- `n8n.audit.workflow.activated.count`, `n8n.audit.workflow.deactivated.count`, `n8n.audit.workflow.executed.count`, `n8n.audit.workflow.resumed.count`, `n8n.audit.workflow.version.updated.count`, and `n8n.audit.workflow.waiting.count` - `n8n.embed.login.requests.count` (tagged with `result:success`/`failure`), `n8n.embed.login.failures.count` (tagged with `reason`) - `n8n.token.exchange.requests.count` (tagged with `result:success`/`failure`), `n8n.token.exchange.failures.count` (tagged with `reason`), `n8n.token.exchange.identity.linked.count`, `n8n.token.exchange.jit.provisioning.count` - `n8n.process.pss.bytes` (Linux only) -- `n8n.runner.task.requested.count` (worker-only) -- The `n8n.{production,manual,production.root}.executions`, `n8n.users.total`, `n8n.enabled.users`, `n8n.workflows.total`, and `n8n.credentials.total` family — only emitted when `N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS=true` is set. 
+- The `n8n.{production,manual,production.root}.executions`, `n8n.users.total`, `n8n.enabled.users`, `n8n.workflows.total`, and `n8n.credentials.total` family - only emitted when `N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS=true` is set. -The failures-only counters (`*.failures.count`) and the libuv `n8n.nodejs.active.requests` gauge only emit samples once the corresponding event fires (an auth failure, an in-flight libuv request); a healthy idle deployment may not produce any data points for them. - -The `metadata.csv` description for each affected metric calls out its version requirement. +Some metrics only emit samples after the corresponding runtime event occurs. For example, failures-only counters (`*.failures.count`) need an authentication failure, audit workflow counters need the matching workflow state transition, and the libuv `n8n.nodejs.active.requests` gauge needs an in-flight libuv request. A healthy idle deployment may not produce data points for these metrics until that activity occurs. #### Tag cardinality diff --git a/n8n/datadog_checks/n8n/metrics.py b/n8n/datadog_checks/n8n/metrics.py index 7a3be922b6f09..4961712939f6b 100644 --- a/n8n/datadog_checks/n8n/metrics.py +++ b/n8n/datadog_checks/n8n/metrics.py @@ -3,7 +3,7 @@ # Licensed under a 3-clause BSD style license (see LICENSE) # Metrics emitted by n8n's /metrics endpoint, verified live against n8n@1.118.1 -# and n8n@2.19.5 with the test environment in `tests/docker/`. +# and n8n@2.19.5. # # The OpenMetrics base check strips `_total` from counter names before lookup # and appends `.count` on submission, so counter keys here are written without @@ -13,7 +13,7 @@ # `n8n...` becomes counter `___total`) and only appear once # the corresponding event fires at runtime. In queue mode, worker processes # emit `node_started_total`, `node_finished_total`, `queue_job_dequeued_total`, -# and (n8n 2.x+) `runner_task_requested_total`. +# and `runner_task_requested_total`. 
# # Several families were introduced in n8n 2.x (see the README "Version-specific # metrics" section). The `workflow_statistics_*` and SSO/embed token-exchange @@ -22,7 +22,11 @@ METRIC_MAP = { 'active_workflow_count': 'active.workflow.count', 'audit_workflow_activated': 'audit.workflow.activated', # n8n 2.x+ + 'audit_workflow_deactivated': 'audit.workflow.deactivated', # n8n 2.x+ 'audit_workflow_executed': 'audit.workflow.executed', # n8n 2.x+ + 'audit_workflow_resumed': 'audit.workflow.resumed', # n8n 2.x+ + 'audit_workflow_version_updated': 'audit.workflow.version.updated', # n8n 2.x+ + 'audit_workflow_waiting': 'audit.workflow.waiting', # n8n 2.x+ 'cache_hits': 'cache.hits', 'cache_misses': 'cache.misses', 'cache_updates': 'cache.updates', @@ -80,7 +84,7 @@ 'queue_job_dequeued': 'queue.job.dequeued', 'queue_job_enqueued': 'queue.job.enqueued', 'queue_job_failed': 'queue.job.failed', - 'runner_task_requested': 'runner.task.requested', # n8n 2.x+ + 'runner_task_requested': 'runner.task.requested', 'scaling_mode_queue_jobs_active': 'scaling.mode.queue.jobs.active', 'scaling_mode_queue_jobs_completed': 'scaling.mode.queue.jobs.completed', 'scaling_mode_queue_jobs_failed': 'scaling.mode.queue.jobs.failed', diff --git a/n8n/metadata.csv b/n8n/metadata.csv index 4ba6daa38c79e..3cf493e4dba2b 100644 --- a/n8n/metadata.csv +++ b/n8n/metadata.csv @@ -1,7 +1,11 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags n8n.active.workflow.count,gauge,,,,Total number of active workflows.,0,n8n,,, n8n.audit.workflow.activated.count,count,,,,Total number of audited workflow activations. Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.deactivated.count,count,,,,Total number of audited workflow deactivations. Available in n8n 2.x and later.,0,n8n,,, n8n.audit.workflow.executed.count,count,,,,Total number of audited workflow executions. 
Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.resumed.count,count,,,,Total number of audited workflow resumptions. Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.version.updated.count,count,,,,Total number of audited workflow version updates. Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.waiting.count,count,,,,Total number of audited workflow executions entering a waiting state. Available in n8n 2.x and later.,0,n8n,,, n8n.cache.hits.count,count,,,,Total number of cache hits.,0,n8n,,, n8n.cache.misses.count,count,,,,Total number of cache misses.,0,n8n,,, n8n.cache.updates.count,count,,,,Total number of cache updates.,0,n8n,,, @@ -57,7 +61,7 @@ n8n.queue.job.dequeued.count,count,,,,Number of jobs dequeued by workers (n8n.qu n8n.queue.job.enqueued.count,count,,,,Number of jobs added to the queue (n8n.queue.job.enqueued event).,0,n8n,,, n8n.queue.job.failed.count,count,,,,Number of jobs that have failed (n8n.queue.job.failed event).,0,n8n,,, n8n.readiness.check,gauge,,,,Readiness check status (1 if ready with status code 200 otherwise 0) with status code tag.,0,n8n,,,status_code -n8n.runner.task.requested.count,count,,,,Total number of runner tasks requested by worker processes. 
Available in n8n 2.x and later.,0,n8n,,, +n8n.runner.task.requested.count,count,,,,Total number of runner tasks requested by worker processes.,0,n8n,,, n8n.scaling.mode.queue.jobs.active,gauge,,,,Current number of jobs being processed across all workers in scaling mode.,0,n8n,,, n8n.scaling.mode.queue.jobs.completed.count,count,,,,Total number of jobs completed across all workers in scaling mode since instance start.,0,n8n,,, n8n.scaling.mode.queue.jobs.failed.count,count,,,,Total number of jobs failed across all workers in scaling mode since instance start.,0,n8n,,, diff --git a/n8n/tests/common.py b/n8n/tests/common.py index 5403b42a3317c..85dded9652b2d 100644 --- a/n8n/tests/common.py +++ b/n8n/tests/common.py @@ -16,15 +16,6 @@ # with stale containers or other locally-bound services. The in-container ports stay fixed. MAIN_PORT, WORKER_PORT = find_free_ports('127.0.0.1', 2) - -def get_compose_env_vars() -> dict[str, str]: - """Variables consumed by docker-compose.yaml's ``${...}`` placeholders.""" - return { - 'N8N_MAIN_HOST_PORT': str(MAIN_PORT), - 'N8N_WORKER_HOST_PORT': str(WORKER_PORT), - } - - N8N_VERSION = os.environ.get('N8N_VERSION', '1.118.1') N8N_MAJOR = int(N8N_VERSION.split('.', 1)[0]) @@ -35,7 +26,11 @@ def get_compose_env_vars() -> dict[str, str]: V2_ONLY_METRIC_NAMES = frozenset( { 'n8n.audit.workflow.activated.count', + 'n8n.audit.workflow.deactivated.count', 'n8n.audit.workflow.executed.count', + 'n8n.audit.workflow.resumed.count', + 'n8n.audit.workflow.version.updated.count', + 'n8n.audit.workflow.waiting.count', 'n8n.credentials.total', 'n8n.embed.login.failures.count', 'n8n.embed.login.requests.count', @@ -44,7 +39,6 @@ def get_compose_env_vars() -> dict[str, str]: 'n8n.process.pss.bytes', 'n8n.production.executions', 'n8n.production.root.executions', - 'n8n.runner.task.requested.count', 'n8n.token.exchange.failures.count', 'n8n.token.exchange.identity.linked.count', 'n8n.token.exchange.jit.provisioning.count', @@ -58,12 +52,17 @@ def 
get_compose_env_vars() -> dict[str, str]: ) # Metrics that are mapped and present in metadata but only emit samples after a specific -# event fires (auth failure, libuv request mid-flight). The unit fixture has synthetic -# samples for them; live integration/e2e runs cannot guarantee samples and exclude them -# from the symmetric metadata assertion. +# event fires (auth failure, audit state transition, libuv request mid-flight). The unit +# fixture has synthetic samples for them; live integration/e2e runs cannot guarantee +# samples and exclude them from the symmetric metadata assertion. RARE_EVENT_METRIC_NAMES = frozenset( { + 'n8n.audit.workflow.deactivated.count', + 'n8n.audit.workflow.resumed.count', + 'n8n.audit.workflow.version.updated.count', + 'n8n.audit.workflow.waiting.count', 'n8n.embed.login.failures.count', + 'n8n.runner.task.requested.count', 'n8n.token.exchange.failures.count', # prom-client's per-type libuv request gauge: only has samples while a libuv request is in flight # at scrape time, so live containers can produce or omit it depending on timing. 
@@ -71,6 +70,26 @@ def get_compose_env_vars() -> dict[str, str]: } ) +MAIN_INSTANCE = { + 'openmetrics_endpoint': f'http://{HOST}:{MAIN_PORT}/metrics', + 'tags': ['n8n_process:main'], +} +WORKER_INSTANCE = { + 'openmetrics_endpoint': f'http://{HOST}:{WORKER_PORT}/metrics', + 'tags': ['n8n_process:worker'], +} +INSTANCE = MAIN_INSTANCE # back-compat default for unit tests + +E2E_METADATA = {'docker_volumes': ['/var/run/docker.sock:/var/run/docker.sock:ro']} + + +def get_compose_env_vars() -> dict[str, str]: + """Variables consumed by docker-compose.yaml's ``${...}`` placeholders.""" + return { + 'N8N_MAIN_HOST_PORT': str(MAIN_PORT), + 'N8N_WORKER_HOST_PORT': str(WORKER_PORT), + } + def get_fixture_path(filename: str) -> str: return os.path.join(HERE, 'fixtures', filename) @@ -112,18 +131,3 @@ def drop_rare_event_metrics(aggregator: AggregatorStub): """ for name in RARE_EVENT_METRIC_NAMES: aggregator._metrics.pop(name, None) - - -MAIN_INSTANCE = { - 'openmetrics_endpoint': f'http://{HOST}:{MAIN_PORT}/metrics', - 'tags': ['n8n_process:main'], -} -WORKER_INSTANCE = { - 'openmetrics_endpoint': f'http://{HOST}:{WORKER_PORT}/metrics', - 'tags': ['n8n_process:worker'], -} -INSTANCE = MAIN_INSTANCE # back-compat default for unit tests - -E2E_METADATA = { - 'docker_volumes': ['/var/run/docker.sock:/var/run/docker.sock:ro'], -} diff --git a/n8n/tests/conftest.py b/n8n/tests/conftest.py index 4f560ece01fa9..6539f85e00ab2 100644 --- a/n8n/tests/conftest.py +++ b/n8n/tests/conftest.py @@ -5,6 +5,7 @@ import copy import subprocess import time +from contextlib import suppress from typing import Any, Iterator import pytest @@ -33,11 +34,9 @@ def _docker_exec(*cmd: str) -> str: def _wait_for_n8n(timeout: int = 90): deadline = time.monotonic() + timeout while time.monotonic() < deadline: - try: - if requests.get(f'http://{common.HOST}:{common.MAIN_PORT}/healthz', timeout=2).status_code == 200: + with suppress(requests.RequestException): + if 
requests.get(common.MAIN_INSTANCE['openmetrics_endpoint'], timeout=2).status_code == 200: return - except requests.RequestException: - pass time.sleep(2) raise RuntimeError('n8n did not become healthy in time') @@ -50,10 +49,7 @@ def _activate_imported_workflows(): for wf_id in (WORKFLOW_OK_ID, WORKFLOW_FAIL_ID): _docker_exec('n8n', 'update:workflow', f'--id={wf_id}', '--active=true') - subprocess.check_call( - ['docker', 'compose', '-f', common.COMPOSE_FILE, 'restart', 'n8n'], - stderr=subprocess.STDOUT, - ) + subprocess.check_call(['docker', 'restart', CONTAINER], stderr=subprocess.STDOUT) _wait_for_n8n() @@ -67,18 +63,14 @@ def _generate_workflow_traffic(iterations: int = 5): api_paths = ('/healthz', '/healthz/readiness', '/rest/login') ok_responses = 0 for _ in range(iterations): - try: + with suppress(requests.RequestException): ok = requests.get(f'{base_url}{WEBHOOK_OK_PATH}', timeout=5) if ok.status_code < 500: ok_responses += 1 - except requests.RequestException: - pass # Webhook fail is *expected* to error out — that's the point of triggering it. 
for path in (WEBHOOK_FAIL_PATH, *api_paths): - try: + with suppress(requests.RequestException): requests.get(f'{base_url}{path}', timeout=5) - except requests.RequestException: - pass if ok_responses == 0: raise RuntimeError('Test webhook returned no successful responses; workflow registration failed') @@ -88,30 +80,27 @@ def _wait_for_workflow_metric(timeout: int = 30): deadline = time.monotonic() + timeout metrics_url = common.MAIN_INSTANCE['openmetrics_endpoint'] while time.monotonic() < deadline: - try: + with suppress(requests.RequestException): payload = requests.get(metrics_url, timeout=3).text for line in payload.splitlines(): if line.startswith('n8n_workflow_started_total') and not line.endswith(' 0'): return - except requests.RequestException: - pass time.sleep(2) raise RuntimeError('workflow_started_total never went non-zero') @pytest.fixture(scope='session') -def dd_environment() -> Iterator[dict[str, Any]]: +def dd_environment() -> Iterator[tuple[dict[str, Any], dict[str, Any]]]: conditions = [ CheckEndpoints(common.MAIN_INSTANCE['openmetrics_endpoint']), CheckEndpoints(common.WORKER_INSTANCE['openmetrics_endpoint']), + _activate_imported_workflows, + _generate_workflow_traffic, + _wait_for_workflow_metric, ] with docker_run(common.COMPOSE_FILE, conditions=conditions, env_vars=common.get_compose_env_vars()): - _activate_imported_workflows() - _generate_workflow_traffic() - _wait_for_workflow_metric() - yield { - 'instances': [common.MAIN_INSTANCE, common.WORKER_INSTANCE], - } + config = {'instances': [common.MAIN_INSTANCE, common.WORKER_INSTANCE]} + yield config, common.E2E_METADATA @pytest.fixture diff --git a/n8n/tests/fixtures/n8n.txt b/n8n/tests/fixtures/n8n.txt index 0a47cf518a9c0..62b13a73a7dcd 100644 --- a/n8n/tests/fixtures/n8n.txt +++ b/n8n/tests/fixtures/n8n.txt @@ -351,6 +351,10 @@ n8n_token_exchange_identity_linked_total 0 n8n_audit_workflow_activated_total{workflow_id="testWorkflowOk"} 1 
n8n_audit_workflow_activated_total{workflow_id="testWorkflowFail"} 1 +# HELP n8n_audit_workflow_deactivated_total Total number of n8n.audit.workflow.deactivated events. +# TYPE n8n_audit_workflow_deactivated_total counter +n8n_audit_workflow_deactivated_total{workflow_id="testWorkflowOk"} 1 + # HELP n8n_queue_job_enqueued_total Total number of n8n.queue.job.enqueued events. # TYPE n8n_queue_job_enqueued_total counter n8n_queue_job_enqueued_total 8 @@ -365,6 +369,18 @@ n8n_workflow_started_total{workflow_id="testWorkflowFail"} 4 n8n_audit_workflow_executed_total{workflow_id="testWorkflowOk"} 4 n8n_audit_workflow_executed_total{workflow_id="testWorkflowFail"} 4 +# HELP n8n_audit_workflow_resumed_total Total number of n8n.audit.workflow.resumed events. +# TYPE n8n_audit_workflow_resumed_total counter +n8n_audit_workflow_resumed_total{workflow_id="testWorkflowOk"} 1 + +# HELP n8n_audit_workflow_version_updated_total Total number of n8n.audit.workflow.version.updated events. +# TYPE n8n_audit_workflow_version_updated_total counter +n8n_audit_workflow_version_updated_total{workflow_id="testWorkflowOk"} 1 + +# HELP n8n_audit_workflow_waiting_total Total number of n8n.audit.workflow.waiting events. +# TYPE n8n_audit_workflow_waiting_total counter +n8n_audit_workflow_waiting_total{workflow_id="testWorkflowOk"} 1 + # HELP n8n_workflow_success_total Total number of n8n.workflow.success events. 
# TYPE n8n_workflow_success_total counter n8n_workflow_success_total{workflow_id="testWorkflowOk"} 4 diff --git a/n8n/tests/fixtures/n8n_custom.txt b/n8n/tests/fixtures/n8n_custom.txt index 70820dfff85c2..0442331ad78c2 100644 --- a/n8n/tests/fixtures/n8n_custom.txt +++ b/n8n/tests/fixtures/n8n_custom.txt @@ -351,6 +351,10 @@ test_token_exchange_identity_linked_total 0 test_audit_workflow_activated_total{workflow_id="testWorkflowOk"} 1 test_audit_workflow_activated_total{workflow_id="testWorkflowFail"} 1 +# HELP test_audit_workflow_deactivated_total Total number of n8n.audit.workflow.deactivated events. +# TYPE test_audit_workflow_deactivated_total counter +test_audit_workflow_deactivated_total{workflow_id="testWorkflowOk"} 1 + # HELP test_queue_job_enqueued_total Total number of n8n.queue.job.enqueued events. # TYPE test_queue_job_enqueued_total counter test_queue_job_enqueued_total 8 @@ -365,6 +369,18 @@ test_workflow_started_total{workflow_id="testWorkflowFail"} 4 test_audit_workflow_executed_total{workflow_id="testWorkflowOk"} 4 test_audit_workflow_executed_total{workflow_id="testWorkflowFail"} 4 +# HELP test_audit_workflow_resumed_total Total number of n8n.audit.workflow.resumed events. +# TYPE test_audit_workflow_resumed_total counter +test_audit_workflow_resumed_total{workflow_id="testWorkflowOk"} 1 + +# HELP test_audit_workflow_version_updated_total Total number of n8n.audit.workflow.version.updated events. +# TYPE test_audit_workflow_version_updated_total counter +test_audit_workflow_version_updated_total{workflow_id="testWorkflowOk"} 1 + +# HELP test_audit_workflow_waiting_total Total number of n8n.audit.workflow.waiting events. +# TYPE test_audit_workflow_waiting_total counter +test_audit_workflow_waiting_total{workflow_id="testWorkflowOk"} 1 + # HELP test_workflow_success_total Total number of n8n.workflow.success events. 
# TYPE test_workflow_success_total counter test_workflow_success_total{workflow_id="testWorkflowOk"} 4 diff --git a/n8n/tests/test_e2e.py b/n8n/tests/test_e2e.py index f35b19bcbc786..6075e820d5fa9 100644 --- a/n8n/tests/test_e2e.py +++ b/n8n/tests/test_e2e.py @@ -13,12 +13,8 @@ @pytest.mark.e2e def test_check_n8n_e2e( dd_agent_check: Callable[..., Any], - instance: dict[str, Any], - worker_instance: dict[str, Any], ): - config = {'init_config': {}, 'instances': [instance, worker_instance]} - - aggregator = dd_agent_check(config, rate=True) + aggregator = dd_agent_check(rate=True) aggregator.assert_metric('n8n.readiness.check', value=1, tags=['status_code:200', 'n8n_process:main'], at_least=1) # Worker also exposes /healthz/readiness via QUEUE_HEALTH_CHECK_ACTIVE on its own port. From 8523188de8efe960fdd710eb2701eae0f0291865 Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Fri, 8 May 2026 14:33:12 +0200 Subject: [PATCH 4/7] Document raw_metric_prefix requirement when customizing N8N_METRICS_PREFIX --- n8n/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/n8n/README.md b/n8n/README.md index dfccf49c70c6f..7882427834165 100644 --- a/n8n/README.md +++ b/n8n/README.md @@ -51,7 +51,13 @@ N8N_METRICS_PREFIX=n8n_ For more details, see the n8n documentation on [enabling Prometheus metrics][10]. -Set `openmetrics_endpoint` in `conf.yaml` to the full `/metrics` URL of your n8n process, for example `http://localhost:5678/metrics`. +If you change `N8N_METRICS_PREFIX` from its default of `n8n_`, you **must** also set `raw_metric_prefix` in the integration's `conf.yaml` to the same value. 
Otherwise the check will not recognize the exposed metric names and will silently submit nothing: + +```yaml +instances: + - openmetrics_endpoint: http://localhost:5678/metrics + raw_metric_prefix: my_custom_prefix_ +``` #### Event-driven counters From 1be3b3dc6fdf7294017743c5e05b3af4e2d6e2d7 Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Fri, 8 May 2026 14:37:51 +0200 Subject: [PATCH 5/7] Reformat changelog so towncrier renders sub-bullets correctly --- n8n/changelog.d/23635.added | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/n8n/changelog.d/23635.added b/n8n/changelog.d/23635.added index 9ab2409ea457f..8baf20ccd7533 100644 --- a/n8n/changelog.d/23635.added +++ b/n8n/changelog.d/23635.added @@ -1,6 +1,9 @@ -- Map every n8n 2.x metric family verified live against n8n 2.19.5: ``workflow.execution.duration.seconds.*`` histogram, ``audit.workflow.activated``/``executed``, ``embed.login.requests``/``failures``, ``token.exchange.requests``/``failures``/``identity.linked``/``jit.provisioning``, ``process.pss.bytes``, ``runner.task.requested``, and the ``workflow_statistics`` family (``production.executions``, ``production.root.executions``, ``manual.executions``, ``users.total``, ``enabled.users``, ``workflows.total``, ``credentials.total``). +Update the n8n metric coverage and test harness, verified live against n8n 1.118.1 and 2.19.5: + +- Map every n8n 2.x metric family verified live against n8n 2.19.5: ``workflow.execution.duration.seconds.*`` histogram, ``audit.workflow.activated``/``deactivated``/``executed``/``resumed``/``version.updated``/``waiting``, ``embed.login.requests``/``failures``, ``token.exchange.requests``/``failures``/``identity.linked``/``jit.provisioning``, ``process.pss.bytes``, ``runner.task.requested``, and the ``workflow_statistics`` family (``production.executions``, ``production.root.executions``, ``manual.executions``, ``users.total``, ``enabled.users``, ``workflows.total``, ``credentials.total``). 
- Restore valid metrics that the integration was previously dropping: ``queue.job.dequeued``, ``nodejs.active.requests``. - Add worker-only families ``node.started``, ``node.finished``, ``queue.job.dequeued``, ``runner.task.requested`` and document scraping the n8n worker process as a separate Datadog instance. - Remove the gating of OpenMetrics scraping on ``/healthz/readiness`` — ``n8n.readiness.check`` is still submitted, but metrics keep flowing when readiness reports degraded so SRE-relevant signals (queue depth, process state) are not lost during incidents. - Document version-specific metric availability and the n8n env flags that gate them (``N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS``, ``N8N_METRICS_INCLUDE_WORKFLOW_EXECUTION_DURATION``, ``N8N_METRICS_INCLUDE_QUEUE_METRICS``). -- Use the actual ``/metrics`` URL in the ``openmetrics_endpoint`` example in ``conf.yaml.example``/``spec.yaml`` (was previously the host root, which silently mismatched the scrape path the check uses). \ No newline at end of file +- Use the actual ``/metrics`` URL in the ``openmetrics_endpoint`` example in ``conf.yaml.example``/``spec.yaml`` (was previously the host root, which silently mismatched the scrape path the check uses). +- Document that ``raw_metric_prefix`` in ``conf.yaml`` must be kept in sync with a customised ``N8N_METRICS_PREFIX`` for the check to recognise the exposed metric names. From af60d114c9ce55450e53d476e0d8502b6a044126 Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Fri, 8 May 2026 14:56:50 +0200 Subject: [PATCH 6/7] Add tests/lab traffic generator for n8n A long-running n8n simulation that layers on top of the integration test environment so a real Datadog Agent can ship metrics to a Datadog org for dashboard / monitor iteration. - tests/lab/workflows/: five lab-only workflow JSONs covering distinct shapes (fast, slow Wait node, always-fail Code, flaky 30%, four-step chain). 
- tests/lab/traffic_generator.py: click CLI (start/generate/stop) that runs ddev env start --base, copies + imports + activates the lab workflows, restarts n8n, and drives a configurable async traffic mix against the webhooks and REST API. - tests/lab/config.yaml: webhook + REST probabilities and tick / reload intervals; hot-reloaded while the generator runs. - tests/lab/.ddev.toml: pins the lab to an `n8nlab` ddev org. - tests/lab/run_lab.sh: bash entrypoint with an EXIT trap so Ctrl+C always runs lab:stop. - hatch.toml: new [envs.lab] env with click/httpx/pyyaml/rich and start/generate/stop scripts. --- n8n/hatch.toml | 11 ++ n8n/tests/lab/README.md | 91 +++++++++ n8n/tests/lab/config.yaml | 25 +++ n8n/tests/lab/run_lab.sh | 47 +++++ n8n/tests/lab/traffic_generator.py | 260 +++++++++++++++++++++++++ n8n/tests/lab/workflows/lab_chain.json | 82 ++++++++ n8n/tests/lab/workflows/lab_fail.json | 38 ++++ n8n/tests/lab/workflows/lab_fast.json | 42 ++++ n8n/tests/lab/workflows/lab_flaky.json | 38 ++++ n8n/tests/lab/workflows/lab_slow.json | 54 +++++ 10 files changed, 688 insertions(+) create mode 100644 n8n/tests/lab/README.md create mode 100644 n8n/tests/lab/config.yaml create mode 100755 n8n/tests/lab/run_lab.sh create mode 100644 n8n/tests/lab/traffic_generator.py create mode 100644 n8n/tests/lab/workflows/lab_chain.json create mode 100644 n8n/tests/lab/workflows/lab_fail.json create mode 100644 n8n/tests/lab/workflows/lab_fast.json create mode 100644 n8n/tests/lab/workflows/lab_flaky.json create mode 100644 n8n/tests/lab/workflows/lab_slow.json diff --git a/n8n/hatch.toml b/n8n/hatch.toml index 945448e0ac48b..b1232ac43fbf2 100644 --- a/n8n/hatch.toml +++ b/n8n/hatch.toml @@ -10,3 +10,14 @@ matrix.version.env-vars = [ { key = "N8N_VERSION", value = "1.118.1", if = ["1"] }, { key = "N8N_VERSION", value = "2.19.5", if = ["2"] }, ] + +[envs.lab] +dependencies = ["click", "httpx", "pyyaml", "rich"] + +[envs.lab.scripts] +start = "python -m tests.lab.traffic_generator 
start {args}" +generate = "python -m tests.lab.traffic_generator generate {args}" +stop = "python -m tests.lab.traffic_generator stop {args}" + +[envs.lab.env-vars] +N8N_IS_LAB = "true" diff --git a/n8n/tests/lab/README.md b/n8n/tests/lab/README.md new file mode 100644 index 0000000000000..493cde1b93056 --- /dev/null +++ b/n8n/tests/lab/README.md @@ -0,0 +1,91 @@ +# n8n integration lab + +A long-running n8n simulation that pushes real metrics to a Datadog org so you can iterate on dashboards, monitors, and customer reports against live data. + +It reuses the integration test environment (so you get queue mode, a worker, the full Datadog Agent) and layers on top: + +- five lab-only workflows with distinct shapes (fast, slow, always-fail, flaky, multi-step chain), and +- an async traffic generator that drives a configurable webhook + REST API mix and reloads its config on the fly. + +## Setup + +### Datadog credentials + +The lab uses a `.ddev.toml` in this directory (already committed) to point at an `n8nlab` ddev org. Add the matching entry to your global `~/.ddev/config.toml`: + +```toml +[orgs.n8nlab] +api_key = "" +site = "datadoghq.com" +``` + +Use any org name you like; just keep `org = "n8nlab"` in `tests/lab/.ddev.toml` aligned with what you put in your global config. + +### Traffic configuration + +`tests/lab/config.yaml` controls the traffic mix. 
Probabilities are independent draws per tick, and values above `1.0` mean "more than one call per tick on average": + +```yaml +webhook_probabilities: + /webhook/lab/fast: 0.9 # bulk traffic, fast histogram bucket + /webhook/lab/slow: 0.4 # populates higher histogram buckets + /webhook/lab/fail: 0.15 # populates workflow_failed + /webhook/lab/flaky: 0.5 # mixed success/failure + /webhook/lab/chain: 0.3 # 4 Set nodes -> 4x node.* events +api_probabilities: + /healthz: 1.0 + /healthz/readiness: 0.5 + /rest/login: 0.2 # 401s +tick_seconds: 1.0 +reload_interval: 5 +``` + +Edit this file while the lab is running and the generator will pick it up on the next `reload_interval` tick. + +## Usage + +### One-shot (recommended) + +```bash +./tests/lab/run_lab.sh # default env: py3.13-2 (n8n 2.19.5) +./tests/lab/run_lab.sh -e py3.13-1 # n8n 1.118.1 +``` + +The script brings up the env, imports & activates the lab workflows, restarts n8n so webhooks register, and starts the traffic generator. `Ctrl+C` triggers a `cleanup` trap that runs `lab:stop` to tear everything down. 
+ +### Individual hatch commands + +```bash +hatch run lab:start -e py3.13-2 # ddev env start + import lab workflows + restart +hatch run lab:generate # traffic loop (foreground; Ctrl+C to stop) +hatch run lab:stop -e py3.13-2 # ddev env stop +``` + +## What this exercises + +The lab is wired to populate every metric family the integration maps that does not require an SSO/embed flow: + +| Metric family | How the lab drives it | +| --- | --- | +| `n8n.workflow.started/.success/.failed.count` | every webhook hit goes through the EventBus | +| `n8n.workflow.execution.duration.seconds.*` (n8n 2.x) | the slow & chain workflows spread the histogram | +| `n8n.node.started/.finished.count` | the worker fires per-node events; the chain workflow yields 4× per call | +| `n8n.queue.job.enqueued/.dequeued/.completed/.failed.count` | queue mode is enabled in the test compose | +| `n8n.scaling.mode.queue.jobs.{active,waiting,completed,failed}` | main process tracks queue depth | +| `n8n.http.request.duration.seconds.*` | the API mix (`/healthz`, `/rest/login`) drives status code labels | +| `n8n.cache.hits/.misses/.updates.count` | cache traffic comes from n8n itself during execution | +| `n8n.last.activity` | refreshed on every API call | +| `n8n.production.executions`, `n8n.production.root.executions`, `n8n.manual.executions`, `n8n.users.total`, `n8n.enabled.users`, `n8n.workflows.total`, `n8n.credentials.total` | enabled in the test compose via `N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS` | + +What it does **not** exercise (these need extra infra and are documented in the README "Version-specific metrics" section): + +- `n8n.token.exchange.*` and `n8n.embed.login.*` — require an SSO IdP / embed integration. +- `n8n.audit.workflow.*` — fire on UI-driven activate/deactivate; not currently driven by the generator. Future iteration could call the n8n REST API to toggle workflow active state on a slow timer. + +## Stopping the lab + +`Ctrl+C` from `run_lab.sh` cleans up automatically. 
If you ran the hatch commands directly: + +```bash +hatch run lab:stop -e py3.13-2 +``` diff --git a/n8n/tests/lab/config.yaml b/n8n/tests/lab/config.yaml new file mode 100644 index 0000000000000..9e1e2e76706b2 --- /dev/null +++ b/n8n/tests/lab/config.yaml @@ -0,0 +1,25 @@ +# n8n lab traffic configuration. Edit this file while the lab is running and +# changes are picked up every `reload_interval` seconds. + +# Probability of hitting each webhook on every traffic tick. Independent draws — +# multiple endpoints can fire on the same tick. Values can exceed 1.0 to issue +# multiple invocations per tick (e.g. 2.5 = 2 calls + a 50% chance of a third). +webhook_probabilities: + /webhook/lab/fast: 0.9 # bulk of the workflow_started counter and HTTP histogram + /webhook/lab/slow: 0.4 # Wait node spreads execution-duration buckets + /webhook/lab/fail: 0.15 # populates workflow_failed and node_finished{status="failed"} + /webhook/lab/flaky: 0.5 # mixed success/failure, ~30% fail rate + /webhook/lab/chain: 0.3 # 4 Set nodes => 4x node.started/finished events per call + +# Probability of hitting each REST API endpoint per tick. Used to drive the +# http_request_duration_seconds histogram across status code labels. +api_probabilities: + /healthz: 1.0 + /healthz/readiness: 0.5 + /rest/login: 0.2 # 401s — useful for status_code label coverage + +# How long to sleep between traffic ticks (seconds). +tick_seconds: 1.0 + +# Reload this file every N seconds (live config). 
+reload_interval: 5 diff --git a/n8n/tests/lab/run_lab.sh b/n8n/tests/lab/run_lab.sh new file mode 100755 index 0000000000000..5e8d56e5ce525 --- /dev/null +++ b/n8n/tests/lab/run_lab.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -e + +ORIGINAL_DIR=$(pwd) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ENV="py3.13-2" + +while [[ $# -gt 0 ]]; do + case $1 in + -e|--env) + ENV="$2" + shift 2 + ;; + -h|--help) + echo "Usage: $0 [-e|--env ENV]" + echo "" + echo "Options:" + echo " -e, --env ENV ddev environment to use (default: py3.13-2)" + echo " -h, --help Show this help message" + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +cleanup() { + echo "" + echo "Cleaning up..." + cd "$SCRIPT_DIR" + hatch run lab:stop -e "$ENV" || true + cd "$ORIGINAL_DIR" + exit 0 +} + +# `lab:generate` runs through `hatch`, which traps SIGINT itself, so we +# install our own EXIT trap to make sure `lab:stop` always runs even on Ctrl+C. +trap cleanup EXIT + +cd "$SCRIPT_DIR" +hatch run lab:start -e "$ENV" + +echo "Starting traffic (Ctrl+C to stop)..." +hatch run lab:generate diff --git a/n8n/tests/lab/traffic_generator.py b/n8n/tests/lab/traffic_generator.py new file mode 100644 index 0000000000000..be73b6df6e58c --- /dev/null +++ b/n8n/tests/lab/traffic_generator.py @@ -0,0 +1,260 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +"""n8n lab traffic generator. + +Brings up the standard n8n test environment via ``ddev env start --base``, +imports a richer set of workflows than the integration tests use, activates +them, and then drives a continuous, configurable traffic mix against the +running container so a real Datadog Agent can ship the resulting metrics. 
+""" + +from __future__ import annotations + +import asyncio +import random +import signal +import subprocess +import sys +import time +from pathlib import Path +from typing import Any + +import click +import httpx +import yaml +from rich.console import Console +from rich.table import Table + +ConfigDict = dict[str, Any] +LAB_DIR = Path(__file__).resolve().parent +WORKFLOWS_DIR = LAB_DIR / "workflows" +CONFIG_PATH = LAB_DIR / "config.yaml" + +CONTAINER = "n8n-test" +MAIN_BASE_URL = "http://localhost:5678" + +# Stable IDs that match the workflow JSON files. Kept here to drive the +# import/activate/restart loop without re-parsing the JSON. +LAB_WORKFLOW_IDS: list[str] = [ + "labFastSuccess", + "labSlowSuccess", + "labAlwaysFail", + "labFlaky", + "labLongChain", +] + +shutdown_event = asyncio.Event() +current_config: ConfigDict = {} + + +def _load_config(path: Path) -> tuple[ConfigDict, str]: + try: + with open(path) as f: + data = yaml.safe_load(f) or {} + except FileNotFoundError: + return current_config, f"Config file {path} not found; using current values." + except yaml.YAMLError as exc: + return current_config, f"Failed to parse {path}: {exc}; using current values." + + if not isinstance(data, dict): + return current_config, f"{path} must be a mapping at the top level; using current values." 
+ + return data, "" + + +def _docker_exec(*cmd: str, check: bool = True) -> subprocess.CompletedProcess: + return subprocess.run( + ["docker", "exec", CONTAINER, *cmd], + check=check, + capture_output=True, + text=True, + ) + + +def _docker_cp(src: Path, dest: str) -> None: + subprocess.check_call(["docker", "cp", str(src), f"{CONTAINER}:{dest}"]) + + +def _wait_for_endpoint(url: str, *, timeout: int = 90) -> None: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + try: + if httpx.get(url, timeout=2).status_code == 200: + return + except httpx.RequestError: + pass + time.sleep(2) + raise RuntimeError(f"Endpoint {url} never became reachable") + + +def _import_lab_workflows(console: Console) -> None: + """Copy the lab workflow files into the running container, import & activate them.""" + console.print("[bold cyan]Copying lab workflows into the container...[/bold cyan]") + _docker_exec("mkdir", "-p", "/lab/workflows") + for path in sorted(WORKFLOWS_DIR.glob("*.json")): + _docker_cp(path, f"/lab/workflows/{path.name}") + + console.print("[bold cyan]Importing & activating lab workflows...[/bold cyan]") + for path in sorted(WORKFLOWS_DIR.glob("*.json")): + result = _docker_exec("n8n", "import:workflow", f"--input=/lab/workflows/{path.name}", check=False) + if result.returncode != 0: + console.print(f"[bold red]Failed to import {path.name}:[/bold red]\n{result.stdout}\n{result.stderr}") + sys.exit(1) + for wf_id in LAB_WORKFLOW_IDS: + _docker_exec("n8n", "update:workflow", f"--id={wf_id}", "--active=true") + + console.print("[bold cyan]Restarting n8n so webhooks register...[/bold cyan]") + subprocess.check_call( + ["docker", "compose", "-f", str(LAB_DIR.parent / "docker" / "docker-compose.yaml"), "restart", "n8n"] + ) + _wait_for_endpoint(f"{MAIN_BASE_URL}/healthz") + console.print("[bold green]Lab workflows are live.[/bold green]") + + +def _signal_handler(_sig, _frame) -> None: + shutdown_event.set() + + +def _print_row(console: Console, ts: 
str, scenario: str, target: str, status: str, latency_ms: str) -> None: + table = Table(show_header=False, box=None, show_edge=False) + table.add_column("Timestamp", style="dim", width=20) + table.add_column("Scenario", width=10) + table.add_column("Endpoint", width=28) + table.add_column("Status", justify="right", width=14) + table.add_column("Latency (ms)", justify="right", width=14) + table.add_row(ts, scenario, target, status, latency_ms) + console.print(table) + + +async def _hit(client: httpx.AsyncClient, console: Console, scenario: str, path: str) -> None: + url = f"{MAIN_BASE_URL}{path}" + ts = time.strftime("%H:%M:%S") + start = time.perf_counter() + try: + resp = await client.get(url, timeout=10.0) + latency_ms = f"{(time.perf_counter() - start) * 1000:.0f}" + style = "green" if 200 <= resp.status_code < 400 else "red" + _print_row(console, ts, scenario, path, f"[{style}]{resp.status_code}[/]", latency_ms) + except httpx.TimeoutException: + _print_row(console, ts, scenario, path, "[bold yellow]TIMEOUT[/]", "") + except httpx.RequestError as exc: + _print_row(console, ts, scenario, path, f"[bold red]ERR[/] {type(exc).__name__}", "") + + +def _draws(probability: float) -> int: + """Return the number of times an event should fire this tick. + + ``probability`` is interpreted as expected count: ``2.5`` => 2 firings + a + 50% chance of a third. Values <= 1 act like a single Bernoulli trial. 
+ """ + whole = int(probability) + fractional = probability - whole + extra = 1 if random.random() < fractional else 0 + return whole + extra + + +async def _config_reloader(path: Path, console: Console) -> None: + global current_config + while not shutdown_event.is_set(): + new_config, error = _load_config(path) + if error: + console.print(f"[bold yellow]{error}[/bold yellow]") + elif new_config != current_config: + current_config = new_config + console.print(f"[bold cyan]Reloaded config from {path}[/bold cyan]") + try: + await asyncio.wait_for(shutdown_event.wait(), timeout=float(current_config.get("reload_interval", 5))) + except asyncio.TimeoutError: + pass + + +async def _run_traffic(console: Console) -> None: + global current_config + current_config, error = _load_config(CONFIG_PATH) + if error: + console.print(f"[bold red]{error}[/bold red]") + sys.exit(1) + + console.print(f"[dim]Traffic config: {CONFIG_PATH}\nEdit it while the lab runs to change the mix.[/dim]\n") + + reloader = asyncio.create_task(_config_reloader(CONFIG_PATH, console)) + async with httpx.AsyncClient() as client: + try: + while not shutdown_event.is_set(): + tasks = [] + for path, probability in (current_config.get("webhook_probabilities") or {}).items(): + for _ in range(_draws(float(probability))): + tasks.append(_hit(client, console, "webhook", path)) + for path, probability in (current_config.get("api_probabilities") or {}).items(): + for _ in range(_draws(float(probability))): + tasks.append(_hit(client, console, "api", path)) + if tasks: + await asyncio.gather(*tasks, return_exceptions=True) + try: + await asyncio.wait_for( + shutdown_event.wait(), + timeout=float(current_config.get("tick_seconds", 1.0)), + ) + except asyncio.TimeoutError: + pass + finally: + reloader.cancel() + try: + await reloader + except asyncio.CancelledError: + pass + + +@click.group() +def cli() -> None: + """n8n traffic lab commands.""" + + +@cli.command() +@click.option("-e", "--env", default="py3.13-2", 
help="ddev env name to start (matches hatch matrix entry).") +def start(env: str) -> None: + """Bring up the n8n test environment + agent and import lab workflows on top.""" + console = Console() + console.print(f"[bold cyan]Starting environment {env} via ddev (this also starts the Agent)...[/bold cyan]") + rc = subprocess.call(["ddev", "env", "start", "n8n", "--base", env, "-e", "DD_LOGS_ENABLED=true"]) + if rc != 0: + console.print(f"[bold red]ddev env start failed (exit {rc})[/bold red]") + sys.exit(rc) + + _wait_for_endpoint(f"{MAIN_BASE_URL}/healthz") + _import_lab_workflows(console) + console.print( + "\n[bold green]Lab is up.[/bold green] " + "Run [bold]hatch run lab:generate[/bold] to start traffic, " + "[bold]hatch run lab:stop[/bold] to tear down." + ) + + +@cli.command() +def generate() -> None: + """Drive a continuous, configurable traffic mix against the running lab.""" + console = Console() + signal.signal(signal.SIGINT, _signal_handler) + signal.signal(signal.SIGTERM, _signal_handler) + try: + asyncio.run(_run_traffic(console)) + except KeyboardInterrupt: + console.print("\n[bold yellow]Traffic stopped.[/bold yellow]") + + +@cli.command() +@click.option("-e", "--env", default="py3.13-2", help="ddev env name to stop.") +def stop(env: str) -> None: + """Tear down the lab environment.""" + console = Console() + console.print(f"[bold cyan]Stopping environment {env}...[/bold cyan]") + rc = subprocess.call(["ddev", "env", "stop", "n8n", env]) + if rc != 0: + console.print(f"[bold red]ddev env stop failed (exit {rc})[/bold red]") + sys.exit(rc) + console.print("[bold green]Lab stopped.[/bold green]") + + +if __name__ == "__main__": + cli() diff --git a/n8n/tests/lab/workflows/lab_chain.json b/n8n/tests/lab/workflows/lab_chain.json new file mode 100644 index 0000000000000..27c8279d63dd1 --- /dev/null +++ b/n8n/tests/lab/workflows/lab_chain.json @@ -0,0 +1,82 @@ +{ + "id": "labLongChain", + "versionId": "10000000-0000-0000-0000-000000000005", + "name": "Lab 
Long Chain", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "lab/chain", + "responseMode": "lastNode", + "options": {} + }, + "id": "11111111-0000-0000-0000-000000000005", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "lab-chain-aaaa-bbbb-cccc-000000000005" + }, + { + "parameters": { + "assignments": { + "assignments": [{"id": "1", "name": "step", "value": "one", "type": "string"}] + }, + "options": {} + }, + "id": "22222222-0000-0000-0000-000000000005", + "name": "Step 1", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [460, 300] + }, + { + "parameters": { + "assignments": { + "assignments": [{"id": "2", "name": "step", "value": "two", "type": "string"}] + }, + "options": {} + }, + "id": "33333333-0000-0000-0000-000000000005", + "name": "Step 2", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [680, 300] + }, + { + "parameters": { + "assignments": { + "assignments": [{"id": "3", "name": "step", "value": "three", "type": "string"}] + }, + "options": {} + }, + "id": "44444444-0000-0000-0000-000000000005", + "name": "Step 3", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [900, 300] + }, + { + "parameters": { + "assignments": { + "assignments": [{"id": "4", "name": "scenario", "value": "chain", "type": "string"}] + }, + "options": {} + }, + "id": "55555555-0000-0000-0000-000000000005", + "name": "Step 4", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [1120, 300] + } + ], + "connections": { + "Webhook": {"main": [[{"node": "Step 1", "type": "main", "index": 0}]]}, + "Step 1": {"main": [[{"node": "Step 2", "type": "main", "index": 0}]]}, + "Step 2": {"main": [[{"node": "Step 3", "type": "main", "index": 0}]]}, + "Step 3": {"main": [[{"node": "Step 4", "type": "main", "index": 0}]]} + }, + "active": false, + "settings": {"executionOrder": "v1"}, + "pinData": {} +} diff --git 
a/n8n/tests/lab/workflows/lab_fail.json b/n8n/tests/lab/workflows/lab_fail.json new file mode 100644 index 0000000000000..327cb511ced65 --- /dev/null +++ b/n8n/tests/lab/workflows/lab_fail.json @@ -0,0 +1,38 @@ +{ + "id": "labAlwaysFail", + "versionId": "10000000-0000-0000-0000-000000000003", + "name": "Lab Always Fail", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "lab/fail", + "responseMode": "lastNode", + "options": {} + }, + "id": "11111111-0000-0000-0000-000000000003", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "lab-fail-aaaa-bbbb-cccc-000000000003" + }, + { + "parameters": { + "language": "javaScript", + "jsCode": "throw new Error('intentional lab failure');" + }, + "id": "22222222-0000-0000-0000-000000000003", + "name": "Code", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [460, 300] + } + ], + "connections": { + "Webhook": {"main": [[{"node": "Code", "type": "main", "index": 0}]]} + }, + "active": false, + "settings": {"executionOrder": "v1"}, + "pinData": {} +} diff --git a/n8n/tests/lab/workflows/lab_fast.json b/n8n/tests/lab/workflows/lab_fast.json new file mode 100644 index 0000000000000..ebada5057e67b --- /dev/null +++ b/n8n/tests/lab/workflows/lab_fast.json @@ -0,0 +1,42 @@ +{ + "id": "labFastSuccess", + "versionId": "10000000-0000-0000-0000-000000000001", + "name": "Lab Fast Success", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "lab/fast", + "responseMode": "lastNode", + "options": {} + }, + "id": "11111111-0000-0000-0000-000000000001", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "lab-fast-aaaa-bbbb-cccc-000000000001" + }, + { + "parameters": { + "assignments": { + "assignments": [ + {"id": "1", "name": "scenario", "value": "fast", "type": "string"} + ] + }, + "options": {} + }, + "id": "22222222-0000-0000-0000-000000000001", + "name": 
"Set", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [460, 300] + } + ], + "connections": { + "Webhook": {"main": [[{"node": "Set", "type": "main", "index": 0}]]} + }, + "active": false, + "settings": {"executionOrder": "v1"}, + "pinData": {} +} diff --git a/n8n/tests/lab/workflows/lab_flaky.json b/n8n/tests/lab/workflows/lab_flaky.json new file mode 100644 index 0000000000000..2485f7646ecd1 --- /dev/null +++ b/n8n/tests/lab/workflows/lab_flaky.json @@ -0,0 +1,38 @@ +{ + "id": "labFlaky", + "versionId": "10000000-0000-0000-0000-000000000004", + "name": "Lab Flaky", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "lab/flaky", + "responseMode": "lastNode", + "options": {} + }, + "id": "11111111-0000-0000-0000-000000000004", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "lab-flaky-aaaa-bbbb-cccc-000000000004" + }, + { + "parameters": { + "language": "javaScript", + "jsCode": "if (Math.random() < 0.3) { throw new Error('flaky lab failure'); } return [{json: {ok: true, scenario: 'flaky'}}];" + }, + "id": "22222222-0000-0000-0000-000000000004", + "name": "Code", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [460, 300] + } + ], + "connections": { + "Webhook": {"main": [[{"node": "Code", "type": "main", "index": 0}]]} + }, + "active": false, + "settings": {"executionOrder": "v1"}, + "pinData": {} +} diff --git a/n8n/tests/lab/workflows/lab_slow.json b/n8n/tests/lab/workflows/lab_slow.json new file mode 100644 index 0000000000000..b8adbbcd2d204 --- /dev/null +++ b/n8n/tests/lab/workflows/lab_slow.json @@ -0,0 +1,54 @@ +{ + "id": "labSlowSuccess", + "versionId": "10000000-0000-0000-0000-000000000002", + "name": "Lab Slow Success", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "lab/slow", + "responseMode": "lastNode", + "options": {} + }, + "id": "11111111-0000-0000-0000-000000000002", + "name": "Webhook", + "type": 
"n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "lab-slow-aaaa-bbbb-cccc-000000000002" + }, + { + "parameters": { + "amount": 500, + "unit": "ms" + }, + "id": "22222222-0000-0000-0000-000000000002", + "name": "Wait", + "type": "n8n-nodes-base.wait", + "typeVersion": 1, + "position": [460, 300] + }, + { + "parameters": { + "assignments": { + "assignments": [ + {"id": "1", "name": "scenario", "value": "slow", "type": "string"} + ] + }, + "options": {} + }, + "id": "33333333-0000-0000-0000-000000000002", + "name": "Set", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [680, 300] + } + ], + "connections": { + "Webhook": {"main": [[{"node": "Wait", "type": "main", "index": 0}]]}, + "Wait": {"main": [[{"node": "Set", "type": "main", "index": 0}]]} + }, + "active": false, + "settings": {"executionOrder": "v1"}, + "pinData": {} +} From 43e7fc80ecd1ebdb15dbdeab7cd8ff01d5c24d4f Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Fri, 8 May 2026 15:09:08 +0200 Subject: [PATCH 7/7] Add missing n8n event metric mappings --- n8n/README.md | 8 ++++---- n8n/changelog.d/23635.added | 12 +++++++++--- n8n/datadog_checks/n8n/metrics.py | 8 +++++++- n8n/metadata.csv | 6 ++++++ n8n/tests/common.py | 6 ++++++ n8n/tests/fixtures/n8n.txt | 23 +++++++++++++++++++++++ n8n/tests/fixtures/n8n_custom.txt | 23 +++++++++++++++++++++++ 7 files changed, 78 insertions(+), 8 deletions(-) diff --git a/n8n/README.md b/n8n/README.md index 7882427834165..6b59475f62fc0 100644 --- a/n8n/README.md +++ b/n8n/README.md @@ -6,9 +6,9 @@ This check monitors [n8n][1] through the Datadog Agent. Collect n8n metrics including: - Cache metrics: hit, miss, and update counts. -- Workflow metrics: started, success, failed counters; in n8n 2.x, an execution-duration histogram. +- Workflow metrics: started, success, failed counters, audit workflow lifecycle counters; in n8n 2.x, an execution-duration histogram. 
- Node metrics: per-node started and finished counters emitted by worker processes in queue mode. -- Queue metrics: queue depth, enqueued/dequeued/completed/failed counters, and scaling-mode worker gauges. +- Queue metrics: queue depth, enqueued/dequeued/completed/failed/stalled counters, and scaling-mode worker gauges. - HTTP metrics: request duration histograms tagged with status code. - Process and Node.js runtime metrics. @@ -61,11 +61,11 @@ instances: #### Event-driven counters -Some n8n counters are registered dynamically the first time the corresponding event fires. For example, `n8n.workflow.started.count`, `n8n.workflow.success.count`, `n8n.workflow.failed.count`, and the queue and node event counters do not appear until at least one workflow has been executed. This is expected behavior and is not a sign of a misconfigured integration. +Some n8n counters are registered dynamically the first time the corresponding event fires. For example, `n8n.workflow.started.count`, `n8n.workflow.success.count`, `n8n.workflow.failed.count`, audit workflow lifecycle counters, and the queue and node event counters do not appear until the corresponding workflow or queue event has occurred. This is expected behavior and is not a sign of a misconfigured integration. #### Queue mode and workers -In queue mode, n8n runs separate worker processes that execute jobs picked up from a Redis-backed queue. Each worker exposes its own `/metrics` endpoint and emits a different subset of metrics than the main process. Worker-observed metrics include `n8n.queue.job.dequeued.count`, `n8n.node.started.count`, `n8n.node.finished.count`, and `n8n.runner.task.requested.count`. Main-only metrics include `n8n.instance.role.leader` and the `n8n.scaling.mode.queue.jobs.*` family. +In queue mode, n8n runs separate worker processes that execute jobs picked up from a Redis-backed queue. Each worker exposes its own `/metrics` endpoint and emits a different subset of metrics than the main process. 
Worker-observed metrics include `n8n.queue.job.dequeued.count`, `n8n.queue.job.stalled.count`, `n8n.node.started.count`, `n8n.node.finished.count`, and `n8n.runner.task.requested.count`. Main-only metrics include `n8n.instance.role.leader` and the `n8n.scaling.mode.queue.jobs.*` family. To expose worker metrics, set `QUEUE_HEALTH_CHECK_ACTIVE=true` and `QUEUE_HEALTH_CHECK_PORT=` on each worker. **In n8n 2.x, port `5679` is reserved for the task runner broker, so pick a different port (for example `5680`).** diff --git a/n8n/changelog.d/23635.added b/n8n/changelog.d/23635.added index 8baf20ccd7533..73efef022e019 100644 --- a/n8n/changelog.d/23635.added +++ b/n8n/changelog.d/23635.added @@ -1,9 +1,15 @@ Update the n8n metric coverage and test harness, verified live against n8n 1.118.1 and 2.19.5: -- Map every n8n 2.x metric family verified live against n8n 2.19.5: ``workflow.execution.duration.seconds.*`` histogram, ``audit.workflow.activated``/``deactivated``/``executed``/``resumed``/``version.updated``/``waiting``, ``embed.login.requests``/``failures``, ``token.exchange.requests``/``failures``/``identity.linked``/``jit.provisioning``, ``process.pss.bytes``, ``runner.task.requested``, and the ``workflow_statistics`` family (``production.executions``, ``production.root.executions``, ``manual.executions``, ``users.total``, ``enabled.users``, ``workflows.total``, ``credentials.total``). +- Add missing common event-driven metrics: ``audit.workflow.archived``, ``audit.workflow.created``, ``audit.workflow.deleted``, ``audit.workflow.unarchived``, ``audit.workflow.updated``, and ``queue.job.stalled``. +- Add n8n 2.x workflow duration metrics: ``workflow.execution.duration.seconds.*``. +- Add n8n 2.x audit workflow metrics: ``audit.workflow.activated``, ``audit.workflow.deactivated``, ``audit.workflow.executed``, ``audit.workflow.resumed``, ``audit.workflow.version.updated``, and ``audit.workflow.waiting``. 
+- Add n8n 2.x embed login metrics: ``embed.login.requests`` and ``embed.login.failures``. +- Add n8n 2.x token exchange metrics: ``token.exchange.requests``, ``token.exchange.failures``, ``token.exchange.identity.linked``, and ``token.exchange.jit.provisioning``. +- Add n8n 2.x process memory metric: ``process.pss.bytes``. +- Add n8n 2.x workflow statistics metrics: ``production.executions``, ``production.root.executions``, ``manual.executions``, ``users.total``, ``enabled.users``, ``workflows.total``, and ``credentials.total``. - Restore valid metrics that the integration was previously dropping: ``queue.job.dequeued``, ``nodejs.active.requests``. -- Add worker-only families ``node.started``, ``node.finished``, ``queue.job.dequeued``, ``runner.task.requested`` and document scraping the n8n worker process as a separate Datadog instance. -- Remove the gating of OpenMetrics scraping on ``/healthz/readiness`` — ``n8n.readiness.check`` is still submitted, but metrics keep flowing when readiness reports degraded so SRE-relevant signals (queue depth, process state) are not lost during incidents. +- Add worker-only families ``node.started``, ``node.finished``, ``queue.job.dequeued``, and ``runner.task.requested`` and document scraping the n8n worker process as a separate Datadog instance. +- Remove the gating of OpenMetrics scraping on ``/healthz/readiness`` - ``n8n.readiness.check`` is still submitted, but metrics keep flowing when readiness reports degraded so SRE-relevant signals (queue depth, process state) are not lost during incidents. - Document version-specific metric availability and the n8n env flags that gate them (``N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS``, ``N8N_METRICS_INCLUDE_WORKFLOW_EXECUTION_DURATION``, ``N8N_METRICS_INCLUDE_QUEUE_METRICS``). - Use the actual ``/metrics`` URL in the ``openmetrics_endpoint`` example in ``conf.yaml.example``/``spec.yaml`` (was previously the host root, which silently mismatched the scrape path the check uses). 
- Document that ``raw_metric_prefix`` in ``conf.yaml`` must be kept in sync with a customised ``N8N_METRICS_PREFIX`` for the check to recognise the exposed metric names. diff --git a/n8n/datadog_checks/n8n/metrics.py b/n8n/datadog_checks/n8n/metrics.py index 4961712939f6b..7dd2dcbcf03a4 100644 --- a/n8n/datadog_checks/n8n/metrics.py +++ b/n8n/datadog_checks/n8n/metrics.py @@ -13,7 +13,7 @@ # `n8n...` becomes counter `___total`) and only appear once # the corresponding event fires at runtime. In queue mode, worker processes # emit `node_started_total`, `node_finished_total`, `queue_job_dequeued_total`, -# and `runner_task_requested_total`. +# `queue_job_stalled_total`, and `runner_task_requested_total`. # # Several families were introduced in n8n 2.x (see the README "Version-specific # metrics" section). The `workflow_statistics_*` and SSO/embed token-exchange @@ -22,9 +22,14 @@ METRIC_MAP = { 'active_workflow_count': 'active.workflow.count', 'audit_workflow_activated': 'audit.workflow.activated', # n8n 2.x+ + 'audit_workflow_archived': 'audit.workflow.archived', + 'audit_workflow_created': 'audit.workflow.created', 'audit_workflow_deactivated': 'audit.workflow.deactivated', # n8n 2.x+ + 'audit_workflow_deleted': 'audit.workflow.deleted', 'audit_workflow_executed': 'audit.workflow.executed', # n8n 2.x+ 'audit_workflow_resumed': 'audit.workflow.resumed', # n8n 2.x+ + 'audit_workflow_unarchived': 'audit.workflow.unarchived', + 'audit_workflow_updated': 'audit.workflow.updated', 'audit_workflow_version_updated': 'audit.workflow.version.updated', # n8n 2.x+ 'audit_workflow_waiting': 'audit.workflow.waiting', # n8n 2.x+ 'cache_hits': 'cache.hits', @@ -84,6 +89,7 @@ 'queue_job_dequeued': 'queue.job.dequeued', 'queue_job_enqueued': 'queue.job.enqueued', 'queue_job_failed': 'queue.job.failed', + 'queue_job_stalled': 'queue.job.stalled', 'runner_task_requested': 'runner.task.requested', 'scaling_mode_queue_jobs_active': 'scaling.mode.queue.jobs.active', 
'scaling_mode_queue_jobs_completed': 'scaling.mode.queue.jobs.completed', diff --git a/n8n/metadata.csv b/n8n/metadata.csv index 3cf493e4dba2b..fb85893d676a6 100644 --- a/n8n/metadata.csv +++ b/n8n/metadata.csv @@ -1,9 +1,14 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags n8n.active.workflow.count,gauge,,,,Total number of active workflows.,0,n8n,,, n8n.audit.workflow.activated.count,count,,,,Total number of audited workflow activations. Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.archived.count,count,,,,Total number of audited workflow archive events.,0,n8n,,, +n8n.audit.workflow.created.count,count,,,,Total number of audited workflow creations.,0,n8n,,, n8n.audit.workflow.deactivated.count,count,,,,Total number of audited workflow deactivations. Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.deleted.count,count,,,,Total number of audited workflow deletions.,0,n8n,,, n8n.audit.workflow.executed.count,count,,,,Total number of audited workflow executions. Available in n8n 2.x and later.,0,n8n,,, n8n.audit.workflow.resumed.count,count,,,,Total number of audited workflow resumptions. Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.unarchived.count,count,,,,Total number of audited workflow unarchive events.,0,n8n,,, +n8n.audit.workflow.updated.count,count,,,,Total number of audited workflow updates.,0,n8n,,, n8n.audit.workflow.version.updated.count,count,,,,Total number of audited workflow version updates. Available in n8n 2.x and later.,0,n8n,,, n8n.audit.workflow.waiting.count,count,,,,Total number of audited workflow executions entering a waiting state. 
Available in n8n 2.x and later.,0,n8n,,, n8n.cache.hits.count,count,,,,Total number of cache hits.,0,n8n,,, @@ -60,6 +65,7 @@ n8n.queue.job.completed.count,count,,,,Number of jobs completed successfully (n8 n8n.queue.job.dequeued.count,count,,,,Number of jobs dequeued by workers (n8n.queue.job.dequeued event). Emitted by worker processes in queue mode.,0,n8n,,, n8n.queue.job.enqueued.count,count,,,,Number of jobs added to the queue (n8n.queue.job.enqueued event).,0,n8n,,, n8n.queue.job.failed.count,count,,,,Number of jobs that have failed (n8n.queue.job.failed event).,0,n8n,,, +n8n.queue.job.stalled.count,count,,,,Number of jobs that stalled (n8n.queue.job.stalled event).,0,n8n,,, n8n.readiness.check,gauge,,,,Readiness check status (1 if ready with status code 200 otherwise 0) with status code tag.,0,n8n,,,status_code n8n.runner.task.requested.count,count,,,,Total number of runner tasks requested by worker processes.,0,n8n,,, n8n.scaling.mode.queue.jobs.active,gauge,,,,Current number of jobs being processed across all workers in scaling mode.,0,n8n,,, diff --git a/n8n/tests/common.py b/n8n/tests/common.py index 85dded9652b2d..9694a9e748326 100644 --- a/n8n/tests/common.py +++ b/n8n/tests/common.py @@ -57,11 +57,17 @@ # samples and exclude them from the symmetric metadata assertion. 
RARE_EVENT_METRIC_NAMES = frozenset( { + 'n8n.audit.workflow.archived.count', + 'n8n.audit.workflow.created.count', 'n8n.audit.workflow.deactivated.count', + 'n8n.audit.workflow.deleted.count', 'n8n.audit.workflow.resumed.count', + 'n8n.audit.workflow.unarchived.count', + 'n8n.audit.workflow.updated.count', 'n8n.audit.workflow.version.updated.count', 'n8n.audit.workflow.waiting.count', 'n8n.embed.login.failures.count', + 'n8n.queue.job.stalled.count', 'n8n.runner.task.requested.count', 'n8n.token.exchange.failures.count', # prom-client's per-type libuv request gauge: only has samples while a libuv request is in flight diff --git a/n8n/tests/fixtures/n8n.txt b/n8n/tests/fixtures/n8n.txt index 62b13a73a7dcd..bf9603a77f907 100644 --- a/n8n/tests/fixtures/n8n.txt +++ b/n8n/tests/fixtures/n8n.txt @@ -351,10 +351,22 @@ n8n_token_exchange_identity_linked_total 0 n8n_audit_workflow_activated_total{workflow_id="testWorkflowOk"} 1 n8n_audit_workflow_activated_total{workflow_id="testWorkflowFail"} 1 +# HELP n8n_audit_workflow_archived_total Total number of n8n.audit.workflow.archived events. +# TYPE n8n_audit_workflow_archived_total counter +n8n_audit_workflow_archived_total{workflow_id="testWorkflowOk"} 1 + +# HELP n8n_audit_workflow_created_total Total number of n8n.audit.workflow.created events. +# TYPE n8n_audit_workflow_created_total counter +n8n_audit_workflow_created_total{workflow_id="testWorkflowOk"} 1 + # HELP n8n_audit_workflow_deactivated_total Total number of n8n.audit.workflow.deactivated events. # TYPE n8n_audit_workflow_deactivated_total counter n8n_audit_workflow_deactivated_total{workflow_id="testWorkflowOk"} 1 +# HELP n8n_audit_workflow_deleted_total Total number of n8n.audit.workflow.deleted events. +# TYPE n8n_audit_workflow_deleted_total counter +n8n_audit_workflow_deleted_total{workflow_id="testWorkflowOk"} 1 + # HELP n8n_queue_job_enqueued_total Total number of n8n.queue.job.enqueued events. 
# TYPE n8n_queue_job_enqueued_total counter n8n_queue_job_enqueued_total 8 @@ -373,6 +385,14 @@ n8n_audit_workflow_executed_total{workflow_id="testWorkflowFail"} 4 # TYPE n8n_audit_workflow_resumed_total counter n8n_audit_workflow_resumed_total{workflow_id="testWorkflowOk"} 1 +# HELP n8n_audit_workflow_unarchived_total Total number of n8n.audit.workflow.unarchived events. +# TYPE n8n_audit_workflow_unarchived_total counter +n8n_audit_workflow_unarchived_total{workflow_id="testWorkflowOk"} 1 + +# HELP n8n_audit_workflow_updated_total Total number of n8n.audit.workflow.updated events. +# TYPE n8n_audit_workflow_updated_total counter +n8n_audit_workflow_updated_total{workflow_id="testWorkflowOk"} 1 + # HELP n8n_audit_workflow_version_updated_total Total number of n8n.audit.workflow.version.updated events. # TYPE n8n_audit_workflow_version_updated_total counter n8n_audit_workflow_version_updated_total{workflow_id="testWorkflowOk"} 1 @@ -396,6 +416,9 @@ n8n_workflow_failed_total{workflow_id="testWorkflowFail"} 4 # HELP n8n_queue_job_failed_total Total number of n8n.queue.job.failed events. # TYPE n8n_queue_job_failed_total counter n8n_queue_job_failed_total 4 +# HELP n8n_queue_job_stalled_total Total number of n8n.queue.job.stalled events. +# TYPE n8n_queue_job_stalled_total counter +n8n_queue_job_stalled_total 1 # HELP n8n_queue_job_dequeued_total Total number of n8n.queue.job.dequeued events. # TYPE n8n_queue_job_dequeued_total counter n8n_queue_job_dequeued_total 8 diff --git a/n8n/tests/fixtures/n8n_custom.txt b/n8n/tests/fixtures/n8n_custom.txt index 0442331ad78c2..26d3ee593f24c 100644 --- a/n8n/tests/fixtures/n8n_custom.txt +++ b/n8n/tests/fixtures/n8n_custom.txt @@ -351,10 +351,22 @@ test_token_exchange_identity_linked_total 0 test_audit_workflow_activated_total{workflow_id="testWorkflowOk"} 1 test_audit_workflow_activated_total{workflow_id="testWorkflowFail"} 1 +# HELP test_audit_workflow_archived_total Total number of n8n.audit.workflow.archived events. 
+# TYPE test_audit_workflow_archived_total counter +test_audit_workflow_archived_total{workflow_id="testWorkflowOk"} 1 + +# HELP test_audit_workflow_created_total Total number of n8n.audit.workflow.created events. +# TYPE test_audit_workflow_created_total counter +test_audit_workflow_created_total{workflow_id="testWorkflowOk"} 1 + # HELP test_audit_workflow_deactivated_total Total number of n8n.audit.workflow.deactivated events. # TYPE test_audit_workflow_deactivated_total counter test_audit_workflow_deactivated_total{workflow_id="testWorkflowOk"} 1 +# HELP test_audit_workflow_deleted_total Total number of n8n.audit.workflow.deleted events. +# TYPE test_audit_workflow_deleted_total counter +test_audit_workflow_deleted_total{workflow_id="testWorkflowOk"} 1 + # HELP test_queue_job_enqueued_total Total number of n8n.queue.job.enqueued events. # TYPE test_queue_job_enqueued_total counter test_queue_job_enqueued_total 8 @@ -373,6 +385,14 @@ test_audit_workflow_executed_total{workflow_id="testWorkflowFail"} 4 # TYPE test_audit_workflow_resumed_total counter test_audit_workflow_resumed_total{workflow_id="testWorkflowOk"} 1 +# HELP test_audit_workflow_unarchived_total Total number of n8n.audit.workflow.unarchived events. +# TYPE test_audit_workflow_unarchived_total counter +test_audit_workflow_unarchived_total{workflow_id="testWorkflowOk"} 1 + +# HELP test_audit_workflow_updated_total Total number of n8n.audit.workflow.updated events. +# TYPE test_audit_workflow_updated_total counter +test_audit_workflow_updated_total{workflow_id="testWorkflowOk"} 1 + # HELP test_audit_workflow_version_updated_total Total number of n8n.audit.workflow.version.updated events. # TYPE test_audit_workflow_version_updated_total counter test_audit_workflow_version_updated_total{workflow_id="testWorkflowOk"} 1 @@ -396,6 +416,9 @@ test_workflow_failed_total{workflow_id="testWorkflowFail"} 4 # HELP test_queue_job_failed_total Total number of n8n.queue.job.failed events. 
# TYPE test_queue_job_failed_total counter test_queue_job_failed_total 4 +# HELP test_queue_job_stalled_total Total number of n8n.queue.job.stalled events. +# TYPE test_queue_job_stalled_total counter +test_queue_job_stalled_total 1 # HELP test_queue_job_dequeued_total Total number of n8n.queue.job.dequeued events. # TYPE test_queue_job_dequeued_total counter test_queue_job_dequeued_total 8