diff --git a/n8n/README.md b/n8n/README.md index 18eb081e7890a..6b59475f62fc0 100644 --- a/n8n/README.md +++ b/n8n/README.md @@ -2,15 +2,15 @@ ## Overview -This check monitors [n8n][1] through the Datadog Agent. +This check monitors [n8n][1] through the Datadog Agent. It collects n8n metrics, including: -- Cache metrics: Hit and miss statistics. -- Message event bus metrics: Event-related metrics. -- Workflow metrics: Can include workflow ID labels. -- Node metrics: Can include node type labels. -- Credential metrics: Can include credential type labels. -- Queue metrics +- Cache metrics: hit, miss, and update counts. +- Workflow metrics: started, success, failed counters, audit workflow lifecycle counters; in n8n 2.x, an execution-duration histogram. +- Node metrics: per-node started and finished counters emitted by worker processes in queue mode. +- Queue metrics: queue depth, enqueued/dequeued/completed/failed/stalled counters, and scaling-mode worker gauges. +- HTTP metrics: request duration histograms tagged with status code. +- Process and Node.js runtime metrics. ## Setup @@ -40,6 +40,10 @@ N8N_METRICS_INCLUDE_CACHE_METRICS=true N8N_METRICS_INCLUDE_MESSAGE_EVENT_BUS_METRICS=true N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL=true N8N_METRICS_INCLUDE_API_ENDPOINTS=true +N8N_METRICS_INCLUDE_QUEUE_METRICS=true + +# Optional: n8n 2.x adds workflow_statistics gauges (workflows, users, executions, ...) - opt in +N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS=true # Optional: Customize the metric prefix (default is 'n8n_') N8N_METRICS_PREFIX=n8n_ @@ -47,6 +51,49 @@ N8N_METRICS_PREFIX=n8n_ For more details, see the n8n documentation on [enabling Prometheus metrics][10]. +If you change `N8N_METRICS_PREFIX` from its default of `n8n_`, you **must** also set `raw_metric_prefix` in the integration's `conf.yaml` to the same value. 
Otherwise the check will not recognize the exposed metric names and will silently submit nothing: + +```yaml +instances: + - openmetrics_endpoint: http://localhost:5678/metrics + raw_metric_prefix: my_custom_prefix_ +``` + +#### Event-driven counters + +Some n8n counters are registered dynamically the first time the corresponding event fires. For example, `n8n.workflow.started.count`, `n8n.workflow.success.count`, `n8n.workflow.failed.count`, audit workflow lifecycle counters, and the queue and node event counters do not appear until the corresponding workflow or queue event has occurred. This is expected behavior and is not a sign of a misconfigured integration. + +#### Queue mode and workers + +In queue mode, n8n runs separate worker processes that execute jobs picked up from a Redis-backed queue. Each worker exposes its own `/metrics` endpoint and emits a different subset of metrics than the main process. Worker-observed metrics include `n8n.queue.job.dequeued.count`, `n8n.queue.job.stalled.count`, `n8n.node.started.count`, `n8n.node.finished.count`, and `n8n.runner.task.requested.count`. Main-only metrics include `n8n.instance.role.leader` and the `n8n.scaling.mode.queue.jobs.*` family. + +To expose worker metrics, set `QUEUE_HEALTH_CHECK_ACTIVE=true` and `QUEUE_HEALTH_CHECK_PORT=<port>` on each worker. 
**In n8n 2.x, port `5679` is reserved for the task runner broker, so pick a different port (for example `5680`).** + +For full coverage in queue deployments, configure one Datadog instance per n8n process exposing `/metrics`, including main and worker processes: + +```yaml +instances: + - openmetrics_endpoint: http://n8n-main:5678/metrics + - openmetrics_endpoint: http://n8n-worker:5680/metrics +``` + +#### Version-specific metrics + +Several metric families were introduced in n8n 2.x and are not emitted on n8n 1.x: + +- `n8n.workflow.execution.duration.seconds.*` (histogram) +- `n8n.audit.workflow.activated.count`, `n8n.audit.workflow.deactivated.count`, `n8n.audit.workflow.executed.count`, `n8n.audit.workflow.resumed.count`, `n8n.audit.workflow.version.updated.count`, and `n8n.audit.workflow.waiting.count` +- `n8n.embed.login.requests.count` (tagged with `result:success`/`failure`), `n8n.embed.login.failures.count` (tagged with `reason`) +- `n8n.token.exchange.requests.count` (tagged with `result:success`/`failure`), `n8n.token.exchange.failures.count` (tagged with `reason`), `n8n.token.exchange.identity.linked.count`, `n8n.token.exchange.jit.provisioning.count` +- `n8n.process.pss.bytes` (Linux only) +- The `n8n.{production,manual,production.root}.executions`, `n8n.users.total`, `n8n.enabled.users`, `n8n.workflows.total`, and `n8n.credentials.total` family - only emitted when `N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS=true` is set. + +Some metrics only emit samples after the corresponding runtime event occurs. For example, failures-only counters (`*.failures.count`) need an authentication failure, audit workflow counters need the matching workflow state transition, and the libuv `n8n.nodejs.active.requests` gauge needs an in-flight libuv request. A healthy idle deployment may not produce data points for these metrics until that activity occurs. 
+ +#### Tag cardinality + +When `N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL=true`, HTTP and workflow execution histograms are tagged with `workflow_id` (and similar labels for nodes). On deployments with many distinct workflows or nodes, this can produce high-cardinality metrics. Drop the label via `exclude_labels` or omit `N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL` to keep tag cardinality bounded. 
+- Add n8n 2.x audit workflow metrics: ``audit.workflow.activated``, ``audit.workflow.deactivated``, ``audit.workflow.executed``, ``audit.workflow.resumed``, ``audit.workflow.version.updated``, and ``audit.workflow.waiting``. +- Add n8n 2.x embed login metrics: ``embed.login.requests`` and ``embed.login.failures``. +- Add n8n 2.x token exchange metrics: ``token.exchange.requests``, ``token.exchange.failures``, ``token.exchange.identity.linked``, and ``token.exchange.jit.provisioning``. +- Add n8n 2.x process memory metric: ``process.pss.bytes``. +- Add n8n 2.x workflow statistics metrics: ``production.executions``, ``production.root.executions``, ``manual.executions``, ``users.total``, ``enabled.users``, ``workflows.total``, and ``credentials.total``. +- Restore valid metrics that the integration was previously dropping: ``queue.job.dequeued``, ``nodejs.active.requests``. +- Add worker-only families ``node.started``, ``node.finished``, ``queue.job.dequeued``, and ``runner.task.requested`` and document scraping the n8n worker process as a separate Datadog instance. +- Remove the gating of OpenMetrics scraping on ``/healthz/readiness`` - ``n8n.readiness.check`` is still submitted, but metrics keep flowing when readiness reports degraded so SRE-relevant signals (queue depth, process state) are not lost during incidents. +- Document version-specific metric availability and the n8n env flags that gate them (``N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS``, ``N8N_METRICS_INCLUDE_WORKFLOW_EXECUTION_DURATION``, ``N8N_METRICS_INCLUDE_QUEUE_METRICS``). +- Use the actual ``/metrics`` URL in the ``openmetrics_endpoint`` example in ``conf.yaml.example``/``spec.yaml`` (was previously the host root, which silently mismatched the scrape path the check uses). +- Document that ``raw_metric_prefix`` in ``conf.yaml`` must be kept in sync with a customized ``N8N_METRICS_PREFIX`` for the check to recognize the exposed metric names. 
diff --git a/n8n/datadog_checks/n8n/check.py b/n8n/datadog_checks/n8n/check.py index 00c41569b83d5..012a70e20bc7a 100644 --- a/n8n/datadog_checks/n8n/check.py +++ b/n8n/datadog_checks/n8n/check.py @@ -2,58 +2,52 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse + +from requests.exceptions import RequestException from datadog_checks.base import OpenMetricsBaseCheckV2 from datadog_checks.n8n.metrics import METRIC_MAP, RENAME_LABELS_MAP from .config_models import ConfigMixin -DEFAULT_READY_ENDPOINT = '/healthz/readiness' +DEFAULT_READY_PATH = '/healthz/readiness' class N8nCheck(OpenMetricsBaseCheckV2, ConfigMixin): __NAMESPACE__ = 'n8n' DEFAULT_METRIC_LIMIT = 0 - def __init__(self, name, init_config, instances=None): - super(N8nCheck, self).__init__( - name, - init_config, - instances, - ) - self.openmetrics_endpoint = self.instance["openmetrics_endpoint"] - self.tags = self.instance.get('tags', []) - self._ready_endpoint = DEFAULT_READY_ENDPOINT - - def get_default_config(self): + def get_default_config(self) -> dict: return { 'metrics': [METRIC_MAP], 'rename_labels': RENAME_LABELS_MAP, 'raw_metric_prefix': 'n8n_', } - def _check_n8n_readiness(self): - endpoint = urljoin(self.openmetrics_endpoint, self._ready_endpoint) - response = self.http.get(endpoint) - - # Determine metric value and status_code tag - if response.status_code is None: - self.log.warning("The readiness endpoint did not return a status code") - metric_value = 0 - metric_tags = self.tags + ['status_code:null'] - elif response.status_code == 200: - # Ready - submit 1 - metric_value = 1 - metric_tags = self.tags + [f'status_code:{response.status_code}'] - else: - # Not ready - submit 0 - metric_value = 0 - metric_tags = self.tags + [f'status_code:{response.status_code}'] - - # Submit metric with appropriate value and status_code tag - self.gauge('readiness.check', metric_value, 
tags=metric_tags) - - def check(self, instance): - super().check(instance) + def _readiness_endpoint(self) -> str: + parsed = urlparse(self.config.openmetrics_endpoint) + base = f'{parsed.scheme}://{parsed.netloc}' + return urljoin(base, DEFAULT_READY_PATH) + + def _check_n8n_readiness(self) -> None: + endpoint = self._readiness_endpoint() + tags = list(self.config.tags or ()) + + try: + response = self.http.get(endpoint) + except RequestException as e: + self.log.warning("Could not reach n8n readiness endpoint %s: %s", endpoint, e) + self.gauge('readiness.check', 0, tags=tags + ['status_code:none']) + return + + is_ready = response.status_code == 200 + self.gauge( + 'readiness.check', + 1 if is_ready else 0, + tags=tags + [f'status_code:{response.status_code}'], + ) + + def check(self, instance: dict) -> None: self._check_n8n_readiness() + super().check(instance) diff --git a/n8n/datadog_checks/n8n/data/conf.yaml.example b/n8n/datadog_checks/n8n/data/conf.yaml.example index e80f23c8c08c1..5f96c4acb66fe 100644 --- a/n8n/datadog_checks/n8n/data/conf.yaml.example +++ b/n8n/datadog_checks/n8n/data/conf.yaml.example @@ -18,7 +18,7 @@ instances: ## https://docs.n8n.io/hosting/logging-monitoring/monitoring/ ## https://docs.n8n.io/hosting/configuration/environment-variables/endpoints/ # - - openmetrics_endpoint: http://localhost:5678 + - openmetrics_endpoint: http://localhost:5678/metrics ## @param raw_metric_prefix - string - optional - default: n8n_ ## The prefix prepended to all metrics from n8n. diff --git a/n8n/datadog_checks/n8n/metrics.py b/n8n/datadog_checks/n8n/metrics.py index 5e29ba629340c..7dd2dcbcf03a4 100644 --- a/n8n/datadog_checks/n8n/metrics.py +++ b/n8n/datadog_checks/n8n/metrics.py @@ -2,36 +2,58 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) -# Metrics mapping without prefix - use raw_metric_prefix config to strip prefixes like 'n8n_', 'n8n_my_team_', etc. 
-# Namespace will be applied by the check -# Note: OpenMetrics automatically appends .count to counter metrics, so don't add it here +# Metrics emitted by n8n's /metrics endpoint, verified live against n8n@1.118.1 +# and n8n@2.19.5. +# +# The OpenMetrics base check strips `_total` from counter names before lookup +# and appends `.count` on submission, so counter keys here are written without +# the `_total` suffix (e.g. `cache_hits_total` -> key `cache_hits`). +# +# Many counters are dynamically registered from EventBus events (event +# `n8n...` becomes counter `___total`) and only appear once +# the corresponding event fires at runtime. In queue mode, worker processes +# emit `node_started_total`, `node_finished_total`, `queue_job_dequeued_total`, +# `queue_job_stalled_total`, and `runner_task_requested_total`. +# +# Several families were introduced in n8n 2.x (see the README "Version-specific +# metrics" section). The `workflow_statistics_*` and SSO/embed token-exchange +# families require additional flags (`N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS`, +# token-exchange counters always register but only emit on auth events). 
METRIC_MAP = { 'active_workflow_count': 'active.workflow.count', - 'api_request_duration_seconds': 'api.request.duration.seconds', - 'api_requests': 'api.requests', - 'cache_errors': 'cache.errors', + 'audit_workflow_activated': 'audit.workflow.activated', # n8n 2.x+ + 'audit_workflow_archived': 'audit.workflow.archived', + 'audit_workflow_created': 'audit.workflow.created', + 'audit_workflow_deactivated': 'audit.workflow.deactivated', # n8n 2.x+ + 'audit_workflow_deleted': 'audit.workflow.deleted', + 'audit_workflow_executed': 'audit.workflow.executed', # n8n 2.x+ + 'audit_workflow_resumed': 'audit.workflow.resumed', # n8n 2.x+ + 'audit_workflow_unarchived': 'audit.workflow.unarchived', + 'audit_workflow_updated': 'audit.workflow.updated', + 'audit_workflow_version_updated': 'audit.workflow.version.updated', # n8n 2.x+ + 'audit_workflow_waiting': 'audit.workflow.waiting', # n8n 2.x+ 'cache_hits': 'cache.hits', - 'cache_latency_seconds': 'cache.latency.seconds', 'cache_misses': 'cache.misses', - 'cache_operations': 'cache.operations', - 'eventbus_connections_total': 'eventbus.connections.total', - 'eventbus_events_failed': 'eventbus.events.failed', - 'eventbus_events_processed': 'eventbus.events.processed', - 'eventbus_events': 'eventbus.events', - 'eventbus_queue_size': 'eventbus.queue.size', + 'cache_updates': 'cache.updates', + 'credentials': 'credentials.total', # n8n 2.x+, requires N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS + 'embed_login_failures': 'embed.login.failures', # n8n 2.x+ + 'embed_login_requests': 'embed.login.requests', # n8n 2.x+ + 'enabled_users': 'enabled.users', # n8n 2.x+, requires N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS 'http_request_duration_seconds': 'http.request.duration.seconds', 'instance_role_leader': 'instance.role.leader', 'last_activity': { 'name': 'last.activity', 'type': 'time_elapsed', }, + 'manual_executions': 'manual.executions', # n8n 2.x+, requires N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS + 'node_finished': 'node.finished', + 
'node_started': 'node.started', 'nodejs_active_handles': 'nodejs.active.handles', 'nodejs_active_handles_total': 'nodejs.active.handles.total', 'nodejs_active_requests': 'nodejs.active.requests', 'nodejs_active_requests_total': 'nodejs.active.requests.total', 'nodejs_active_resources': 'nodejs.active.resources', 'nodejs_active_resources_total': 'nodejs.active.resources.total', - 'nodejs_event_loop_lag_seconds': 'nodejs.event.loop.lag.seconds', 'nodejs_eventloop_lag_max_seconds': 'nodejs.eventloop.lag.max.seconds', 'nodejs_eventloop_lag_mean_seconds': 'nodejs.eventloop.lag.mean.seconds', 'nodejs_eventloop_lag_min_seconds': 'nodejs.eventloop.lag.min.seconds', @@ -47,47 +69,45 @@ 'nodejs_heap_space_size_available_bytes': 'nodejs.heap.space.size.available.bytes', 'nodejs_heap_space_size_total_bytes': 'nodejs.heap.space.size.total.bytes', 'nodejs_heap_space_size_used_bytes': 'nodejs.heap.space.size.used.bytes', - 'nodejs_heap_total_bytes': 'nodejs.heap.total.bytes', - 'nodejs_heap_used_bytes': 'nodejs.heap.used.bytes', + 'nodejs_version_info': {'type': 'metadata', 'label': 'version', 'name': 'nodejs.version'}, + 'process_cpu_seconds': 'process.cpu.seconds', 'process_cpu_system_seconds': 'process.cpu.system.seconds', 'process_cpu_user_seconds': 'process.cpu.user.seconds', 'process_heap_bytes': 'process.heap.bytes', 'process_max_fds': 'process.max.fds', 'process_open_fds': 'process.open.fds', + 'process_pss_bytes': 'process.pss.bytes', # n8n 2.x+ 'process_resident_memory_bytes': 'process.resident.memory.bytes', 'process_start_time_seconds': { 'name': 'process.uptime.seconds', 'type': 'time_elapsed', }, 'process_virtual_memory_bytes': 'process.virtual.memory.bytes', - 'queue_job_active_total': 'queue.job.active.total', - 'queue_job_attempts': 'queue.job.attempts', + 'production_executions': 'production.executions', # n8n 2.x+, requires N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS + 'production_root_executions': 'production.root.executions', # n8n 2.x+, requires flag 
'queue_job_completed': 'queue.job.completed', - 'queue_job_delayed_total': 'queue.job.delayed.total', 'queue_job_dequeued': 'queue.job.dequeued', 'queue_job_enqueued': 'queue.job.enqueued', 'queue_job_failed': 'queue.job.failed', - 'queue_job_waiting_duration_seconds': 'queue.job.waiting.duration.seconds', - 'queue_job_waiting_total': 'queue.job.waiting.total', - 'queue_jobs_duration_seconds': 'queue.jobs.duration.seconds', - 'queue_jobs': 'queue.jobs', - 'workflow_executions_active': 'workflow.executions.active', - 'workflow_executions_duration_seconds': 'workflow.executions.duration.seconds', - 'workflow_executions': 'workflow.executions', + 'queue_job_stalled': 'queue.job.stalled', + 'runner_task_requested': 'runner.task.requested', + 'scaling_mode_queue_jobs_active': 'scaling.mode.queue.jobs.active', + 'scaling_mode_queue_jobs_completed': 'scaling.mode.queue.jobs.completed', + 'scaling_mode_queue_jobs_failed': 'scaling.mode.queue.jobs.failed', + 'scaling_mode_queue_jobs_waiting': 'scaling.mode.queue.jobs.waiting', + 'token_exchange_failures': 'token.exchange.failures', # n8n 2.x+ + 'token_exchange_identity_linked': 'token.exchange.identity.linked', # n8n 2.x+ + 'token_exchange_jit_provisioning': 'token.exchange.jit.provisioning', # n8n 2.x+ + 'token_exchange_requests': 'token.exchange.requests', # n8n 2.x+ + 'users': 'users.total', # n8n 2.x+, requires N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS + 'version_info': {'type': 'metadata', 'label': 'version', 'name': 'version'}, + 'workflow_execution_duration_seconds': 'workflow.execution.duration.seconds', # n8n 2.x+ 'workflow_failed': 'workflow.failed', 'workflow_started': 'workflow.started', 'workflow_success': 'workflow.success', - 'process_cpu_seconds': 'process.cpu.seconds', - 'version_info': 'version.info', - 'nodejs_version_info': 'nodejs.version.info', + 'workflows': 'workflows.total', # n8n 2.x+, requires N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS } -N8N_VERSION = {'version_info': {'type': 'metadata', 'label': 
'version', 'name': 'version'}} -NODEJS_VERSION = {'nodejs_version_info': {'type': 'metadata', 'label': 'version', 'name': 'nodejs.version'}} - -METRIC_MAP.update(N8N_VERSION) -METRIC_MAP.update(NODEJS_VERSION) - RENAME_LABELS_MAP = { 'name': 'n8n_name', 'namespace': 'n8n_namespace', diff --git a/n8n/hatch.toml b/n8n/hatch.toml index 15f12fe355887..b1232ac43fbf2 100644 --- a/n8n/hatch.toml +++ b/n8n/hatch.toml @@ -3,9 +3,21 @@ [[envs.default.matrix]] python = ["3.13"] -version = ["1.118.1"] +version = ["1", "2"] [envs.default.overrides] matrix.version.env-vars = [ - { key = "N8N_VERSION", value = "1.118.1", if = ["1.118.1"] }, -] \ No newline at end of file + { key = "N8N_VERSION", value = "1.118.1", if = ["1"] }, + { key = "N8N_VERSION", value = "2.19.5", if = ["2"] }, +] + +[envs.lab] +dependencies = ["click", "httpx", "pyyaml", "rich"] + +[envs.lab.scripts] +start = "python -m tests.lab.traffic_generator start {args}" +generate = "python -m tests.lab.traffic_generator generate {args}" +stop = "python -m tests.lab.traffic_generator stop {args}" + +[envs.lab.env-vars] +N8N_IS_LAB = "true" diff --git a/n8n/metadata.csv b/n8n/metadata.csv index 29f8c23c7483e..fb85893d676a6 100644 --- a/n8n/metadata.csv +++ b/n8n/metadata.csv @@ -1,32 +1,37 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags n8n.active.workflow.count,gauge,,,,Total number of active workflows.,0,n8n,,, -n8n.api.request.duration.seconds.bucket,count,,,,Histogram bucket for API request duration in seconds,0,n8n,,, -n8n.api.request.duration.seconds.count,count,,,,The count of API request duration in seconds,0,n8n,,, -n8n.api.request.duration.seconds.sum,count,,,,The sum of API request duration in seconds,0,n8n,,, -n8n.api.requests.count,count,,,,Total API requests,0,n8n,,, -n8n.cache.errors.count,count,,,,Cache errors,0,n8n,,, -n8n.cache.hits.count,count,,,,Cache hits,0,n8n,,, 
-n8n.cache.latency.seconds.bucket,count,,,,Histogram bucket for cache operation latency in seconds,0,n8n,,, -n8n.cache.latency.seconds.count,count,,,,The count of cache operation latency in seconds,0,n8n,,, -n8n.cache.latency.seconds.sum,count,,,,The sum of cache operation latency in seconds,0,n8n,,, -n8n.cache.misses.count,count,,,,Cache misses,0,n8n,,, -n8n.cache.operations.count,count,,,,Total cache operations,0,n8n,,, -n8n.eventbus.connections.total,gauge,,,,Active event bus backend connections,0,n8n,,, -n8n.eventbus.events.count,count,,,,Total events published on the event bus,0,n8n,,, -n8n.eventbus.events.failed.count,count,,,,Total failed event processing,0,n8n,,, -n8n.eventbus.events.processed.count,count,,,,Total processed events,0,n8n,,, -n8n.eventbus.queue.size,gauge,,,,Current event queue size,0,n8n,,, -n8n.http.request.duration.seconds.count,count,,,,The count of the http responses duration labeled with: status_code,0,n8n,,, -n8n.http.request.duration.seconds.sum,count,,,,The sum of the http responses duration labeled with: status_code,0,n8n,,, +n8n.audit.workflow.activated.count,count,,,,Total number of audited workflow activations. Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.archived.count,count,,,,Total number of audited workflow archive events.,0,n8n,,, +n8n.audit.workflow.created.count,count,,,,Total number of audited workflow creations.,0,n8n,,, +n8n.audit.workflow.deactivated.count,count,,,,Total number of audited workflow deactivations. Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.deleted.count,count,,,,Total number of audited workflow deletions.,0,n8n,,, +n8n.audit.workflow.executed.count,count,,,,Total number of audited workflow executions. Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.resumed.count,count,,,,Total number of audited workflow resumptions. 
Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.unarchived.count,count,,,,Total number of audited workflow unarchive events.,0,n8n,,, +n8n.audit.workflow.updated.count,count,,,,Total number of audited workflow updates.,0,n8n,,, +n8n.audit.workflow.version.updated.count,count,,,,Total number of audited workflow version updates. Available in n8n 2.x and later.,0,n8n,,, +n8n.audit.workflow.waiting.count,count,,,,Total number of audited workflow executions entering a waiting state. Available in n8n 2.x and later.,0,n8n,,, +n8n.cache.hits.count,count,,,,Total number of cache hits.,0,n8n,,, +n8n.cache.misses.count,count,,,,Total number of cache misses.,0,n8n,,, +n8n.cache.updates.count,count,,,,Total number of cache updates.,0,n8n,,, +n8n.credentials.total,gauge,,,,Total number of credentials. Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, +n8n.embed.login.failures.count,count,,,,Total number of embed login failures broken down by reason. Available in n8n 2.x and later. Only emits samples after the first failure.,0,n8n,,, +n8n.embed.login.requests.count,count,,,,Total number of embed login requests (tagged with `result:success`/`result:failure`). Available in n8n 2.x and later.,0,n8n,,, +n8n.enabled.users,gauge,,,,Total number of enabled users. 
Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, +n8n.http.request.duration.seconds.bucket,count,,,,Histogram bucket for HTTP request duration in seconds labeled with status_code.,0,n8n,,, +n8n.http.request.duration.seconds.count,count,,,,The count of HTTP request duration samples.,0,n8n,,, +n8n.http.request.duration.seconds.sum,count,,,,The sum of HTTP request duration in seconds.,0,n8n,,, n8n.instance.role.leader,gauge,,,,Whether this main instance is the leader (1) or not (0).,0,n8n,,, n8n.last.activity,gauge,,second,,Time elapsed since the last instance activity (backend request).,0,n8n,,, +n8n.manual.executions,gauge,,,,Total number of manual workflow executions. Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, +n8n.node.finished.count,count,,,,Total number of node executions that finished. Emitted by worker processes in queue mode (n8n.node.finished event).,0,n8n,,, +n8n.node.started.count,count,,,,Total number of node executions that started. Emitted by worker processes in queue mode (n8n.node.started event).,0,n8n,,, n8n.nodejs.active.handles,gauge,,,,Number of active libuv handles grouped by handle type. Every handle type is C++ class name.,0,n8n,,, n8n.nodejs.active.handles.total,gauge,,,,Total number of active handles.,0,n8n,,, -n8n.nodejs.active.requests,gauge,,,,Number of active libuv requests grouped by request type. Every request type is C++ class name.,0,n8n,,, +n8n.nodejs.active.requests,gauge,,,,Number of active libuv requests grouped by request type. 
Only emits samples for request types currently in flight at scrape time (from prom-client's default collector via process._getActiveRequests).,0,n8n,,, n8n.nodejs.active.requests.total,gauge,,,,Total number of active requests.,0,n8n,,, -n8n.nodejs.active.resources,gauge,,,,"Number of active resources that are currently keeping the event loop alive, grouped by async resource type.",0,n8n,,, +n8n.nodejs.active.resources,gauge,,,,Number of active resources keeping the event loop alive grouped by async resource type.,0,n8n,,, n8n.nodejs.active.resources.total,gauge,,,,Total number of active resources.,0,n8n,,, -n8n.nodejs.event.loop.lag.seconds,gauge,,,,Event loop lag in seconds,0,n8n,,, n8n.nodejs.eventloop.lag.max.seconds,gauge,,,,The maximum recorded event loop delay.,0,n8n,,, n8n.nodejs.eventloop.lag.mean.seconds,gauge,,,,The mean of the recorded event loop delays.,0,n8n,,, n8n.nodejs.eventloop.lag.min.seconds,gauge,,,,The minimum recorded event loop delay.,0,n8n,,, @@ -36,47 +41,46 @@ n8n.nodejs.eventloop.lag.p99.seconds,gauge,,,,The 99th percentile of the recorde n8n.nodejs.eventloop.lag.seconds,gauge,,,,Lag of event loop in seconds.,0,n8n,,, n8n.nodejs.eventloop.lag.stddev.seconds,gauge,,,,The standard deviation of the recorded event loop delays.,0,n8n,,, n8n.nodejs.external.memory.bytes,gauge,,,,Node.js external memory size in bytes.,0,n8n,,, -n8n.nodejs.gc.duration.seconds.bucket,count,,,,Histogram bucket for garbage collection duration by kind,0,n8n,,, -n8n.nodejs.gc.duration.seconds.count,count,,,,The count of garbage collection duration by kind,0,n8n,,, -n8n.nodejs.gc.duration.seconds.sum,count,,,,The sum of garbage collection duration by kind,0,n8n,,, +n8n.nodejs.gc.duration.seconds.bucket,count,,,,Histogram bucket for garbage collection duration by kind.,0,n8n,,, +n8n.nodejs.gc.duration.seconds.count,count,,,,The count of garbage collection duration samples.,0,n8n,,, +n8n.nodejs.gc.duration.seconds.sum,count,,,,The sum of garbage collection duration in 
seconds.,0,n8n,,, n8n.nodejs.heap.size.total.bytes,gauge,,,,Process heap size from Node.js in bytes.,0,n8n,,, n8n.nodejs.heap.size.used.bytes,gauge,,,,Process heap size used from Node.js in bytes.,0,n8n,,, n8n.nodejs.heap.space.size.available.bytes,gauge,,,,Process heap space size available from Node.js in bytes.,0,n8n,,, n8n.nodejs.heap.space.size.total.bytes,gauge,,,,Process heap space size total from Node.js in bytes.,0,n8n,,, n8n.nodejs.heap.space.size.used.bytes,gauge,,,,Process heap space size used from Node.js in bytes.,0,n8n,,, -n8n.nodejs.heap.total.bytes,gauge,,,,Total heap size allocated in bytes,0,n8n,,, -n8n.nodejs.heap.used.bytes,gauge,,,,Heap memory used in bytes,0,n8n,,, n8n.process.cpu.seconds.count,count,,,,Total user and system CPU time spent in seconds.,0,n8n,,, n8n.process.cpu.system.seconds.count,count,,,,Total system CPU time spent in seconds.,0,n8n,,, n8n.process.cpu.user.seconds.count,count,,,,Total user CPU time spent in seconds.,0,n8n,,, n8n.process.heap.bytes,gauge,,,,Process heap size in bytes.,0,n8n,,, n8n.process.max.fds,gauge,,,,Maximum number of open file descriptors.,0,n8n,,, n8n.process.open.fds,gauge,,,,Number of open file descriptors.,0,n8n,,, +n8n.process.pss.bytes,gauge,,,,Proportional set size of the process in bytes. 
Available in n8n 2.x and later on Linux.,0,n8n,,, n8n.process.resident.memory.bytes,gauge,,,,Resident memory size in bytes.,0,n8n,,, -n8n.process.start.time.seconds,gauge,,,,Start time of the process since unix epoch in seconds.,0,n8n,,, -n8n.process.uptime.seconds,gauge,,,,Process uptime in seconds.,0,n8n,,, +n8n.process.uptime.seconds,gauge,,second,,Process uptime in seconds.,0,n8n,,, n8n.process.virtual.memory.bytes,gauge,,,,Virtual memory size in bytes.,0,n8n,,, -n8n.queue.job.active.total,gauge,,,,Number of jobs currently being processed,0,n8n,,, -n8n.queue.job.attempts.count,count,,,,Total number of job attempts,0,n8n,,, -n8n.queue.job.completed.count,count,,,,Number of jobs completed successfully,0,n8n,,, -n8n.queue.job.delayed.total,gauge,,,,Number of jobs scheduled to run later,0,n8n,,, -n8n.queue.job.dequeued.count,count,,,,Number of jobs dequeued (picked up from queue),0,n8n,,, -n8n.queue.job.enqueued.count,count,,,,Number of jobs added to the queue,0,n8n,,, -n8n.queue.job.failed.count,count,,,,Number of jobs that have failed,0,n8n,,, -n8n.queue.job.waiting.duration.seconds.bucket,count,,,,Histogram bucket for duration jobs spend waiting before being processed,0,n8n,,, -n8n.queue.job.waiting.duration.seconds.count,count,,,,The count of duration jobs spend waiting before being processed,0,n8n,,, -n8n.queue.job.waiting.duration.seconds.sum,count,,,,The sum of duration jobs spend waiting before being processed,0,n8n,,, -n8n.queue.job.waiting.total,gauge,,,,Number of jobs currently waiting in the queue,0,n8n,,, -n8n.queue.jobs.count,count,,,,Total number of queue jobs,0,n8n,,, -n8n.queue.jobs.duration.seconds.bucket,count,,,,Histogram bucket for job duration in seconds,0,n8n,,, -n8n.queue.jobs.duration.seconds.count,count,,,,The count of job duration in seconds,0,n8n,,, -n8n.queue.jobs.duration.seconds.sum,count,,,,The sum of job duration in seconds,0,n8n,,, -n8n.readiness.check,gauge,,,,Readiness check status (1 if ready with status code 200 otherwise 0) 
with status code tag,0,n8n,,,status_code -n8n.workflow.executions.active,gauge,,,,Number of active workflow executions,0,n8n,,, -n8n.workflow.executions.count,count,,,,Total number of workflow executions,0,n8n,,, -n8n.workflow.executions.duration.seconds.bucket,count,,,,Histogram bucket for workflow execution duration in seconds,0,n8n,,, -n8n.workflow.executions.duration.seconds.count,count,,,,The count of workflow execution duration in seconds,0,n8n,,, -n8n.workflow.executions.duration.seconds.sum,count,,,,The sum of workflow execution duration in seconds,0,n8n,,, -n8n.workflow.failed.count,count,,,,Total number of workflows that failed,0,n8n,,, -n8n.workflow.started.count,count,,,,Total number of workflows started,0,n8n,,, -n8n.workflow.success.count,count,,,,Total number of workflows completed successfully,0,n8n,,, +n8n.production.executions,gauge,,,,Total number of production workflow executions. Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, +n8n.production.root.executions,gauge,,,,Total number of production root workflow executions (excludes sub-workflows). Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, +n8n.queue.job.completed.count,count,,,,Number of jobs completed successfully (n8n.queue.job.completed event).,0,n8n,,, +n8n.queue.job.dequeued.count,count,,,,Number of jobs dequeued by workers (n8n.queue.job.dequeued event). 
Emitted by worker processes in queue mode.,0,n8n,,, +n8n.queue.job.enqueued.count,count,,,,Number of jobs added to the queue (n8n.queue.job.enqueued event).,0,n8n,,, +n8n.queue.job.failed.count,count,,,,Number of jobs that have failed (n8n.queue.job.failed event).,0,n8n,,, +n8n.queue.job.stalled.count,count,,,,Number of jobs that stalled (n8n.queue.job.stalled event).,0,n8n,,, +n8n.readiness.check,gauge,,,,Readiness check status (1 if ready with status code 200 otherwise 0) with status code tag.,0,n8n,,,status_code +n8n.runner.task.requested.count,count,,,,Total number of runner tasks requested by worker processes.,0,n8n,,, +n8n.scaling.mode.queue.jobs.active,gauge,,,,Current number of jobs being processed across all workers in scaling mode.,0,n8n,,, +n8n.scaling.mode.queue.jobs.completed.count,count,,,,Total number of jobs completed across all workers in scaling mode since instance start.,0,n8n,,, +n8n.scaling.mode.queue.jobs.failed.count,count,,,,Total number of jobs failed across all workers in scaling mode since instance start.,0,n8n,,, +n8n.scaling.mode.queue.jobs.waiting,gauge,,,,Current number of enqueued jobs waiting for pickup in scaling mode.,0,n8n,,, +n8n.token.exchange.failures.count,count,,,,Total number of token exchange failures broken down by reason. Available in n8n 2.x and later. Only emits samples after the first failure.,0,n8n,,, +n8n.token.exchange.identity.linked.count,count,,,,Total number of identities linked to existing users via token exchange. Available in n8n 2.x and later.,0,n8n,,, +n8n.token.exchange.jit.provisioning.count,count,,,,Total number of users JIT-provisioned via token exchange. Available in n8n 2.x and later.,0,n8n,,, +n8n.token.exchange.requests.count,count,,,,Total number of token exchange requests. Available in n8n 2.x and later.,0,n8n,,, +n8n.users.total,gauge,,,,Total number of users. 
Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, +n8n.workflow.execution.duration.seconds.bucket,count,,,,Histogram bucket for workflow execution duration in seconds. Available in n8n 2.x and later.,0,n8n,,, +n8n.workflow.execution.duration.seconds.count,count,,,,The count of workflow execution duration samples. Available in n8n 2.x and later.,0,n8n,,, +n8n.workflow.execution.duration.seconds.sum,count,,,,The sum of workflow execution duration in seconds. Available in n8n 2.x and later.,0,n8n,,, +n8n.workflow.failed.count,count,,,,Total number of workflows that failed (n8n.workflow.failed event).,0,n8n,,, +n8n.workflow.started.count,count,,,,Total number of workflows started (n8n.workflow.started event).,0,n8n,,, +n8n.workflow.success.count,count,,,,Total number of workflows completed successfully (n8n.workflow.success event).,0,n8n,,, +n8n.workflows.total,gauge,,,,Total number of workflows. Available in n8n 2.x and later when N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS is enabled.,0,n8n,,, diff --git a/n8n/tests/common.py b/n8n/tests/common.py index 34e3fb84ead38..9694a9e748326 100644 --- a/n8n/tests/common.py +++ b/n8n/tests/common.py @@ -3,103 +3,137 @@ # Licensed under a 3-clause BSD style license (see LICENSE) import os +from datadog_checks.base.stubs.aggregator import AggregatorStub from datadog_checks.dev import get_docker_hostname +from datadog_checks.dev.utils import find_free_ports, get_metadata_metrics HERE = os.path.dirname(os.path.abspath(__file__)) COMPOSE_FILE = os.path.join(HERE, 'docker', 'docker-compose.yaml') HOST = get_docker_hostname() -PORT = 5678 +# Allocate free host ports once per session. The values are forwarded to docker compose via +# the ``env_vars`` argument of ``docker_run`` (see ``conftest.py``) so re-runs don't collide +# with stale containers or other locally-bound services. The in-container ports stay fixed. 
+MAIN_PORT, WORKER_PORT = find_free_ports('127.0.0.1', 2) -def get_fixture_path(filename): - return os.path.join(HERE, 'fixtures', filename) +N8N_VERSION = os.environ.get('N8N_VERSION', '1.118.1') +N8N_MAJOR = int(N8N_VERSION.split('.', 1)[0]) +# Submitted by the check itself, not by the OpenMetrics scrape. +CHECK_LEVEL_METRIC_NAMES = frozenset({'n8n.readiness.check'}) -OPENMETRICS_URL = f'http://{HOST}:{PORT}' -INSTANCE = { - 'openmetrics_endpoint': f'{OPENMETRICS_URL}/metrics', -} +# Metric families introduced in n8n 2.x — verified live against n8n@1.118.1 and n8n@2.19.5. +V2_ONLY_METRIC_NAMES = frozenset( + { + 'n8n.audit.workflow.activated.count', + 'n8n.audit.workflow.deactivated.count', + 'n8n.audit.workflow.executed.count', + 'n8n.audit.workflow.resumed.count', + 'n8n.audit.workflow.version.updated.count', + 'n8n.audit.workflow.waiting.count', + 'n8n.credentials.total', + 'n8n.embed.login.failures.count', + 'n8n.embed.login.requests.count', + 'n8n.enabled.users', + 'n8n.manual.executions', + 'n8n.process.pss.bytes', + 'n8n.production.executions', + 'n8n.production.root.executions', + 'n8n.token.exchange.failures.count', + 'n8n.token.exchange.identity.linked.count', + 'n8n.token.exchange.jit.provisioning.count', + 'n8n.token.exchange.requests.count', + 'n8n.users.total', + 'n8n.workflow.execution.duration.seconds.bucket', + 'n8n.workflow.execution.duration.seconds.count', + 'n8n.workflow.execution.duration.seconds.sum', + 'n8n.workflows.total', + } +) + +# Metrics that are mapped and present in metadata but only emit samples after a specific +# event fires (auth failure, audit state transition, libuv request mid-flight). The unit +# fixture has synthetic samples for them; live integration/e2e runs cannot guarantee +# samples and exclude them from the symmetric metadata assertion. 
+RARE_EVENT_METRIC_NAMES = frozenset( + { + 'n8n.audit.workflow.archived.count', + 'n8n.audit.workflow.created.count', + 'n8n.audit.workflow.deactivated.count', + 'n8n.audit.workflow.deleted.count', + 'n8n.audit.workflow.resumed.count', + 'n8n.audit.workflow.unarchived.count', + 'n8n.audit.workflow.updated.count', + 'n8n.audit.workflow.version.updated.count', + 'n8n.audit.workflow.waiting.count', + 'n8n.embed.login.failures.count', + 'n8n.queue.job.stalled.count', + 'n8n.runner.task.requested.count', + 'n8n.token.exchange.failures.count', + # prom-client's per-type libuv request gauge: only has samples while a libuv request is in flight + # at scrape time, so live containers can produce or omit it depending on timing. + 'n8n.nodejs.active.requests', + } +) -E2E_METADATA = { - 'docker_volumes': ['/var/run/docker.sock:/var/run/docker.sock:ro'], +MAIN_INSTANCE = { + 'openmetrics_endpoint': f'http://{HOST}:{MAIN_PORT}/metrics', + 'tags': ['n8n_process:main'], } +WORKER_INSTANCE = { + 'openmetrics_endpoint': f'http://{HOST}:{WORKER_PORT}/metrics', + 'tags': ['n8n_process:worker'], +} +INSTANCE = MAIN_INSTANCE # back-compat default for unit tests + +E2E_METADATA = {'docker_volumes': ['/var/run/docker.sock:/var/run/docker.sock:ro']} + + +def get_compose_env_vars() -> dict[str, str]: + """Variables consumed by docker-compose.yaml's ``${...}`` placeholders.""" + return { + 'N8N_MAIN_HOST_PORT': str(MAIN_PORT), + 'N8N_WORKER_HOST_PORT': str(WORKER_PORT), + } + + +def get_fixture_path(filename: str) -> str: + return os.path.join(HERE, 'fixtures', filename) + + +def get_metadata_metrics_for_version(major: int = N8N_MAJOR, *, exclude_rare: bool = False) -> dict: + """Return the metadata.csv subset that the given n8n major version is expected to emit.""" + metadata = get_metadata_metrics() + if major < 2: + for name in V2_ONLY_METRIC_NAMES: + metadata.pop(name, None) + if exclude_rare: + for name in RARE_EVENT_METRIC_NAMES: + metadata.pop(name, None) + return metadata + + +def 
get_openmetrics_metadata_metrics(major: int = N8N_MAJOR, *, exclude_rare: bool = False) -> dict: + """Version-aware metadata subset minus metrics submitted by the check itself.""" + metadata = get_metadata_metrics_for_version(major, exclude_rare=exclude_rare) + for name in CHECK_LEVEL_METRIC_NAMES: + metadata.pop(name, None) + return metadata + + +def get_all_metadata_metrics(major: int = N8N_MAJOR, *, exclude_rare: bool = False) -> dict: + """Version-aware metadata subset including the readiness gauge submitted by the check.""" + return get_metadata_metrics_for_version(major, exclude_rare=exclude_rare) + + +def drop_rare_event_metrics(aggregator: AggregatorStub): + """Strip rare-event metrics from the aggregator before a symmetric metadata assertion. -TEST_METRICS = [ - 'n8n.active.workflow.count', - 'n8n.api.request.duration.seconds.bucket', - 'n8n.api.request.duration.seconds.count', - 'n8n.api.request.duration.seconds.sum', - 'n8n.api.requests.count', - 'n8n.cache.errors.count', - 'n8n.cache.hits.count', - 'n8n.cache.latency.seconds.bucket', - 'n8n.cache.latency.seconds.count', - 'n8n.cache.latency.seconds.sum', - 'n8n.cache.misses.count', - 'n8n.cache.operations.count', - 'n8n.eventbus.connections.total', - 'n8n.eventbus.events.failed.count', - 'n8n.eventbus.events.processed.count', - 'n8n.eventbus.events.count', - 'n8n.eventbus.queue.size', - 'n8n.instance.role.leader', - 'n8n.last.activity', - 'n8n.nodejs.active.handles', - 'n8n.nodejs.active.handles.total', - 'n8n.nodejs.active.requests.total', - 'n8n.nodejs.active.resources', - 'n8n.nodejs.active.resources.total', - 'n8n.nodejs.event.loop.lag.seconds', - 'n8n.nodejs.eventloop.lag.max.seconds', - 'n8n.nodejs.eventloop.lag.mean.seconds', - 'n8n.nodejs.eventloop.lag.min.seconds', - 'n8n.nodejs.eventloop.lag.p50.seconds', - 'n8n.nodejs.eventloop.lag.p90.seconds', - 'n8n.nodejs.eventloop.lag.p99.seconds', - 'n8n.nodejs.eventloop.lag.seconds', - 'n8n.nodejs.eventloop.lag.stddev.seconds', - 
'n8n.nodejs.external.memory.bytes', - 'n8n.nodejs.gc.duration.seconds.bucket', - 'n8n.nodejs.gc.duration.seconds.count', - 'n8n.nodejs.gc.duration.seconds.sum', - 'n8n.nodejs.heap.size.total.bytes', - 'n8n.nodejs.heap.size.used.bytes', - 'n8n.nodejs.heap.space.size.available.bytes', - 'n8n.nodejs.heap.space.size.total.bytes', - 'n8n.nodejs.heap.space.size.used.bytes', - 'n8n.nodejs.heap.total.bytes', - 'n8n.nodejs.heap.used.bytes', - 'n8n.process.cpu.system.seconds.count', - 'n8n.process.cpu.user.seconds.count', - 'n8n.process.heap.bytes', - 'n8n.process.max.fds', - 'n8n.process.open.fds', - 'n8n.process.resident.memory.bytes', - 'n8n.process.uptime.seconds', - 'n8n.process.virtual.memory.bytes', - 'n8n.queue.job.active.total', - 'n8n.queue.job.attempts.count', - 'n8n.queue.job.completed.count', - 'n8n.queue.job.delayed.total', - 'n8n.queue.job.dequeued.count', - 'n8n.queue.job.enqueued.count', - 'n8n.queue.job.failed.count', - 'n8n.queue.job.waiting.duration.seconds.bucket', - 'n8n.queue.job.waiting.duration.seconds.count', - 'n8n.queue.job.waiting.duration.seconds.sum', - 'n8n.queue.job.waiting.total', - 'n8n.queue.jobs.duration.seconds.bucket', - 'n8n.queue.jobs.duration.seconds.count', - 'n8n.queue.jobs.duration.seconds.sum', - 'n8n.queue.jobs.count', - 'n8n.readiness.check', - 'n8n.workflow.executions.active', - 'n8n.workflow.executions.duration.seconds.bucket', - 'n8n.workflow.executions.duration.seconds.count', - 'n8n.workflow.executions.duration.seconds.sum', - 'n8n.workflow.executions.count', - 'n8n.workflow.failed.count', - 'n8n.workflow.started.count', - 'n8n.workflow.success.count', - 'n8n.process.cpu.seconds.count', -] + These metrics are mapped and present in metadata.csv but only emit samples opportunistically + (auth failures, libuv requests in flight). Live containers may submit them or not depending on + timing, which makes ``check_symmetric_inclusion=True`` flaky in either direction. 
Dropping them + from the aggregator (and from the metadata subset via ``exclude_rare=True``) keeps the + symmetric check stable while still verifying the rest of the surface end-to-end. + """ + for name in RARE_EVENT_METRIC_NAMES: + aggregator._metrics.pop(name, None) diff --git a/n8n/tests/conftest.py b/n8n/tests/conftest.py index c6face31f7d4c..6539f85e00ab2 100644 --- a/n8n/tests/conftest.py +++ b/n8n/tests/conftest.py @@ -3,27 +3,111 @@ # Licensed under a 3-clause BSD style license (see LICENSE) import copy +import subprocess +import time +from contextlib import suppress +from typing import Any, Iterator import pytest +import requests from datadog_checks.dev import docker_run from datadog_checks.dev.conditions import CheckEndpoints from . import common +WORKFLOW_OK_PATH = '/workflows/sample_workflow.json' +WORKFLOW_FAIL_PATH = '/workflows/sample_workflow_failing.json' +WORKFLOW_OK_ID = 'testWorkflowOk' +WORKFLOW_FAIL_ID = 'testWorkflowFail' + +WEBHOOK_OK_PATH = '/webhook/test' +WEBHOOK_FAIL_PATH = '/webhook/fail' + +CONTAINER = 'n8n-test' + + +def _docker_exec(*cmd: str) -> str: + return subprocess.check_output(['docker', 'exec', CONTAINER, *cmd], stderr=subprocess.STDOUT).decode() + + +def _wait_for_n8n(timeout: int = 90): + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + with suppress(requests.RequestException): + if requests.get(common.MAIN_INSTANCE['openmetrics_endpoint'], timeout=2).status_code == 200: + return + time.sleep(2) + raise RuntimeError('n8n did not become healthy in time') + + +def _activate_imported_workflows(): + """Import sample workflows by stable id, activate them, restart n8n so webhooks register.""" + _docker_exec('n8n', 'import:workflow', f'--input={WORKFLOW_OK_PATH}') + _docker_exec('n8n', 'import:workflow', f'--input={WORKFLOW_FAIL_PATH}') + + for wf_id in (WORKFLOW_OK_ID, WORKFLOW_FAIL_ID): + _docker_exec('n8n', 'update:workflow', f'--id={wf_id}', '--active=true') + + subprocess.check_call(['docker', 
'restart', CONTAINER], stderr=subprocess.STDOUT) + _wait_for_n8n() + + +def _generate_workflow_traffic(iterations: int = 5): + """Trigger workflows + API endpoints so workflow event and HTTP histogram metrics fire. + + Failures are not silently swallowed — at least the OK webhook must respond, otherwise + the test fixture is broken and downstream metric assertions can't be trusted. + """ + base_url = f'http://{common.HOST}:{common.MAIN_PORT}' + api_paths = ('/healthz', '/healthz/readiness', '/rest/login') + ok_responses = 0 + for _ in range(iterations): + with suppress(requests.RequestException): + ok = requests.get(f'{base_url}{WEBHOOK_OK_PATH}', timeout=5) + if ok.status_code < 500: + ok_responses += 1 + # Webhook fail is *expected* to error out — that's the point of triggering it. + for path in (WEBHOOK_FAIL_PATH, *api_paths): + with suppress(requests.RequestException): + requests.get(f'{base_url}{path}', timeout=5) + if ok_responses == 0: + raise RuntimeError('Test webhook returned no successful responses; workflow registration failed') + + +def _wait_for_workflow_metric(timeout: int = 30): + """Poll /metrics until at least one workflow_started_total sample is non-zero.""" + deadline = time.monotonic() + timeout + metrics_url = common.MAIN_INSTANCE['openmetrics_endpoint'] + while time.monotonic() < deadline: + with suppress(requests.RequestException): + payload = requests.get(metrics_url, timeout=3).text + for line in payload.splitlines(): + if line.startswith('n8n_workflow_started_total') and not line.endswith(' 0'): + return + time.sleep(2) + raise RuntimeError('workflow_started_total never went non-zero') + @pytest.fixture(scope='session') -def dd_environment(): - compose_file = common.COMPOSE_FILE +def dd_environment() -> Iterator[tuple[dict[str, Any], dict[str, Any]]]: conditions = [ - CheckEndpoints(common.INSTANCE["openmetrics_endpoint"]), + CheckEndpoints(common.MAIN_INSTANCE['openmetrics_endpoint']), + 
CheckEndpoints(common.WORKER_INSTANCE['openmetrics_endpoint']), + _activate_imported_workflows, + _generate_workflow_traffic, + _wait_for_workflow_metric, ] - with docker_run(compose_file, conditions=conditions): - yield { - 'instances': [common.INSTANCE], - } + with docker_run(common.COMPOSE_FILE, conditions=conditions, env_vars=common.get_compose_env_vars()): + config = {'instances': [common.MAIN_INSTANCE, common.WORKER_INSTANCE]} + yield config, common.E2E_METADATA + + +@pytest.fixture +def instance() -> dict[str, Any]: + return copy.deepcopy(common.MAIN_INSTANCE) @pytest.fixture -def instance(): - return copy.deepcopy(common.INSTANCE) +def worker_instance() -> dict[str, Any]: + return copy.deepcopy(common.WORKER_INSTANCE) diff --git a/n8n/tests/docker/Dockerfile b/n8n/tests/docker/Dockerfile deleted file mode 100644 index d74b7ccd9c162..0000000000000 --- a/n8n/tests/docker/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -ARG N8N_VERSION=1.118.1 -FROM n8nio/n8n:${N8N_VERSION} - -# Set environment variables to enable metrics and logging -ENV N8N_METRICS=true \ - N8N_LOG_LEVEL=debug \ - N8N_METRICS_INCLUDE_DEFAULT_METRICS=true \ - N8N_METRICS_INCLUDE_CACHE_METRICS=true \ - N8N_METRICS_INCLUDE_MESSAGE_EVENT_BUS_METRICS=true \ - N8N_HOST=0.0.0.0 \ - N8N_PORT=5678 - -# Expose the n8n port -EXPOSE 5678 diff --git a/n8n/tests/docker/README.md b/n8n/tests/docker/README.md index bb1d23cc34ce1..ac2ded112e06f 100644 --- a/n8n/tests/docker/README.md +++ b/n8n/tests/docker/README.md @@ -82,7 +82,8 @@ This setup is designed for integration testing. The n8n instance will: ## Notes -- The container uses the latest official n8n Docker image +- The container uses the official `n8nio/n8n` image at the version selected via the `N8N_VERSION` environment variable (forwarded by `hatch.toml`'s test matrix). The default in `docker-compose.yaml` is `1.118.1`. 
+- Queue mode is enabled with a Redis container and a separate `n8n-worker` service that exposes its own `/metrics` endpoint on host port `5680` (the default `5679` collides with the n8n 2.x task runner broker). - Data is persisted in a Docker volume named `n8n_data` - The health check waits up to 30 seconds for n8n to start before marking it as healthy diff --git a/n8n/tests/docker/docker-compose.yaml b/n8n/tests/docker/docker-compose.yaml index fb8da72559b78..554114d2819a2 100644 --- a/n8n/tests/docker/docker-compose.yaml +++ b/n8n/tests/docker/docker-compose.yaml @@ -1,33 +1,50 @@ services: + redis: + image: redis:7-alpine + container_name: n8n-test-redis + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 5 + n8n: - build: - context: . - dockerfile: Dockerfile + image: n8nio/n8n:${N8N_VERSION:-1.118.1} container_name: n8n-test ports: - - "5678:5678" + - "${N8N_MAIN_HOST_PORT:-5678}:5678" environment: - # Enable metrics endpoint - - N8N_METRICS=true - - N8N_METRICS_INCLUDE_DEFAULT_METRICS=true - - N8N_METRICS_INCLUDE_CACHE_METRICS=true - - N8N_METRICS_INCLUDE_MESSAGE_EVENT_BUS_METRICS=true - - N8N_METRICS_INCLUDE_API_ENDPOINTS=true - - N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL=true - # Logging configuration + - EXECUTIONS_MODE=queue + - QUEUE_BULL_REDIS_HOST=redis + - QUEUE_BULL_REDIS_PORT=6379 - N8N_LOG_LEVEL=debug - N8N_LOG_OUTPUT=console - # Basic configuration - N8N_HOST=0.0.0.0 - N8N_PORT=5678 - N8N_PROTOCOL=http - # Authentication (optional for testing) - N8N_BASIC_AUTH_ACTIVE=true - N8N_BASIC_AUTH_USER=admin - N8N_BASIC_AUTH_PASSWORD=admin + - N8N_DIAGNOSTICS_ENABLED=false + - N8N_VERSION_NOTIFICATIONS_ENABLED=false + - N8N_TEMPLATES_ENABLED=false + - N8N_RUNNERS_ENABLED=false + - N8N_METRICS=true + - N8N_METRICS_INCLUDE_DEFAULT_METRICS=true + - N8N_METRICS_INCLUDE_CACHE_METRICS=true + - N8N_METRICS_INCLUDE_MESSAGE_EVENT_BUS_METRICS=true + - N8N_METRICS_INCLUDE_API_ENDPOINTS=true + - 
N8N_METRICS_INCLUDE_QUEUE_METRICS=true + - N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL=true + - N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS=true volumes: - n8n_data:/home/node/.n8n - ${N8N_LOG_FOLDER:-./logs}:/var/log/n8n + - ./sample_workflow.json:/workflows/sample_workflow.json:ro + - ./sample_workflow_failing.json:/workflows/sample_workflow_failing.json:ro + depends_on: + redis: + condition: service_healthy healthcheck: test: ["CMD", "wget", "-q", "--spider", "http://localhost:5678/healthz"] interval: 10s @@ -35,7 +52,35 @@ services: retries: 5 start_period: 30s + n8n-worker: + image: n8nio/n8n:${N8N_VERSION:-1.118.1} + container_name: n8n-test-worker + command: ["worker"] + ports: + - "${N8N_WORKER_HOST_PORT:-5680}:5680" + environment: + - EXECUTIONS_MODE=queue + - QUEUE_BULL_REDIS_HOST=redis + - QUEUE_BULL_REDIS_PORT=6379 + - N8N_LOG_LEVEL=info + - N8N_LOG_OUTPUT=console + - N8N_RUNNERS_ENABLED=false + - N8N_METRICS=true + - N8N_METRICS_INCLUDE_DEFAULT_METRICS=true + - N8N_METRICS_INCLUDE_CACHE_METRICS=true + - N8N_METRICS_INCLUDE_MESSAGE_EVENT_BUS_METRICS=true + - N8N_METRICS_INCLUDE_API_ENDPOINTS=true + - N8N_METRICS_INCLUDE_QUEUE_METRICS=true + - N8N_METRICS_INCLUDE_WORKFLOW_ID_LABEL=true + - N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS=true + - QUEUE_HEALTH_CHECK_ACTIVE=true + - QUEUE_HEALTH_CHECK_PORT=5680 + volumes: + - n8n_data:/home/node/.n8n + depends_on: + n8n: + condition: service_healthy + volumes: n8n_data: driver: local - diff --git a/n8n/tests/docker/sample_workflow.json b/n8n/tests/docker/sample_workflow.json new file mode 100644 index 0000000000000..94400565cfd9e --- /dev/null +++ b/n8n/tests/docker/sample_workflow.json @@ -0,0 +1,59 @@ +{ + "id": "testWorkflowOk", + "versionId": "00000000-0000-0000-0000-000000000001", + "name": "Test Workflow", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "test", + "responseMode": "lastNode", + "options": {} + }, + "id": "11111111-1111-1111-1111-111111111111", + "name": "Webhook", + "type": 
"n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "test-webhook-aaaa-bbbb-cccc-111111111111" + }, + { + "parameters": { + "assignments": { + "assignments": [ + { + "id": "1", + "name": "ok", + "value": "true", + "type": "string" + } + ] + }, + "options": {} + }, + "id": "22222222-2222-2222-2222-222222222222", + "name": "Set", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [460, 300] + } + ], + "connections": { + "Webhook": { + "main": [ + [ + { + "node": "Set", + "type": "main", + "index": 0 + } + ] + ] + } + }, + "active": false, + "settings": { + "executionOrder": "v1" + }, + "pinData": {} +} diff --git a/n8n/tests/docker/sample_workflow_failing.json b/n8n/tests/docker/sample_workflow_failing.json new file mode 100644 index 0000000000000..159f08bfc8843 --- /dev/null +++ b/n8n/tests/docker/sample_workflow_failing.json @@ -0,0 +1,50 @@ +{ + "id": "testWorkflowFail", + "versionId": "00000000-0000-0000-0000-000000000002", + "name": "Failing Test Workflow", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "fail", + "responseMode": "lastNode", + "options": {} + }, + "id": "33333333-3333-3333-3333-333333333333", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "test-fail-aaaa-bbbb-cccc-333333333333" + }, + { + "parameters": { + "language": "javaScript", + "jsCode": "throw new Error('intentional failure for metrics tests');" + }, + "id": "44444444-4444-4444-4444-444444444444", + "name": "Code", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [460, 300] + } + ], + "connections": { + "Webhook": { + "main": [ + [ + { + "node": "Code", + "type": "main", + "index": 0 + } + ] + ] + } + }, + "active": false, + "settings": { + "executionOrder": "v1" + }, + "pinData": {} +} diff --git a/n8n/tests/fixtures/n8n.txt b/n8n/tests/fixtures/n8n.txt index c670f02d7fe46..bf9603a77f907 100644 --- a/n8n/tests/fixtures/n8n.txt 
+++ b/n8n/tests/fixtures/n8n.txt @@ -1,34 +1,34 @@ # HELP n8n_process_cpu_user_seconds_total Total user CPU time spent in seconds. # TYPE n8n_process_cpu_user_seconds_total counter -n8n_process_cpu_user_seconds_total 8.298932999999998 +n8n_process_cpu_user_seconds_total 0.921656 # HELP n8n_process_cpu_system_seconds_total Total system CPU time spent in seconds. # TYPE n8n_process_cpu_system_seconds_total counter -n8n_process_cpu_system_seconds_total 3.1041119999999998 +n8n_process_cpu_system_seconds_total 0.157367 # HELP n8n_process_cpu_seconds_total Total user and system CPU time spent in seconds. # TYPE n8n_process_cpu_seconds_total counter -n8n_process_cpu_seconds_total 11.403044999999999 +n8n_process_cpu_seconds_total 1.0790229999999998 # HELP n8n_process_start_time_seconds Start time of the process since unix epoch in seconds. # TYPE n8n_process_start_time_seconds gauge -n8n_process_start_time_seconds 1761656578 +n8n_process_start_time_seconds 1778234580 # HELP n8n_process_resident_memory_bytes Resident memory size in bytes. # TYPE n8n_process_resident_memory_bytes gauge -n8n_process_resident_memory_bytes 245043200 +n8n_process_resident_memory_bytes 267681792 # HELP n8n_process_virtual_memory_bytes Virtual memory size in bytes. # TYPE n8n_process_virtual_memory_bytes gauge -n8n_process_virtual_memory_bytes 33656197120 +n8n_process_virtual_memory_bytes 18517532672 # HELP n8n_process_heap_bytes Process heap size in bytes. # TYPE n8n_process_heap_bytes gauge -n8n_process_heap_bytes 277200896 +n8n_process_heap_bytes 840728576 # HELP n8n_process_open_fds Number of open file descriptors. # TYPE n8n_process_open_fds gauge -n8n_process_open_fds 44 +n8n_process_open_fds 45 # HELP n8n_process_max_fds Maximum number of open file descriptors. # TYPE n8n_process_max_fds gauge @@ -36,59 +36,62 @@ n8n_process_max_fds 1048576 # HELP n8n_nodejs_eventloop_lag_seconds Lag of event loop in seconds. 
# TYPE n8n_nodejs_eventloop_lag_seconds gauge -n8n_nodejs_eventloop_lag_seconds 0.002765567 +n8n_nodejs_eventloop_lag_seconds 0.008676917 # HELP n8n_nodejs_eventloop_lag_min_seconds The minimum recorded event loop delay. # TYPE n8n_nodejs_eventloop_lag_min_seconds gauge -n8n_nodejs_eventloop_lag_min_seconds 0.010018816 +n8n_nodejs_eventloop_lag_min_seconds 0.006340608 # HELP n8n_nodejs_eventloop_lag_max_seconds The maximum recorded event loop delay. # TYPE n8n_nodejs_eventloop_lag_max_seconds gauge -n8n_nodejs_eventloop_lag_max_seconds 0.011239423 +n8n_nodejs_eventloop_lag_max_seconds 0.030228479 # HELP n8n_nodejs_eventloop_lag_mean_seconds The mean of the recorded event loop delays. # TYPE n8n_nodejs_eventloop_lag_mean_seconds gauge -n8n_nodejs_eventloop_lag_mean_seconds 0.010092521938958708 +n8n_nodejs_eventloop_lag_mean_seconds 0.012079332927643785 # HELP n8n_nodejs_eventloop_lag_stddev_seconds The standard deviation of the recorded event loop delays. # TYPE n8n_nodejs_eventloop_lag_stddev_seconds gauge -n8n_nodejs_eventloop_lag_stddev_seconds 0.00016945350643679045 +n8n_nodejs_eventloop_lag_stddev_seconds 0.0011467288819057616 # HELP n8n_nodejs_eventloop_lag_p50_seconds The 50th percentile of the recorded event loop delays. # TYPE n8n_nodejs_eventloop_lag_p50_seconds gauge -n8n_nodejs_eventloop_lag_p50_seconds 0.010067967 +n8n_nodejs_eventloop_lag_p50_seconds 0.012001279 # HELP n8n_nodejs_eventloop_lag_p90_seconds The 90th percentile of the recorded event loop delays. # TYPE n8n_nodejs_eventloop_lag_p90_seconds gauge -n8n_nodejs_eventloop_lag_p90_seconds 0.010067967 +n8n_nodejs_eventloop_lag_p90_seconds 0.013254655 # HELP n8n_nodejs_eventloop_lag_p99_seconds The 99th percentile of the recorded event loop delays. 
# TYPE n8n_nodejs_eventloop_lag_p99_seconds gauge -n8n_nodejs_eventloop_lag_p99_seconds 0.011124735 +n8n_nodejs_eventloop_lag_p99_seconds 0.014426111 # HELP n8n_nodejs_active_resources Number of active resources that are currently keeping the event loop alive, grouped by async resource type. # TYPE n8n_nodejs_active_resources gauge -n8n_nodejs_active_resources{type="PipeWrap"} 2 -n8n_nodejs_active_resources{type="TCPServerWrap"} 1 -n8n_nodejs_active_resources{type="TCPSocketWrap"} 1 -n8n_nodejs_active_resources{type="Timeout"} 13 +n8n_nodejs_active_resources{type="PipeWrap"} 5 +n8n_nodejs_active_resources{type="TCPServerWrap"} 2 +n8n_nodejs_active_resources{type="TCPSocketWrap"} 9 +n8n_nodejs_active_resources{type="ProcessWrap"} 1 +n8n_nodejs_active_resources{type="Timeout"} 20 n8n_nodejs_active_resources{type="Immediate"} 1 # HELP n8n_nodejs_active_resources_total Total number of active resources. # TYPE n8n_nodejs_active_resources_total gauge -n8n_nodejs_active_resources_total 18 +n8n_nodejs_active_resources_total 38 # HELP n8n_nodejs_active_handles Number of active libuv handles grouped by handle type. Every handle type is C++ class name. # TYPE n8n_nodejs_active_handles gauge -n8n_nodejs_active_handles{type="Socket"} 3 -n8n_nodejs_active_handles{type="Server"} 1 +n8n_nodejs_active_handles{type="Socket"} 14 +n8n_nodejs_active_handles{type="Server"} 2 +n8n_nodejs_active_handles{type="ChildProcess"} 1 # HELP n8n_nodejs_active_handles_total Total number of active handles. # TYPE n8n_nodejs_active_handles_total gauge -n8n_nodejs_active_handles_total 4 +n8n_nodejs_active_handles_total 17 # HELP n8n_nodejs_active_requests Number of active libuv requests grouped by request type. Every request type is C++ class name. # TYPE n8n_nodejs_active_requests gauge +n8n_nodejs_active_requests{type="FSReqCallback"} 1 # HELP n8n_nodejs_active_requests_total Total number of active requests. 
# TYPE n8n_nodejs_active_requests_total gauge @@ -96,81 +99,87 @@ n8n_nodejs_active_requests_total 0 # HELP n8n_nodejs_heap_size_total_bytes Process heap size from Node.js in bytes. # TYPE n8n_nodejs_heap_size_total_bytes gauge -n8n_nodejs_heap_size_total_bytes 142774272 +n8n_nodejs_heap_size_total_bytes 146391040 # HELP n8n_nodejs_heap_size_used_bytes Process heap size used from Node.js in bytes. # TYPE n8n_nodejs_heap_size_used_bytes gauge -n8n_nodejs_heap_size_used_bytes 136342632 +n8n_nodejs_heap_size_used_bytes 136336448 # HELP n8n_nodejs_external_memory_bytes Node.js external memory size in bytes. # TYPE n8n_nodejs_external_memory_bytes gauge -n8n_nodejs_external_memory_bytes 20824585 +n8n_nodejs_external_memory_bytes 20993559 # HELP n8n_nodejs_heap_space_size_total_bytes Process heap space size total from Node.js in bytes. # TYPE n8n_nodejs_heap_space_size_total_bytes gauge n8n_nodejs_heap_space_size_total_bytes{space="read_only"} 0 -n8n_nodejs_heap_space_size_total_bytes{space="new"} 1048576 -n8n_nodejs_heap_space_size_total_bytes{space="old"} 122208256 -n8n_nodejs_heap_space_size_total_bytes{space="code"} 4718592 +n8n_nodejs_heap_space_size_total_bytes{space="new"} 2097152 +n8n_nodejs_heap_space_size_total_bytes{space="old"} 116920320 +n8n_nodejs_heap_space_size_total_bytes{space="code"} 5505024 n8n_nodejs_heap_space_size_total_bytes{space="shared"} 0 -n8n_nodejs_heap_space_size_total_bytes{space="trusted"} 7643136 +n8n_nodejs_heap_space_size_total_bytes{space="trusted"} 11624448 +n8n_nodejs_heap_space_size_total_bytes{space="shared_trusted"} 0 n8n_nodejs_heap_space_size_total_bytes{space="new_large_object"} 0 -n8n_nodejs_heap_space_size_total_bytes{space="large_object"} 7000064 -n8n_nodejs_heap_space_size_total_bytes{space="code_large_object"} 155648 +n8n_nodejs_heap_space_size_total_bytes{space="large_object"} 9875456 +n8n_nodejs_heap_space_size_total_bytes{space="code_large_object"} 368640 
n8n_nodejs_heap_space_size_total_bytes{space="shared_large_object"} 0 +n8n_nodejs_heap_space_size_total_bytes{space="shared_trusted_large_object"} 0 n8n_nodejs_heap_space_size_total_bytes{space="trusted_large_object"} 0 # HELP n8n_nodejs_heap_space_size_used_bytes Process heap space size used from Node.js in bytes. # TYPE n8n_nodejs_heap_space_size_used_bytes gauge n8n_nodejs_heap_space_size_used_bytes{space="read_only"} 0 -n8n_nodejs_heap_space_size_used_bytes{space="new"} 652896 -n8n_nodejs_heap_space_size_used_bytes{space="old"} 119347344 -n8n_nodejs_heap_space_size_used_bytes{space="code"} 4183424 +n8n_nodejs_heap_space_size_used_bytes{space="new"} 382808 +n8n_nodejs_heap_space_size_used_bytes{space="old"} 111099512 +n8n_nodejs_heap_space_size_used_bytes{space="code"} 4853344 n8n_nodejs_heap_space_size_used_bytes{space="shared"} 0 -n8n_nodejs_heap_space_size_used_bytes{space="trusted"} 5187192 +n8n_nodejs_heap_space_size_used_bytes{space="trusted"} 9839592 +n8n_nodejs_heap_space_size_used_bytes{space="shared_trusted"} 0 n8n_nodejs_heap_space_size_used_bytes{space="new_large_object"} 0 -n8n_nodejs_heap_space_size_used_bytes{space="large_object"} 6837144 -n8n_nodejs_heap_space_size_used_bytes{space="code_large_object"} 138432 +n8n_nodejs_heap_space_size_used_bytes{space="large_object"} 9806288 +n8n_nodejs_heap_space_size_used_bytes{space="code_large_object"} 361728 n8n_nodejs_heap_space_size_used_bytes{space="shared_large_object"} 0 +n8n_nodejs_heap_space_size_used_bytes{space="shared_trusted_large_object"} 0 n8n_nodejs_heap_space_size_used_bytes{space="trusted_large_object"} 0 # HELP n8n_nodejs_heap_space_size_available_bytes Process heap space size available from Node.js in bytes. 
# TYPE n8n_nodejs_heap_space_size_available_bytes gauge n8n_nodejs_heap_space_size_available_bytes{space="read_only"} 0 -n8n_nodejs_heap_space_size_available_bytes{space="new"} 378016 -n8n_nodejs_heap_space_size_available_bytes{space="old"} 430568 -n8n_nodejs_heap_space_size_available_bytes{space="code"} 239680 +n8n_nodejs_heap_space_size_available_bytes{space="new"} 665704 +n8n_nodejs_heap_space_size_available_bytes{space="old"} 5484264 +n8n_nodejs_heap_space_size_available_bytes{space="code"} 651008 n8n_nodejs_heap_space_size_available_bytes{space="shared"} 0 -n8n_nodejs_heap_space_size_available_bytes{space="trusted"} 2323072 +n8n_nodejs_heap_space_size_available_bytes{space="trusted"} 1771032 +n8n_nodejs_heap_space_size_available_bytes{space="shared_trusted"} 0 n8n_nodejs_heap_space_size_available_bytes{space="new_large_object"} 1048576 n8n_nodejs_heap_space_size_available_bytes{space="large_object"} 0 n8n_nodejs_heap_space_size_available_bytes{space="code_large_object"} 0 n8n_nodejs_heap_space_size_available_bytes{space="shared_large_object"} 0 +n8n_nodejs_heap_space_size_available_bytes{space="shared_trusted_large_object"} 0 n8n_nodejs_heap_space_size_available_bytes{space="trusted_large_object"} 0 # HELP n8n_nodejs_version_info Node.js version info. # TYPE n8n_nodejs_version_info gauge -n8n_nodejs_version_info{version="v22.18.0",major="22",minor="18",patch="0"} 1 +n8n_nodejs_version_info{version="v24.14.1",major="24",minor="14",patch="1"} 1 # HELP n8n_nodejs_gc_duration_seconds Garbage collection duration by kind, one of major, minor, incremental or weakcb. 
# TYPE n8n_nodejs_gc_duration_seconds histogram -n8n_nodejs_gc_duration_seconds_bucket{le="0.001",kind="minor"} 128 -n8n_nodejs_gc_duration_seconds_bucket{le="0.01",kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_bucket{le="0.1",kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_bucket{le="1",kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_bucket{le="2",kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_bucket{le="5",kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_sum{kind="minor"} 0.09924478498101237 -n8n_nodejs_gc_duration_seconds_count{kind="minor"} 132 -n8n_nodejs_gc_duration_seconds_bucket{le="0.001",kind="incremental"} 1 +n8n_nodejs_gc_duration_seconds_bucket{le="0.001",kind="minor"} 0 +n8n_nodejs_gc_duration_seconds_bucket{le="0.01",kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_bucket{le="0.1",kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_bucket{le="1",kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_bucket{le="2",kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_bucket{le="5",kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_sum{kind="minor"} 0.004925500000128522 +n8n_nodejs_gc_duration_seconds_count{kind="minor"} 2 +n8n_nodejs_gc_duration_seconds_bucket{le="0.001",kind="incremental"} 0 n8n_nodejs_gc_duration_seconds_bucket{le="0.01",kind="incremental"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="0.1",kind="incremental"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="1",kind="incremental"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="2",kind="incremental"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="5",kind="incremental"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="incremental"} 2 -n8n_nodejs_gc_duration_seconds_sum{kind="incremental"} 0.0022786640077829363 +n8n_nodejs_gc_duration_seconds_sum{kind="incremental"} 0.005939041999867186 n8n_nodejs_gc_duration_seconds_count{kind="incremental"} 2 
n8n_nodejs_gc_duration_seconds_bucket{le="0.001",kind="major"} 0 n8n_nodejs_gc_duration_seconds_bucket{le="0.01",kind="major"} 0 @@ -179,231 +188,251 @@ n8n_nodejs_gc_duration_seconds_bucket{le="1",kind="major"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="2",kind="major"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="5",kind="major"} 2 n8n_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="major"} 2 -n8n_nodejs_gc_duration_seconds_sum{kind="major"} 0.1028408939987421 +n8n_nodejs_gc_duration_seconds_sum{kind="major"} 0.032123332999879496 n8n_nodejs_gc_duration_seconds_count{kind="major"} 2 +# HELP n8n_process_pss_bytes Proportional Set Size of the process in bytes. +# TYPE n8n_process_pss_bytes gauge +n8n_process_pss_bytes 220097536 + # HELP n8n_version_info n8n version info. # TYPE n8n_version_info gauge -n8n_version_info{version="v1.117.2",major="1",minor="117",patch="2"} 1 +n8n_version_info{version="v2.19.5",major="2",minor="19",patch="5"} 1 # HELP n8n_instance_role_leader Whether this main instance is the leader (1) or not (0). # TYPE n8n_instance_role_leader gauge n8n_instance_role_leader 1 +# HELP n8n_cache_hits_total Total number of cache hits. +# TYPE n8n_cache_hits_total counter +n8n_cache_hits_total 53 + +# HELP n8n_cache_misses_total Total number of cache misses. +# TYPE n8n_cache_misses_total counter +n8n_cache_misses_total 15 + +# HELP n8n_cache_updates_total Total number of cache updates. 
+# TYPE n8n_cache_updates_total counter +n8n_cache_updates_total 1 + # HELP n8n_http_request_duration_seconds duration histogram of http responses labeled with: status_code # TYPE n8n_http_request_duration_seconds histogram +n8n_http_request_duration_seconds_bucket{le="0.003"} 5 +n8n_http_request_duration_seconds_bucket{le="0.03"} 5 +n8n_http_request_duration_seconds_bucket{le="0.1"} 5 +n8n_http_request_duration_seconds_bucket{le="0.3"} 5 +n8n_http_request_duration_seconds_bucket{le="1.5"} 5 +n8n_http_request_duration_seconds_bucket{le="10"} 5 +n8n_http_request_duration_seconds_bucket{le="+Inf"} 5 +n8n_http_request_duration_seconds_sum 0.0018007910000000002 +n8n_http_request_duration_seconds_count 5 # HELP n8n_last_activity last instance activity (backend request) in Unix time (seconds). # TYPE n8n_last_activity gauge -n8n_last_activity 1761656582 +n8n_last_activity 1778234587 + +# HELP n8n_scaling_mode_queue_jobs_waiting Current number of enqueued jobs waiting for pickup in scaling mode. +# TYPE n8n_scaling_mode_queue_jobs_waiting gauge +n8n_scaling_mode_queue_jobs_waiting 0 + +# HELP n8n_scaling_mode_queue_jobs_active Current number of jobs being processed across all workers in scaling mode. +# TYPE n8n_scaling_mode_queue_jobs_active gauge +n8n_scaling_mode_queue_jobs_active 0 + +# HELP n8n_scaling_mode_queue_jobs_completed Total number of jobs completed across all workers in scaling mode since instance start. +# TYPE n8n_scaling_mode_queue_jobs_completed counter +n8n_scaling_mode_queue_jobs_completed 8 + +# HELP n8n_scaling_mode_queue_jobs_failed Total number of jobs failed across all workers in scaling mode since instance start. +# TYPE n8n_scaling_mode_queue_jobs_failed counter +n8n_scaling_mode_queue_jobs_failed 0 + +# HELP n8n_workflow_execution_duration_seconds Workflow execution duration in seconds. 
+# TYPE n8n_workflow_execution_duration_seconds histogram +n8n_workflow_execution_duration_seconds_bucket{le="0.005",status="success",mode="webhook",workflow_id="testWorkflowOk"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.01",status="success",mode="webhook",workflow_id="testWorkflowOk"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.025",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="0.05",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="0.1",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="0.25",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="0.5",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="1",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="2.5",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="5",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="10",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="30",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="60",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="120",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="300",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="600",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 
+n8n_workflow_execution_duration_seconds_bucket{le="+Inf",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_sum{status="success",mode="webhook",workflow_id="testWorkflowOk"} 0.027999999999999997 +n8n_workflow_execution_duration_seconds_count{status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="0.005",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.01",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.025",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.05",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.1",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.25",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +n8n_workflow_execution_duration_seconds_bucket{le="0.5",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="1",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="2.5",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="5",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="10",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="30",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="60",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 
+n8n_workflow_execution_duration_seconds_bucket{le="120",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="300",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="600",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_bucket{le="+Inf",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +n8n_workflow_execution_duration_seconds_sum{status="failed",mode="webhook",workflow_id="testWorkflowFail"} 0.405 +n8n_workflow_execution_duration_seconds_count{status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 # HELP n8n_active_workflow_count Total number of active workflows. # TYPE n8n_active_workflow_count gauge -n8n_active_workflow_count{workflow_id="wf_8a3b2c1d"} 0 -n8n_active_workflow_count{workflow_id="wf_7f4e9a2b"} 0 -n8n_active_workflow_count{workflow_id="wf_5d6c8e1f"} 0 - -# HELP n8n_nodejs_event_loop_lag_seconds Event loop lag in seconds -# TYPE n8n_nodejs_event_loop_lag_seconds gauge -n8n_nodejs_event_loop_lag_seconds 0.0035 - -# HELP n8n_nodejs_heap_total_bytes Total heap size allocated in bytes -# TYPE n8n_nodejs_heap_total_bytes gauge -n8n_nodejs_heap_total_bytes 73400320 - -# HELP n8n_nodejs_heap_used_bytes Heap memory used in bytes -# TYPE n8n_nodejs_heap_used_bytes gauge -n8n_nodejs_heap_used_bytes 51200000 - -# HELP n8n_workflow_executions_total Total number of workflow executions -# TYPE n8n_workflow_executions_total counter -n8n_workflow_executions_total{status="success",workflow_id="wf_8a3b2c1d"} 45 -n8n_workflow_executions_total{status="success",workflow_id="wf_7f4e9a2b"} 38 -n8n_workflow_executions_total{status="success",workflow_id="wf_5d6c8e1f"} 45 -n8n_workflow_executions_total{status="error",workflow_id="wf_8a3b2c1d"} 3 -n8n_workflow_executions_total{status="error",workflow_id="wf_5d6c8e1f"} 4 - -# HELP n8n_workflow_executions_duration_seconds 
Workflow execution duration in seconds -# TYPE n8n_workflow_executions_duration_seconds histogram -n8n_workflow_executions_duration_seconds_bucket{le="0.1",workflow_id="wf_8a3b2c1d"} 5 -n8n_workflow_executions_duration_seconds_bucket{le="1",workflow_id="wf_8a3b2c1d"} 18 -n8n_workflow_executions_duration_seconds_bucket{le="+Inf",workflow_id="wf_8a3b2c1d"} 48 -n8n_workflow_executions_duration_seconds_sum{workflow_id="wf_8a3b2c1d"} 14.3 -n8n_workflow_executions_duration_seconds_count{workflow_id="wf_8a3b2c1d"} 48 -n8n_workflow_executions_duration_seconds_bucket{le="0.1",workflow_id="wf_7f4e9a2b"} 4 -n8n_workflow_executions_duration_seconds_bucket{le="1",workflow_id="wf_7f4e9a2b"} 15 -n8n_workflow_executions_duration_seconds_bucket{le="+Inf",workflow_id="wf_7f4e9a2b"} 38 -n8n_workflow_executions_duration_seconds_sum{workflow_id="wf_7f4e9a2b"} 11.2 -n8n_workflow_executions_duration_seconds_count{workflow_id="wf_7f4e9a2b"} 38 -n8n_workflow_executions_duration_seconds_bucket{le="0.1",workflow_id="wf_5d6c8e1f"} 3 -n8n_workflow_executions_duration_seconds_bucket{le="1",workflow_id="wf_5d6c8e1f"} 12 -n8n_workflow_executions_duration_seconds_bucket{le="+Inf",workflow_id="wf_5d6c8e1f"} 49 -n8n_workflow_executions_duration_seconds_sum{workflow_id="wf_5d6c8e1f"} 12.7 -n8n_workflow_executions_duration_seconds_count{workflow_id="wf_5d6c8e1f"} 49 - -# HELP n8n_workflow_started_total Total number of workflows started -# TYPE n8n_workflow_started_total counter -n8n_workflow_started_total 25634 -n8n_workflow_started_total{workflow_id="12",workflow_name="CRM Sync"} 8142 -n8n_workflow_started_total{workflow_id="25",workflow_name="Webhook Intake"} 14290 -n8n_workflow_started_total{workflow_id="33",workflow_name="Slack Alerts"} 2202 +n8n_active_workflow_count 2 -# HELP n8n_workflow_success_total Total number of workflows completed successfully -# TYPE n8n_workflow_success_total counter -n8n_workflow_success_total 25209 -n8n_workflow_success_total{workflow_id="12",workflow_name="CRM Sync"} 
8059 -n8n_workflow_success_total{workflow_id="25",workflow_name="Webhook Intake"} 14135 -n8n_workflow_success_total{workflow_id="33",workflow_name="Slack Alerts"} 2015 +# HELP n8n_production_executions Total number of production workflow executions (success + error). +# TYPE n8n_production_executions gauge +n8n_production_executions 8 -# HELP n8n_workflow_failed_total Total number of workflows that failed -# TYPE n8n_workflow_failed_total counter -n8n_workflow_failed_total 425 -n8n_workflow_failed_total{workflow_id="12",workflow_name="CRM Sync"} 83 -n8n_workflow_failed_total{workflow_id="25",workflow_name="Webhook Intake"} 155 -n8n_workflow_failed_total{workflow_id="33",workflow_name="Slack Alerts"} 187 - - -# HELP n8n_queue_jobs_total Total number of queue jobs -# TYPE n8n_queue_jobs_total counter -n8n_queue_jobs_total{state="waiting"} 3 -n8n_queue_jobs_total{state="active"} 2 -n8n_queue_jobs_total{state="completed"} 148 -n8n_queue_jobs_total{state="failed"} 5 - -# HELP n8n_queue_jobs_duration_seconds Job duration in seconds -# TYPE n8n_queue_jobs_duration_seconds histogram -n8n_queue_jobs_duration_seconds_bucket{le="0.1"} 22 -n8n_queue_jobs_duration_seconds_bucket{le="1"} 84 -n8n_queue_jobs_duration_seconds_bucket{le="+Inf"} 150 -n8n_queue_jobs_duration_seconds_sum 44.8 -n8n_queue_jobs_duration_seconds_count 150 - -# HELP n8n_queue_job_waiting_total Number of jobs currently waiting in the queue -# TYPE n8n_queue_job_waiting_total gauge -n8n_queue_job_waiting_total{queue="default"} 3 - -# HELP n8n_queue_job_active_total Number of jobs currently being processed -# TYPE n8n_queue_job_active_total gauge -n8n_queue_job_active_total{queue="default"} 2 - -# HELP n8n_queue_job_completed_total Number of jobs completed successfully -# TYPE n8n_queue_job_completed_total counter -n8n_queue_job_completed_total{queue="default"} 15892 +# HELP n8n_production_root_executions Total number of production root workflow executions (excludes sub-workflows). 
+# TYPE n8n_production_root_executions gauge +n8n_production_root_executions 8 -# HELP n8n_queue_job_failed_total Number of jobs that have failed -# TYPE n8n_queue_job_failed_total counter -n8n_queue_job_failed_total{queue="default"} 47 +# HELP n8n_manual_executions Total number of manual workflow executions (success + error). +# TYPE n8n_manual_executions gauge +n8n_manual_executions 0 -# HELP n8n_queue_job_dequeued_total Number of jobs dequeued (picked up from queue) -# TYPE n8n_queue_job_dequeued_total counter -n8n_queue_job_dequeued_total{queue="default"} 15939 +# HELP n8n_enabled_users Total number of enabled users. +# TYPE n8n_enabled_users gauge +n8n_enabled_users 1 + +# HELP n8n_users Total number of users. +# TYPE n8n_users gauge +n8n_users 1 + +# HELP n8n_workflows Total number of workflows. +# TYPE n8n_workflows gauge +n8n_workflows 2 + +# HELP n8n_credentials Total number of credentials. +# TYPE n8n_credentials gauge +n8n_credentials 0 + +# HELP n8n_token_exchange_requests_total Total number of token exchange requests. +# TYPE n8n_token_exchange_requests_total counter +n8n_token_exchange_requests_total{result="success"} 0 +n8n_token_exchange_requests_total{result="failure"} 0 + +# HELP n8n_token_exchange_failures_total Total number of token exchange failures broken down by reason. +# TYPE n8n_token_exchange_failures_total counter +n8n_token_exchange_failures_total{reason="invalid_token"} 0 + +# HELP n8n_embed_login_requests_total Total number of embed login requests. +# TYPE n8n_embed_login_requests_total counter +n8n_embed_login_requests_total{result="success"} 0 +n8n_embed_login_requests_total{result="failure"} 0 + +# HELP n8n_embed_login_failures_total Total number of embed login failures broken down by reason. +# TYPE n8n_embed_login_failures_total counter +n8n_embed_login_failures_total{reason="unauthorized"} 0 + +# HELP n8n_token_exchange_jit_provisioning_total Total number of users JIT-provisioned via token exchange. 
+# TYPE n8n_token_exchange_jit_provisioning_total counter +n8n_token_exchange_jit_provisioning_total 0 + +# HELP n8n_token_exchange_identity_linked_total Total number of external identities linked to existing users via token exchange. +# TYPE n8n_token_exchange_identity_linked_total counter +n8n_token_exchange_identity_linked_total 0 -# HELP n8n_queue_job_enqueued_total Number of jobs added to the queue +# HELP n8n_audit_workflow_activated_total Total number of n8n.audit.workflow.activated events. +# TYPE n8n_audit_workflow_activated_total counter +n8n_audit_workflow_activated_total{workflow_id="testWorkflowOk"} 1 +n8n_audit_workflow_activated_total{workflow_id="testWorkflowFail"} 1 + +# HELP n8n_audit_workflow_archived_total Total number of n8n.audit.workflow.archived events. +# TYPE n8n_audit_workflow_archived_total counter +n8n_audit_workflow_archived_total{workflow_id="testWorkflowOk"} 1 + +# HELP n8n_audit_workflow_created_total Total number of n8n.audit.workflow.created events. +# TYPE n8n_audit_workflow_created_total counter +n8n_audit_workflow_created_total{workflow_id="testWorkflowOk"} 1 + +# HELP n8n_audit_workflow_deactivated_total Total number of n8n.audit.workflow.deactivated events. +# TYPE n8n_audit_workflow_deactivated_total counter +n8n_audit_workflow_deactivated_total{workflow_id="testWorkflowOk"} 1 + +# HELP n8n_audit_workflow_deleted_total Total number of n8n.audit.workflow.deleted events. +# TYPE n8n_audit_workflow_deleted_total counter +n8n_audit_workflow_deleted_total{workflow_id="testWorkflowOk"} 1 + +# HELP n8n_queue_job_enqueued_total Total number of n8n.queue.job.enqueued events. 
# TYPE n8n_queue_job_enqueued_total counter -n8n_queue_job_enqueued_total{queue="default"} 15670 - -# HELP n8n_queue_job_delayed_total Number of jobs scheduled to run later -# TYPE n8n_queue_job_delayed_total gauge -n8n_queue_job_delayed_total{queue="default"} 5 - -# HELP n8n_queue_job_waiting_duration_seconds Duration jobs spend waiting before being processed -# TYPE n8n_queue_job_waiting_duration_seconds histogram -n8n_queue_job_waiting_duration_seconds_bucket{queue="default",le="0.1"} 50 -n8n_queue_job_waiting_duration_seconds_bucket{queue="default",le="1"} 241 -n8n_queue_job_waiting_duration_seconds_bucket{queue="default",le="5"} 820 -n8n_queue_job_waiting_duration_seconds_bucket{queue="default",le="10"} 1105 -n8n_queue_job_waiting_duration_seconds_bucket{queue="default",le="30"} 1240 -n8n_queue_job_waiting_duration_seconds_bucket{queue="default",le="+Inf"} 1253 -n8n_queue_job_waiting_duration_seconds_sum{queue="default"} 450.32 -n8n_queue_job_waiting_duration_seconds_count{queue="default"} 1253 - -# HELP n8n_api_requests_total Total API requests -# TYPE n8n_api_requests_total counter -n8n_api_requests_total{method="GET",endpoint="/workflows"} 240 -n8n_api_requests_total{method="POST",endpoint="/executions"} 75 - -# HELP n8n_api_request_duration_seconds API request duration in seconds -# TYPE n8n_api_request_duration_seconds histogram -n8n_api_request_duration_seconds_bucket{le="0.1"} 90 -n8n_api_request_duration_seconds_bucket{le="1"} 120 -n8n_api_request_duration_seconds_bucket{le="+Inf"} 125 -n8n_api_request_duration_seconds_sum 15.3 -n8n_api_request_duration_seconds_count 125 - -# HELP n8n_cache_operations_total Total cache operations -# TYPE n8n_cache_operations_total counter -n8n_cache_operations_total{operation="get"} 1250 -n8n_cache_operations_total{operation="set"} 320 -n8n_cache_operations_total{operation="delete"} 10 - -# HELP n8n_cache_hits_total Cache hits -# TYPE n8n_cache_hits_total counter -n8n_cache_hits_total 1080 +n8n_queue_job_enqueued_total 
8 -# HELP n8n_cache_misses_total Cache misses -# TYPE n8n_cache_misses_total counter -n8n_cache_misses_total 170 - -# HELP n8n_cache_errors_total Cache errors -# TYPE n8n_cache_errors_total counter -n8n_cache_errors_total 0 - -# HELP n8n_cache_latency_seconds Cache operation latency in seconds -# TYPE n8n_cache_latency_seconds histogram -n8n_cache_latency_seconds_bucket{le="0.001"} 90 -n8n_cache_latency_seconds_bucket{le="0.01"} 240 -n8n_cache_latency_seconds_bucket{le="+Inf"} 260 -n8n_cache_latency_seconds_sum 1.42 -n8n_cache_latency_seconds_count 260 - -# HELP n8n_eventbus_events_total Total events published on the event bus -# TYPE n8n_eventbus_events_total counter -n8n_eventbus_events_total{event_type="workflowStarted"} 140 -n8n_eventbus_events_total{event_type="workflowCompleted"} 135 -n8n_eventbus_events_total{event_type="workflowFailed"} 5 - -# HELP n8n_eventbus_events_processed_total Total processed events -# TYPE n8n_eventbus_events_processed_total counter -n8n_eventbus_events_processed_total 138 - -# HELP n8n_eventbus_events_failed_total Total failed event processing -# TYPE n8n_eventbus_events_failed_total counter -n8n_eventbus_events_failed_total 2 - -# HELP n8n_eventbus_queue_size Current event queue size -# TYPE n8n_eventbus_queue_size gauge -n8n_eventbus_queue_size 1 - -# HELP n8n_eventbus_connections_total Active event bus backend connections -# TYPE n8n_eventbus_connections_total gauge -n8n_eventbus_connections_total 1 - -# HELP n8n_workflow_executions_active Number of active workflow executions -# TYPE n8n_workflow_executions_active gauge -n8n_workflow_executions_active 3 - -# HELP n8n_queue_job_attempts_total Total number of job attempts -# TYPE n8n_queue_job_attempts_total counter -n8n_queue_job_attempts_total{result="success"} 435 -n8n_queue_job_attempts_total{result="failed"} 12 - -# HELP n8n_workflow_started_total Total number of workflows started +# HELP n8n_workflow_started_total Total number of n8n.workflow.started events. 
# TYPE n8n_workflow_started_total counter -n8n_workflow_started_total 25634 -n8n_workflow_started_total{workflow_id="12",workflow_name="CRM Sync"} 8142 -n8n_workflow_started_total{workflow_id="25",workflow_name="Webhook Intake"} 14290 -n8n_workflow_started_total{workflow_id="33",workflow_name="Slack Alerts"} 2202 +n8n_workflow_started_total{workflow_id="testWorkflowOk"} 4 +n8n_workflow_started_total{workflow_id="testWorkflowFail"} 4 + +# HELP n8n_audit_workflow_executed_total Total number of n8n.audit.workflow.executed events. +# TYPE n8n_audit_workflow_executed_total counter +n8n_audit_workflow_executed_total{workflow_id="testWorkflowOk"} 4 +n8n_audit_workflow_executed_total{workflow_id="testWorkflowFail"} 4 + +# HELP n8n_audit_workflow_resumed_total Total number of n8n.audit.workflow.resumed events. +# TYPE n8n_audit_workflow_resumed_total counter +n8n_audit_workflow_resumed_total{workflow_id="testWorkflowOk"} 1 + +# HELP n8n_audit_workflow_unarchived_total Total number of n8n.audit.workflow.unarchived events. +# TYPE n8n_audit_workflow_unarchived_total counter +n8n_audit_workflow_unarchived_total{workflow_id="testWorkflowOk"} 1 + +# HELP n8n_audit_workflow_updated_total Total number of n8n.audit.workflow.updated events. +# TYPE n8n_audit_workflow_updated_total counter +n8n_audit_workflow_updated_total{workflow_id="testWorkflowOk"} 1 -# HELP n8n_workflow_success_total Total number of workflows completed successfully +# HELP n8n_audit_workflow_version_updated_total Total number of n8n.audit.workflow.version.updated events. +# TYPE n8n_audit_workflow_version_updated_total counter +n8n_audit_workflow_version_updated_total{workflow_id="testWorkflowOk"} 1 + +# HELP n8n_audit_workflow_waiting_total Total number of n8n.audit.workflow.waiting events. +# TYPE n8n_audit_workflow_waiting_total counter +n8n_audit_workflow_waiting_total{workflow_id="testWorkflowOk"} 1 + +# HELP n8n_workflow_success_total Total number of n8n.workflow.success events. 
# TYPE n8n_workflow_success_total counter -n8n_workflow_success_total 25209 -n8n_workflow_success_total{workflow_id="12",workflow_name="CRM Sync"} 8059 -n8n_workflow_success_total{workflow_id="25",workflow_name="Webhook Intake"} 14135 -n8n_workflow_success_total{workflow_id="33",workflow_name="Slack Alerts"} 2015 +n8n_workflow_success_total{workflow_id="testWorkflowOk"} 4 -# HELP n8n_workflow_failed_total Total number of workflows that failed +# HELP n8n_queue_job_completed_total Total number of n8n.queue.job.completed events. +# TYPE n8n_queue_job_completed_total counter +n8n_queue_job_completed_total 4 + +# HELP n8n_workflow_failed_total Total number of n8n.workflow.failed events. # TYPE n8n_workflow_failed_total counter -n8n_workflow_failed_total 425 -n8n_workflow_failed_total{workflow_id="12",workflow_name="CRM Sync"} 83 -n8n_workflow_failed_total{workflow_id="25",workflow_name="Webhook Intake"} 155 -n8n_workflow_failed_total{workflow_id="33",workflow_name="Slack Alerts"} 187 \ No newline at end of file +n8n_workflow_failed_total{workflow_id="testWorkflowFail"} 4 + +# HELP n8n_queue_job_failed_total Total number of n8n.queue.job.failed events. +# TYPE n8n_queue_job_failed_total counter +n8n_queue_job_failed_total 4 +# HELP n8n_queue_job_stalled_total Total number of n8n.queue.job.stalled events. +# TYPE n8n_queue_job_stalled_total counter +n8n_queue_job_stalled_total 1 +# HELP n8n_queue_job_dequeued_total Total number of n8n.queue.job.dequeued events. +# TYPE n8n_queue_job_dequeued_total counter +n8n_queue_job_dequeued_total 8 + +# HELP n8n_node_started_total Total number of n8n.node.started events. +# TYPE n8n_node_started_total counter +n8n_node_started_total{workflow_id="testWorkflowOk"} 8 +n8n_node_started_total{workflow_id="testWorkflowFail"} 8 + +# HELP n8n_node_finished_total Total number of n8n.node.finished events. 
+# TYPE n8n_node_finished_total counter +n8n_node_finished_total{workflow_id="testWorkflowOk"} 8 +n8n_node_finished_total{workflow_id="testWorkflowFail"} 8 + +# HELP n8n_runner_task_requested_total Total number of n8n.runner.task.requested events. +# TYPE n8n_runner_task_requested_total counter +n8n_runner_task_requested_total 4 diff --git a/n8n/tests/fixtures/n8n_custom.txt b/n8n/tests/fixtures/n8n_custom.txt index d06fa2589b0ba..26d3ee593f24c 100644 --- a/n8n/tests/fixtures/n8n_custom.txt +++ b/n8n/tests/fixtures/n8n_custom.txt @@ -1,34 +1,34 @@ # HELP test_process_cpu_user_seconds_total Total user CPU time spent in seconds. # TYPE test_process_cpu_user_seconds_total counter -test_process_cpu_user_seconds_total 8.298932999999998 +test_process_cpu_user_seconds_total 0.921656 # HELP test_process_cpu_system_seconds_total Total system CPU time spent in seconds. # TYPE test_process_cpu_system_seconds_total counter -test_process_cpu_system_seconds_total 3.1041119999999998 +test_process_cpu_system_seconds_total 0.157367 # HELP test_process_cpu_seconds_total Total user and system CPU time spent in seconds. # TYPE test_process_cpu_seconds_total counter -test_process_cpu_seconds_total 11.403044999999999 +test_process_cpu_seconds_total 1.0790229999999998 # HELP test_process_start_time_seconds Start time of the process since unix epoch in seconds. # TYPE test_process_start_time_seconds gauge -test_process_start_time_seconds 1761656578 +test_process_start_time_seconds 1778234580 # HELP test_process_resident_memory_bytes Resident memory size in bytes. # TYPE test_process_resident_memory_bytes gauge -test_process_resident_memory_bytes 245043200 +test_process_resident_memory_bytes 267681792 # HELP test_process_virtual_memory_bytes Virtual memory size in bytes. # TYPE test_process_virtual_memory_bytes gauge -test_process_virtual_memory_bytes 33656197120 +test_process_virtual_memory_bytes 18517532672 # HELP test_process_heap_bytes Process heap size in bytes. 
# TYPE test_process_heap_bytes gauge -test_process_heap_bytes 277200896 +test_process_heap_bytes 840728576 # HELP test_process_open_fds Number of open file descriptors. # TYPE test_process_open_fds gauge -test_process_open_fds 44 +test_process_open_fds 45 # HELP test_process_max_fds Maximum number of open file descriptors. # TYPE test_process_max_fds gauge @@ -36,59 +36,62 @@ test_process_max_fds 1048576 # HELP test_nodejs_eventloop_lag_seconds Lag of event loop in seconds. # TYPE test_nodejs_eventloop_lag_seconds gauge -test_nodejs_eventloop_lag_seconds 0.002765567 +test_nodejs_eventloop_lag_seconds 0.008676917 # HELP test_nodejs_eventloop_lag_min_seconds The minimum recorded event loop delay. # TYPE test_nodejs_eventloop_lag_min_seconds gauge -test_nodejs_eventloop_lag_min_seconds 0.010018816 +test_nodejs_eventloop_lag_min_seconds 0.006340608 # HELP test_nodejs_eventloop_lag_max_seconds The maximum recorded event loop delay. # TYPE test_nodejs_eventloop_lag_max_seconds gauge -test_nodejs_eventloop_lag_max_seconds 0.011239423 +test_nodejs_eventloop_lag_max_seconds 0.030228479 # HELP test_nodejs_eventloop_lag_mean_seconds The mean of the recorded event loop delays. # TYPE test_nodejs_eventloop_lag_mean_seconds gauge -test_nodejs_eventloop_lag_mean_seconds 0.010092521938958708 +test_nodejs_eventloop_lag_mean_seconds 0.012079332927643785 # HELP test_nodejs_eventloop_lag_stddev_seconds The standard deviation of the recorded event loop delays. # TYPE test_nodejs_eventloop_lag_stddev_seconds gauge -test_nodejs_eventloop_lag_stddev_seconds 0.00016945350643679045 +test_nodejs_eventloop_lag_stddev_seconds 0.0011467288819057616 # HELP test_nodejs_eventloop_lag_p50_seconds The 50th percentile of the recorded event loop delays. # TYPE test_nodejs_eventloop_lag_p50_seconds gauge -test_nodejs_eventloop_lag_p50_seconds 0.010067967 +test_nodejs_eventloop_lag_p50_seconds 0.012001279 # HELP test_nodejs_eventloop_lag_p90_seconds The 90th percentile of the recorded event loop delays. 
# TYPE test_nodejs_eventloop_lag_p90_seconds gauge -test_nodejs_eventloop_lag_p90_seconds 0.010067967 +test_nodejs_eventloop_lag_p90_seconds 0.013254655 # HELP test_nodejs_eventloop_lag_p99_seconds The 99th percentile of the recorded event loop delays. # TYPE test_nodejs_eventloop_lag_p99_seconds gauge -test_nodejs_eventloop_lag_p99_seconds 0.011124735 +test_nodejs_eventloop_lag_p99_seconds 0.014426111 # HELP test_nodejs_active_resources Number of active resources that are currently keeping the event loop alive, grouped by async resource type. # TYPE test_nodejs_active_resources gauge -test_nodejs_active_resources{type="PipeWrap"} 2 -test_nodejs_active_resources{type="TCPServerWrap"} 1 -test_nodejs_active_resources{type="TCPSocketWrap"} 1 -test_nodejs_active_resources{type="Timeout"} 13 +test_nodejs_active_resources{type="PipeWrap"} 5 +test_nodejs_active_resources{type="TCPServerWrap"} 2 +test_nodejs_active_resources{type="TCPSocketWrap"} 9 +test_nodejs_active_resources{type="ProcessWrap"} 1 +test_nodejs_active_resources{type="Timeout"} 20 test_nodejs_active_resources{type="Immediate"} 1 # HELP test_nodejs_active_resources_total Total number of active resources. # TYPE test_nodejs_active_resources_total gauge -test_nodejs_active_resources_total 18 +test_nodejs_active_resources_total 38 # HELP test_nodejs_active_handles Number of active libuv handles grouped by handle type. Every handle type is C++ class name. # TYPE test_nodejs_active_handles gauge -test_nodejs_active_handles{type="Socket"} 3 -test_nodejs_active_handles{type="Server"} 1 +test_nodejs_active_handles{type="Socket"} 14 +test_nodejs_active_handles{type="Server"} 2 +test_nodejs_active_handles{type="ChildProcess"} 1 # HELP test_nodejs_active_handles_total Total number of active handles. # TYPE test_nodejs_active_handles_total gauge -test_nodejs_active_handles_total 4 +test_nodejs_active_handles_total 17 # HELP test_nodejs_active_requests Number of active libuv requests grouped by request type. 
Every request type is C++ class name. # TYPE test_nodejs_active_requests gauge +test_nodejs_active_requests{type="FSReqCallback"} 1 # HELP test_nodejs_active_requests_total Total number of active requests. # TYPE test_nodejs_active_requests_total gauge @@ -96,81 +99,87 @@ test_nodejs_active_requests_total 0 # HELP test_nodejs_heap_size_total_bytes Process heap size from Node.js in bytes. # TYPE test_nodejs_heap_size_total_bytes gauge -test_nodejs_heap_size_total_bytes 142774272 +test_nodejs_heap_size_total_bytes 146391040 # HELP test_nodejs_heap_size_used_bytes Process heap size used from Node.js in bytes. # TYPE test_nodejs_heap_size_used_bytes gauge -test_nodejs_heap_size_used_bytes 136342632 +test_nodejs_heap_size_used_bytes 136336448 # HELP test_nodejs_external_memory_bytes Node.js external memory size in bytes. # TYPE test_nodejs_external_memory_bytes gauge -test_nodejs_external_memory_bytes 20824585 +test_nodejs_external_memory_bytes 20993559 # HELP test_nodejs_heap_space_size_total_bytes Process heap space size total from Node.js in bytes. 
# TYPE test_nodejs_heap_space_size_total_bytes gauge test_nodejs_heap_space_size_total_bytes{space="read_only"} 0 -test_nodejs_heap_space_size_total_bytes{space="new"} 1048576 -test_nodejs_heap_space_size_total_bytes{space="old"} 122208256 -test_nodejs_heap_space_size_total_bytes{space="code"} 4718592 +test_nodejs_heap_space_size_total_bytes{space="new"} 2097152 +test_nodejs_heap_space_size_total_bytes{space="old"} 116920320 +test_nodejs_heap_space_size_total_bytes{space="code"} 5505024 test_nodejs_heap_space_size_total_bytes{space="shared"} 0 -test_nodejs_heap_space_size_total_bytes{space="trusted"} 7643136 +test_nodejs_heap_space_size_total_bytes{space="trusted"} 11624448 +test_nodejs_heap_space_size_total_bytes{space="shared_trusted"} 0 test_nodejs_heap_space_size_total_bytes{space="new_large_object"} 0 -test_nodejs_heap_space_size_total_bytes{space="large_object"} 7000064 -test_nodejs_heap_space_size_total_bytes{space="code_large_object"} 155648 +test_nodejs_heap_space_size_total_bytes{space="large_object"} 9875456 +test_nodejs_heap_space_size_total_bytes{space="code_large_object"} 368640 test_nodejs_heap_space_size_total_bytes{space="shared_large_object"} 0 +test_nodejs_heap_space_size_total_bytes{space="shared_trusted_large_object"} 0 test_nodejs_heap_space_size_total_bytes{space="trusted_large_object"} 0 # HELP test_nodejs_heap_space_size_used_bytes Process heap space size used from Node.js in bytes. 
# TYPE test_nodejs_heap_space_size_used_bytes gauge test_nodejs_heap_space_size_used_bytes{space="read_only"} 0 -test_nodejs_heap_space_size_used_bytes{space="new"} 652896 -test_nodejs_heap_space_size_used_bytes{space="old"} 119347344 -test_nodejs_heap_space_size_used_bytes{space="code"} 4183424 +test_nodejs_heap_space_size_used_bytes{space="new"} 382808 +test_nodejs_heap_space_size_used_bytes{space="old"} 111099512 +test_nodejs_heap_space_size_used_bytes{space="code"} 4853344 test_nodejs_heap_space_size_used_bytes{space="shared"} 0 -test_nodejs_heap_space_size_used_bytes{space="trusted"} 5187192 +test_nodejs_heap_space_size_used_bytes{space="trusted"} 9839592 +test_nodejs_heap_space_size_used_bytes{space="shared_trusted"} 0 test_nodejs_heap_space_size_used_bytes{space="new_large_object"} 0 -test_nodejs_heap_space_size_used_bytes{space="large_object"} 6837144 -test_nodejs_heap_space_size_used_bytes{space="code_large_object"} 138432 +test_nodejs_heap_space_size_used_bytes{space="large_object"} 9806288 +test_nodejs_heap_space_size_used_bytes{space="code_large_object"} 361728 test_nodejs_heap_space_size_used_bytes{space="shared_large_object"} 0 +test_nodejs_heap_space_size_used_bytes{space="shared_trusted_large_object"} 0 test_nodejs_heap_space_size_used_bytes{space="trusted_large_object"} 0 # HELP test_nodejs_heap_space_size_available_bytes Process heap space size available from Node.js in bytes. 
# TYPE test_nodejs_heap_space_size_available_bytes gauge test_nodejs_heap_space_size_available_bytes{space="read_only"} 0 -test_nodejs_heap_space_size_available_bytes{space="new"} 378016 -test_nodejs_heap_space_size_available_bytes{space="old"} 430568 -test_nodejs_heap_space_size_available_bytes{space="code"} 239680 +test_nodejs_heap_space_size_available_bytes{space="new"} 665704 +test_nodejs_heap_space_size_available_bytes{space="old"} 5484264 +test_nodejs_heap_space_size_available_bytes{space="code"} 651008 test_nodejs_heap_space_size_available_bytes{space="shared"} 0 -test_nodejs_heap_space_size_available_bytes{space="trusted"} 2323072 +test_nodejs_heap_space_size_available_bytes{space="trusted"} 1771032 +test_nodejs_heap_space_size_available_bytes{space="shared_trusted"} 0 test_nodejs_heap_space_size_available_bytes{space="new_large_object"} 1048576 test_nodejs_heap_space_size_available_bytes{space="large_object"} 0 test_nodejs_heap_space_size_available_bytes{space="code_large_object"} 0 test_nodejs_heap_space_size_available_bytes{space="shared_large_object"} 0 +test_nodejs_heap_space_size_available_bytes{space="shared_trusted_large_object"} 0 test_nodejs_heap_space_size_available_bytes{space="trusted_large_object"} 0 # HELP test_nodejs_version_info Node.js version info. # TYPE test_nodejs_version_info gauge -test_nodejs_version_info{version="v22.18.0",major="22",minor="18",patch="0"} 1 +test_nodejs_version_info{version="v24.14.1",major="24",minor="14",patch="1"} 1 # HELP test_nodejs_gc_duration_seconds Garbage collection duration by kind, one of major, minor, incremental or weakcb. 
# TYPE test_nodejs_gc_duration_seconds histogram -test_nodejs_gc_duration_seconds_bucket{le="0.001",kind="minor"} 128 -test_nodejs_gc_duration_seconds_bucket{le="0.01",kind="minor"} 132 -test_nodejs_gc_duration_seconds_bucket{le="0.1",kind="minor"} 132 -test_nodejs_gc_duration_seconds_bucket{le="1",kind="minor"} 132 -test_nodejs_gc_duration_seconds_bucket{le="2",kind="minor"} 132 -test_nodejs_gc_duration_seconds_bucket{le="5",kind="minor"} 132 -test_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="minor"} 132 -test_nodejs_gc_duration_seconds_sum{kind="minor"} 0.09924478498101237 -test_nodejs_gc_duration_seconds_count{kind="minor"} 132 -test_nodejs_gc_duration_seconds_bucket{le="0.001",kind="incremental"} 1 +test_nodejs_gc_duration_seconds_bucket{le="0.001",kind="minor"} 0 +test_nodejs_gc_duration_seconds_bucket{le="0.01",kind="minor"} 2 +test_nodejs_gc_duration_seconds_bucket{le="0.1",kind="minor"} 2 +test_nodejs_gc_duration_seconds_bucket{le="1",kind="minor"} 2 +test_nodejs_gc_duration_seconds_bucket{le="2",kind="minor"} 2 +test_nodejs_gc_duration_seconds_bucket{le="5",kind="minor"} 2 +test_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="minor"} 2 +test_nodejs_gc_duration_seconds_sum{kind="minor"} 0.004925500000128522 +test_nodejs_gc_duration_seconds_count{kind="minor"} 2 +test_nodejs_gc_duration_seconds_bucket{le="0.001",kind="incremental"} 0 test_nodejs_gc_duration_seconds_bucket{le="0.01",kind="incremental"} 2 test_nodejs_gc_duration_seconds_bucket{le="0.1",kind="incremental"} 2 test_nodejs_gc_duration_seconds_bucket{le="1",kind="incremental"} 2 test_nodejs_gc_duration_seconds_bucket{le="2",kind="incremental"} 2 test_nodejs_gc_duration_seconds_bucket{le="5",kind="incremental"} 2 test_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="incremental"} 2 -test_nodejs_gc_duration_seconds_sum{kind="incremental"} 0.0022786640077829363 +test_nodejs_gc_duration_seconds_sum{kind="incremental"} 0.005939041999867186 
test_nodejs_gc_duration_seconds_count{kind="incremental"} 2 test_nodejs_gc_duration_seconds_bucket{le="0.001",kind="major"} 0 test_nodejs_gc_duration_seconds_bucket{le="0.01",kind="major"} 0 @@ -179,232 +188,251 @@ test_nodejs_gc_duration_seconds_bucket{le="1",kind="major"} 2 test_nodejs_gc_duration_seconds_bucket{le="2",kind="major"} 2 test_nodejs_gc_duration_seconds_bucket{le="5",kind="major"} 2 test_nodejs_gc_duration_seconds_bucket{le="+Inf",kind="major"} 2 -test_nodejs_gc_duration_seconds_sum{kind="major"} 0.1028408939987421 +test_nodejs_gc_duration_seconds_sum{kind="major"} 0.032123332999879496 test_nodejs_gc_duration_seconds_count{kind="major"} 2 +# HELP test_process_pss_bytes Proportional Set Size of the process in bytes. +# TYPE test_process_pss_bytes gauge +test_process_pss_bytes 220097536 + # HELP test_version_info n8n version info. # TYPE test_version_info gauge -test_version_info{version="v1.117.2",major="1",minor="117",patch="2"} 1 +test_version_info{version="v2.19.5",major="2",minor="19",patch="5"} 1 # HELP test_instance_role_leader Whether this main instance is the leader (1) or not (0). # TYPE test_instance_role_leader gauge test_instance_role_leader 1 +# HELP test_cache_hits_total Total number of cache hits. +# TYPE test_cache_hits_total counter +test_cache_hits_total 53 + +# HELP test_cache_misses_total Total number of cache misses. +# TYPE test_cache_misses_total counter +test_cache_misses_total 15 + +# HELP test_cache_updates_total Total number of cache updates. 
+# TYPE test_cache_updates_total counter +test_cache_updates_total 1 + # HELP test_http_request_duration_seconds duration histogram of http responses labeled with: status_code # TYPE test_http_request_duration_seconds histogram +test_http_request_duration_seconds_bucket{le="0.003"} 5 +test_http_request_duration_seconds_bucket{le="0.03"} 5 +test_http_request_duration_seconds_bucket{le="0.1"} 5 +test_http_request_duration_seconds_bucket{le="0.3"} 5 +test_http_request_duration_seconds_bucket{le="1.5"} 5 +test_http_request_duration_seconds_bucket{le="10"} 5 +test_http_request_duration_seconds_bucket{le="+Inf"} 5 +test_http_request_duration_seconds_sum 0.0018007910000000002 +test_http_request_duration_seconds_count 5 # HELP test_last_activity last instance activity (backend request) in Unix time (seconds). # TYPE test_last_activity gauge -test_last_activity 1761656582 +test_last_activity 1778234587 + +# HELP test_scaling_mode_queue_jobs_waiting Current number of enqueued jobs waiting for pickup in scaling mode. +# TYPE test_scaling_mode_queue_jobs_waiting gauge +test_scaling_mode_queue_jobs_waiting 0 + +# HELP test_scaling_mode_queue_jobs_active Current number of jobs being processed across all workers in scaling mode. +# TYPE test_scaling_mode_queue_jobs_active gauge +test_scaling_mode_queue_jobs_active 0 + +# HELP test_scaling_mode_queue_jobs_completed Total number of jobs completed across all workers in scaling mode since instance start. +# TYPE test_scaling_mode_queue_jobs_completed counter +test_scaling_mode_queue_jobs_completed 8 + +# HELP test_scaling_mode_queue_jobs_failed Total number of jobs failed across all workers in scaling mode since instance start. +# TYPE test_scaling_mode_queue_jobs_failed counter +test_scaling_mode_queue_jobs_failed 0 + +# HELP test_workflow_execution_duration_seconds Workflow execution duration in seconds. 
+# TYPE test_workflow_execution_duration_seconds histogram +test_workflow_execution_duration_seconds_bucket{le="0.005",status="success",mode="webhook",workflow_id="testWorkflowOk"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.01",status="success",mode="webhook",workflow_id="testWorkflowOk"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.025",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="0.05",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="0.1",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="0.25",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="0.5",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="1",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="2.5",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="5",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="10",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="30",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="60",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="120",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="300",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 
+test_workflow_execution_duration_seconds_bucket{le="600",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="+Inf",status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_sum{status="success",mode="webhook",workflow_id="testWorkflowOk"} 0.027999999999999997 +test_workflow_execution_duration_seconds_count{status="success",mode="webhook",workflow_id="testWorkflowOk"} 4 +test_workflow_execution_duration_seconds_bucket{le="0.005",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.01",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.025",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.05",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.1",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.25",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 3 +test_workflow_execution_duration_seconds_bucket{le="0.5",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="1",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="2.5",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="5",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="10",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="30",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 
+test_workflow_execution_duration_seconds_bucket{le="60",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="120",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="300",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="600",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_bucket{le="+Inf",status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 +test_workflow_execution_duration_seconds_sum{status="failed",mode="webhook",workflow_id="testWorkflowFail"} 0.405 +test_workflow_execution_duration_seconds_count{status="failed",mode="webhook",workflow_id="testWorkflowFail"} 4 # HELP test_active_workflow_count Total number of active workflows. # TYPE test_active_workflow_count gauge -test_active_workflow_count{workflow_id="wf_8a3b2c1d"} 0 -test_active_workflow_count{workflow_id="wf_7f4e9a2b"} 0 -test_active_workflow_count{workflow_id="wf_5d6c8e1f"} 0 - -# HELP test_nodejs_event_loop_lag_seconds Event loop lag in seconds -# TYPE test_nodejs_event_loop_lag_seconds gauge -test_nodejs_event_loop_lag_seconds 0.0035 - -# HELP test_nodejs_heap_total_bytes Total heap size allocated in bytes -# TYPE test_nodejs_heap_total_bytes gauge -test_nodejs_heap_total_bytes 73400320 - -# HELP test_nodejs_heap_used_bytes Heap memory used in bytes -# TYPE test_nodejs_heap_used_bytes gauge -test_nodejs_heap_used_bytes 51200000 - -# HELP test_workflow_executions_total Total number of workflow executions -# TYPE test_workflow_executions_total counter -test_workflow_executions_total{status="success",workflow_id="wf_8a3b2c1d"} 45 -test_workflow_executions_total{status="success",workflow_id="wf_7f4e9a2b"} 38 -test_workflow_executions_total{status="success",workflow_id="wf_5d6c8e1f"} 45 
-test_workflow_executions_total{status="error",workflow_id="wf_8a3b2c1d"} 3 -test_workflow_executions_total{status="error",workflow_id="wf_5d6c8e1f"} 4 - -# HELP test_workflow_executions_duration_seconds Workflow execution duration in seconds -# TYPE test_workflow_executions_duration_seconds histogram -test_workflow_executions_duration_seconds_bucket{le="0.1",workflow_id="wf_8a3b2c1d"} 5 -test_workflow_executions_duration_seconds_bucket{le="1",workflow_id="wf_8a3b2c1d"} 18 -test_workflow_executions_duration_seconds_bucket{le="+Inf",workflow_id="wf_8a3b2c1d"} 48 -test_workflow_executions_duration_seconds_sum{workflow_id="wf_8a3b2c1d"} 14.3 -test_workflow_executions_duration_seconds_count{workflow_id="wf_8a3b2c1d"} 48 -test_workflow_executions_duration_seconds_bucket{le="0.1",workflow_id="wf_7f4e9a2b"} 4 -test_workflow_executions_duration_seconds_bucket{le="1",workflow_id="wf_7f4e9a2b"} 15 -test_workflow_executions_duration_seconds_bucket{le="+Inf",workflow_id="wf_7f4e9a2b"} 38 -test_workflow_executions_duration_seconds_sum{workflow_id="wf_7f4e9a2b"} 11.2 -test_workflow_executions_duration_seconds_count{workflow_id="wf_7f4e9a2b"} 38 -test_workflow_executions_duration_seconds_bucket{le="0.1",workflow_id="wf_5d6c8e1f"} 3 -test_workflow_executions_duration_seconds_bucket{le="1",workflow_id="wf_5d6c8e1f"} 12 -test_workflow_executions_duration_seconds_bucket{le="+Inf",workflow_id="wf_5d6c8e1f"} 49 -test_workflow_executions_duration_seconds_sum{workflow_id="wf_5d6c8e1f"} 12.7 -test_workflow_executions_duration_seconds_count{workflow_id="wf_5d6c8e1f"} 49 - -# HELP test_workflow_started_total Total number of workflows started -# TYPE test_workflow_started_total counter -test_workflow_started_total 25634 -test_workflow_started_total{workflow_id="12",workflow_name="CRM Sync"} 8142 -test_workflow_started_total{workflow_id="25",workflow_name="Webhook Intake"} 14290 -test_workflow_started_total{workflow_id="33",workflow_name="Slack Alerts"} 2202 +test_active_workflow_count 2 -# 
HELP test_workflow_success_total Total number of workflows completed successfully -# TYPE test_workflow_success_total counter -test_workflow_success_total 25209 -test_workflow_success_total{workflow_id="12",workflow_name="CRM Sync"} 8059 -test_workflow_success_total{workflow_id="25",workflow_name="Webhook Intake"} 14135 -test_workflow_success_total{workflow_id="33",workflow_name="Slack Alerts"} 2015 +# HELP test_production_executions Total number of production workflow executions (success + error). +# TYPE test_production_executions gauge +test_production_executions 8 -# HELP test_workflow_failed_total Total number of workflows that failed -# TYPE test_workflow_failed_total counter -test_workflow_failed_total 425 -test_workflow_failed_total{workflow_id="12",workflow_name="CRM Sync"} 83 -test_workflow_failed_total{workflow_id="25",workflow_name="Webhook Intake"} 155 -test_workflow_failed_total{workflow_id="33",workflow_name="Slack Alerts"} 187 - - -# HELP test_queue_jobs_total Total number of queue jobs -# TYPE test_queue_jobs_total counter -test_queue_jobs_total{state="waiting"} 3 -test_queue_jobs_total{state="active"} 2 -test_queue_jobs_total{state="completed"} 148 -test_queue_jobs_total{state="failed"} 5 - -# HELP test_queue_jobs_duration_seconds Job duration in seconds -# TYPE test_queue_jobs_duration_seconds histogram -test_queue_jobs_duration_seconds_bucket{le="0.1"} 22 -test_queue_jobs_duration_seconds_bucket{le="1"} 84 -test_queue_jobs_duration_seconds_bucket{le="+Inf"} 150 -test_queue_jobs_duration_seconds_sum 44.8 -test_queue_jobs_duration_seconds_count 150 - -# HELP test_queue_job_waiting_total Number of jobs currently waiting in the queue -# TYPE test_queue_job_waiting_total gauge -test_queue_job_waiting_total{queue="default"} 3 - -# HELP test_queue_job_active_total Number of jobs currently being processed -# TYPE test_queue_job_active_total gauge -test_queue_job_active_total{queue="default"} 2 - -# HELP test_queue_job_completed_total Number of jobs 
completed successfully -# TYPE test_queue_job_completed_total counter -test_queue_job_completed_total{queue="default"} 15892 +# HELP test_production_root_executions Total number of production root workflow executions (excludes sub-workflows). +# TYPE test_production_root_executions gauge +test_production_root_executions 8 -# HELP test_queue_job_failed_total Number of jobs that have failed -# TYPE test_queue_job_failed_total counter -test_queue_job_failed_total{queue="default"} 47 +# HELP test_manual_executions Total number of manual workflow executions (success + error). +# TYPE test_manual_executions gauge +test_manual_executions 0 -# HELP test_queue_job_dequeued_total Number of jobs dequeued (picked up from queue) -# TYPE test_queue_job_dequeued_total counter -test_queue_job_dequeued_total{queue="default"} 15939 +# HELP test_enabled_users Total number of enabled users. +# TYPE test_enabled_users gauge +test_enabled_users 1 + +# HELP test_users Total number of users. +# TYPE test_users gauge +test_users 1 + +# HELP test_workflows Total number of workflows. +# TYPE test_workflows gauge +test_workflows 2 + +# HELP test_credentials Total number of credentials. +# TYPE test_credentials gauge +test_credentials 0 + +# HELP test_token_exchange_requests_total Total number of token exchange requests. +# TYPE test_token_exchange_requests_total counter +test_token_exchange_requests_total{result="success"} 0 +test_token_exchange_requests_total{result="failure"} 0 + +# HELP test_token_exchange_failures_total Total number of token exchange failures broken down by reason. +# TYPE test_token_exchange_failures_total counter +test_token_exchange_failures_total{reason="invalid_token"} 0 + +# HELP test_embed_login_requests_total Total number of embed login requests. 
+# TYPE test_embed_login_requests_total counter +test_embed_login_requests_total{result="success"} 0 +test_embed_login_requests_total{result="failure"} 0 + +# HELP test_embed_login_failures_total Total number of embed login failures broken down by reason. +# TYPE test_embed_login_failures_total counter +test_embed_login_failures_total{reason="unauthorized"} 0 + +# HELP test_token_exchange_jit_provisioning_total Total number of users JIT-provisioned via token exchange. +# TYPE test_token_exchange_jit_provisioning_total counter +test_token_exchange_jit_provisioning_total 0 + +# HELP test_token_exchange_identity_linked_total Total number of external identities linked to existing users via token exchange. +# TYPE test_token_exchange_identity_linked_total counter +test_token_exchange_identity_linked_total 0 -# HELP test_queue_job_enqueued_total Number of jobs added to the queue +# HELP test_audit_workflow_activated_total Total number of n8n.audit.workflow.activated events. +# TYPE test_audit_workflow_activated_total counter +test_audit_workflow_activated_total{workflow_id="testWorkflowOk"} 1 +test_audit_workflow_activated_total{workflow_id="testWorkflowFail"} 1 + +# HELP test_audit_workflow_archived_total Total number of n8n.audit.workflow.archived events. +# TYPE test_audit_workflow_archived_total counter +test_audit_workflow_archived_total{workflow_id="testWorkflowOk"} 1 + +# HELP test_audit_workflow_created_total Total number of n8n.audit.workflow.created events. +# TYPE test_audit_workflow_created_total counter +test_audit_workflow_created_total{workflow_id="testWorkflowOk"} 1 + +# HELP test_audit_workflow_deactivated_total Total number of n8n.audit.workflow.deactivated events. +# TYPE test_audit_workflow_deactivated_total counter +test_audit_workflow_deactivated_total{workflow_id="testWorkflowOk"} 1 + +# HELP test_audit_workflow_deleted_total Total number of n8n.audit.workflow.deleted events. 
+# TYPE test_audit_workflow_deleted_total counter +test_audit_workflow_deleted_total{workflow_id="testWorkflowOk"} 1 + +# HELP test_queue_job_enqueued_total Total number of n8n.queue.job.enqueued events. # TYPE test_queue_job_enqueued_total counter -test_queue_job_enqueued_total{queue="default"} 15670 - -# HELP test_queue_job_delayed_total Number of jobs scheduled to run later -# TYPE test_queue_job_delayed_total gauge -test_queue_job_delayed_total{queue="default"} 5 - -# HELP test_queue_job_waiting_duration_seconds Duration jobs spend waiting before being processed -# TYPE test_queue_job_waiting_duration_seconds histogram -test_queue_job_waiting_duration_seconds_bucket{queue="default",le="0.1"} 50 -test_queue_job_waiting_duration_seconds_bucket{queue="default",le="1"} 241 -test_queue_job_waiting_duration_seconds_bucket{queue="default",le="5"} 820 -test_queue_job_waiting_duration_seconds_bucket{queue="default",le="10"} 1105 -test_queue_job_waiting_duration_seconds_bucket{queue="default",le="30"} 1240 -test_queue_job_waiting_duration_seconds_bucket{queue="default",le="+Inf"} 1253 -test_queue_job_waiting_duration_seconds_sum{queue="default"} 450.32 -test_queue_job_waiting_duration_seconds_count{queue="default"} 1253 - -# HELP test_api_requests_total Total API requests -# TYPE test_api_requests_total counter -test_api_requests_total{method="GET",endpoint="/workflows"} 240 -test_api_requests_total{method="POST",endpoint="/executions"} 75 - -# HELP test_api_request_duration_seconds API request duration in seconds -# TYPE test_api_request_duration_seconds histogram -test_api_request_duration_seconds_bucket{le="0.1"} 90 -test_api_request_duration_seconds_bucket{le="1"} 120 -test_api_request_duration_seconds_bucket{le="+Inf"} 125 -test_api_request_duration_seconds_sum 15.3 -test_api_request_duration_seconds_count 125 - -# HELP test_cache_operations_total Total cache operations -# TYPE test_cache_operations_total counter -test_cache_operations_total{operation="get"} 1250 
-test_cache_operations_total{operation="set"} 320 -test_cache_operations_total{operation="delete"} 10 - -# HELP test_cache_hits_total Cache hits -# TYPE test_cache_hits_total counter -test_cache_hits_total 1080 +test_queue_job_enqueued_total 8 -# HELP test_cache_misses_total Cache misses -# TYPE test_cache_misses_total counter -test_cache_misses_total 170 - -# HELP test_cache_errors_total Cache errors -# TYPE test_cache_errors_total counter -test_cache_errors_total 0 - -# HELP test_cache_latency_seconds Cache operation latency in seconds -# TYPE test_cache_latency_seconds histogram -test_cache_latency_seconds_bucket{le="0.001"} 90 -test_cache_latency_seconds_bucket{le="0.01"} 240 -test_cache_latency_seconds_bucket{le="+Inf"} 260 -test_cache_latency_seconds_sum 1.42 -test_cache_latency_seconds_count 260 - -# HELP test_eventbus_events_total Total events published on the event bus -# TYPE test_eventbus_events_total counter -test_eventbus_events_total{event_type="workflowStarted"} 140 -test_eventbus_events_total{event_type="workflowCompleted"} 135 -test_eventbus_events_total{event_type="workflowFailed"} 5 - -# HELP test_eventbus_events_processed_total Total processed events -# TYPE test_eventbus_events_processed_total counter -test_eventbus_events_processed_total 138 - -# HELP test_eventbus_events_failed_total Total failed event processing -# TYPE test_eventbus_events_failed_total counter -test_eventbus_events_failed_total 2 - -# HELP test_eventbus_queue_size Current event queue size -# TYPE test_eventbus_queue_size gauge -test_eventbus_queue_size 1 - -# HELP test_eventbus_connections_total Active event bus backend connections -# TYPE test_eventbus_connections_total gauge -test_eventbus_connections_total 1 - -# HELP test_workflow_executions_active Number of active workflow executions -# TYPE test_workflow_executions_active gauge -test_workflow_executions_active 3 - -# HELP test_queue_job_attempts_total Total number of job attempts -# TYPE test_queue_job_attempts_total 
counter -test_queue_job_attempts_total{result="success"} 435 -test_queue_job_attempts_total{result="failed"} 12 - -# HELP test_workflow_started_total Total number of workflows started +# HELP test_workflow_started_total Total number of n8n.workflow.started events. # TYPE test_workflow_started_total counter -test_workflow_started_total 25634 -test_workflow_started_total{workflow_id="12",workflow_name="CRM Sync"} 8142 -test_workflow_started_total{workflow_id="25",workflow_name="Webhook Intake"} 14290 -test_workflow_started_total{workflow_id="33",workflow_name="Slack Alerts"} 2202 +test_workflow_started_total{workflow_id="testWorkflowOk"} 4 +test_workflow_started_total{workflow_id="testWorkflowFail"} 4 + +# HELP test_audit_workflow_executed_total Total number of n8n.audit.workflow.executed events. +# TYPE test_audit_workflow_executed_total counter +test_audit_workflow_executed_total{workflow_id="testWorkflowOk"} 4 +test_audit_workflow_executed_total{workflow_id="testWorkflowFail"} 4 + +# HELP test_audit_workflow_resumed_total Total number of n8n.audit.workflow.resumed events. +# TYPE test_audit_workflow_resumed_total counter +test_audit_workflow_resumed_total{workflow_id="testWorkflowOk"} 1 + +# HELP test_audit_workflow_unarchived_total Total number of n8n.audit.workflow.unarchived events. +# TYPE test_audit_workflow_unarchived_total counter +test_audit_workflow_unarchived_total{workflow_id="testWorkflowOk"} 1 -# HELP test_workflow_success_total Total number of workflows completed successfully +# HELP test_audit_workflow_updated_total Total number of n8n.audit.workflow.updated events. +# TYPE test_audit_workflow_updated_total counter +test_audit_workflow_updated_total{workflow_id="testWorkflowOk"} 1 + +# HELP test_audit_workflow_version_updated_total Total number of n8n.audit.workflow.version.updated events. 
+# TYPE test_audit_workflow_version_updated_total counter +test_audit_workflow_version_updated_total{workflow_id="testWorkflowOk"} 1 + +# HELP test_audit_workflow_waiting_total Total number of n8n.audit.workflow.waiting events. +# TYPE test_audit_workflow_waiting_total counter +test_audit_workflow_waiting_total{workflow_id="testWorkflowOk"} 1 + +# HELP test_workflow_success_total Total number of n8n.workflow.success events. # TYPE test_workflow_success_total counter -test_workflow_success_total 25209 -test_workflow_success_total{workflow_id="12",workflow_name="CRM Sync"} 8059 -test_workflow_success_total{workflow_id="25",workflow_name="Webhook Intake"} 14135 -test_workflow_success_total{workflow_id="33",workflow_name="Slack Alerts"} 2015 +test_workflow_success_total{workflow_id="testWorkflowOk"} 4 + +# HELP test_queue_job_completed_total Total number of n8n.queue.job.completed events. +# TYPE test_queue_job_completed_total counter +test_queue_job_completed_total 4 -# HELP test_workflow_failed_total Total number of workflows that failed +# HELP test_workflow_failed_total Total number of n8n.workflow.failed events. # TYPE test_workflow_failed_total counter -test_workflow_failed_total 425 -test_workflow_failed_total{workflow_id="12",workflow_name="CRM Sync"} 83 -test_workflow_failed_total{workflow_id="25",workflow_name="Webhook Intake"} 155 -test_workflow_failed_total{workflow_id="33",workflow_name="Slack Alerts"} 187 +test_workflow_failed_total{workflow_id="testWorkflowFail"} 4 + +# HELP test_queue_job_failed_total Total number of n8n.queue.job.failed events. +# TYPE test_queue_job_failed_total counter +test_queue_job_failed_total 4 +# HELP test_queue_job_stalled_total Total number of n8n.queue.job.stalled events. +# TYPE test_queue_job_stalled_total counter +test_queue_job_stalled_total 1 +# HELP test_queue_job_dequeued_total Total number of n8n.queue.job.dequeued events. 
+# TYPE test_queue_job_dequeued_total counter +test_queue_job_dequeued_total 8 + +# HELP test_node_started_total Total number of n8n.node.started events. +# TYPE test_node_started_total counter +test_node_started_total{workflow_id="testWorkflowOk"} 8 +test_node_started_total{workflow_id="testWorkflowFail"} 8 + +# HELP test_node_finished_total Total number of n8n.node.finished events. +# TYPE test_node_finished_total counter +test_node_finished_total{workflow_id="testWorkflowOk"} 8 +test_node_finished_total{workflow_id="testWorkflowFail"} 8 +# HELP test_runner_task_requested_total Total number of n8n.runner.task.requested events. +# TYPE test_runner_task_requested_total counter +test_runner_task_requested_total 4 diff --git a/n8n/tests/lab/README.md b/n8n/tests/lab/README.md new file mode 100644 index 0000000000000..493cde1b93056 --- /dev/null +++ b/n8n/tests/lab/README.md @@ -0,0 +1,91 @@ +# n8n integration lab + +A long-running n8n simulation that pushes real metrics to a Datadog org so you can iterate on dashboards, monitors, and customer reports against live data. + +It reuses the integration test environment (so you get queue mode, a worker, the full Datadog Agent) and layers on top: + +- five lab-only workflows with distinct shapes (fast, slow, always-fail, flaky, multi-step chain), and +- an async traffic generator that drives a configurable webhook + REST API mix and reloads its config on the fly. + +## Setup + +### Datadog credentials + +The lab uses a `.ddev.toml` in this directory (already committed) to point at an `n8nlab` ddev org. Add the matching entry to your global `~/.ddev/config.toml`: + +```toml +[orgs.n8nlab] +api_key = "" +site = "datadoghq.com" +``` + +Use any org name you like; just keep `org = "n8nlab"` in `tests/lab/.ddev.toml` aligned with what you put in your global config. + +### Traffic configuration + +`tests/lab/config.yaml` controls the traffic mix. 
Probabilities are independent draws per tick, and values above `1.0` mean "more than one call per tick on average": + +```yaml +webhook_probabilities: + /webhook/lab/fast: 0.9 # bulk traffic, fast histogram bucket + /webhook/lab/slow: 0.4 # populates higher histogram buckets + /webhook/lab/fail: 0.15 # populates workflow_failed + /webhook/lab/flaky: 0.5 # mixed success/failure + /webhook/lab/chain: 0.3 # 4 Set nodes -> 4x node.* events +api_probabilities: + /healthz: 1.0 + /healthz/readiness: 0.5 + /rest/login: 0.2 # 401s +tick_seconds: 1.0 +reload_interval: 5 +``` + +Edit this file while the lab is running and the generator will pick it up on the next `reload_interval` tick. + +## Usage + +### One-shot (recommended) + +```bash +./tests/lab/run_lab.sh # default env: py3.13-2 (n8n 2.19.5) +./tests/lab/run_lab.sh -e py3.13-1 # n8n 1.118.1 +``` + +The script brings up the env, imports & activates the lab workflows, restarts n8n so webhooks register, and starts the traffic generator. `Ctrl+C` triggers a `cleanup` trap that runs `lab:stop` to tear everything down. 
+ +### Individual hatch commands + +```bash +hatch run lab:start -e py3.13-2 # ddev env start + import lab workflows + restart +hatch run lab:generate # traffic loop (foreground; Ctrl+C to stop) +hatch run lab:stop -e py3.13-2 # ddev env stop +``` + +## What this exercises + +The lab is wired to populate every metric family the integration maps that does not require an SSO/embed flow: + +| Metric family | How the lab drives it | +| --- | --- | +| `n8n.workflow.started/.success/.failed.count` | every webhook hit goes through the EventBus | +| `n8n.workflow.execution.duration.seconds.*` (n8n 2.x) | the slow & chain workflows spread the histogram | +| `n8n.node.started/.finished.count` | the worker fires per-node events; the chain workflow yields 4× per call | +| `n8n.queue.job.enqueued/.dequeued/.completed/.failed.count` | queue mode is enabled in the test compose | +| `n8n.scaling.mode.queue.jobs.{active,waiting,completed,failed}` | main process tracks queue depth | +| `n8n.http.request.duration.seconds.*` | the API mix (`/healthz`, `/rest/login`) drives status code labels | +| `n8n.cache.hits/.misses/.updates.count` | cache traffic comes from n8n itself during execution | +| `n8n.last.activity` | refreshed on every API call | +| `n8n.{production,production.root,manual,enabled.users,users,workflows,credentials}.total` | enabled in the test compose via `N8N_METRICS_INCLUDE_WORKFLOW_STATISTICS` | + +What it does **not** exercise (these need extra infra and are documented in the README "Version-specific metrics" section): + +- `n8n.token.exchange.*` and `n8n.embed.login.*` — require an SSO IdP / embed integration. +- `n8n.audit.workflow.*` — fire on UI-driven activate/deactivate; not currently driven by the generator. Future iteration could call the n8n REST API to toggle workflow active state on a slow timer. + +## Stopping the lab + +`Ctrl+C` from `run_lab.sh` cleans up automatically. 
If you ran the hatch commands directly: + +```bash +hatch run lab:stop -e py3.13-2 +``` diff --git a/n8n/tests/lab/config.yaml b/n8n/tests/lab/config.yaml new file mode 100644 index 0000000000000..9e1e2e76706b2 --- /dev/null +++ b/n8n/tests/lab/config.yaml @@ -0,0 +1,25 @@ +# n8n lab traffic configuration. Edit this file while the lab is running and +# changes are picked up every `reload_interval` seconds. + +# Probability of hitting each webhook on every traffic tick. Independent draws — +# multiple endpoints can fire on the same tick. Values can exceed 1.0 to issue +# multiple invocations per tick (e.g. 2.5 = 2 calls + a 50% chance of a third). +webhook_probabilities: + /webhook/lab/fast: 0.9 # bulk of the workflow_started counter and HTTP histogram + /webhook/lab/slow: 0.4 # Wait node spreads execution-duration buckets + /webhook/lab/fail: 0.15 # populates workflow_failed and node_finished{status="failed"} + /webhook/lab/flaky: 0.5 # mixed success/failure, ~30% fail rate + /webhook/lab/chain: 0.3 # 4 Set nodes => 4x node.started/finished events per call + +# Probability of hitting each REST API endpoint per tick. Used to drive the +# http_request_duration_seconds histogram across status code labels. +api_probabilities: + /healthz: 1.0 + /healthz/readiness: 0.5 + /rest/login: 0.2 # 401s — useful for status_code label coverage + +# How long to sleep between traffic ticks (seconds). +tick_seconds: 1.0 + +# Reload this file every N seconds (live config). 
+reload_interval: 5 diff --git a/n8n/tests/lab/run_lab.sh b/n8n/tests/lab/run_lab.sh new file mode 100755 index 0000000000000..5e8d56e5ce525 --- /dev/null +++ b/n8n/tests/lab/run_lab.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -e + +ORIGINAL_DIR=$(pwd) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ENV="py3.13-2" + +while [[ $# -gt 0 ]]; do + case $1 in + -e|--env) + ENV="$2" + shift 2 + ;; + -h|--help) + echo "Usage: $0 [-e|--env ENV]" + echo "" + echo "Options:" + echo " -e, --env ENV ddev environment to use (default: py3.13-2)" + echo " -h, --help Show this help message" + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +cleanup() { + echo "" + echo "Cleaning up..." + cd "$SCRIPT_DIR" + hatch run lab:stop -e "$ENV" || true + cd "$ORIGINAL_DIR" + exit 0 +} + +# `lab:generate` runs through `hatch`, which traps SIGINT itself, so we +# install our own EXIT trap to make sure `lab:stop` always runs even on Ctrl+C. +trap cleanup EXIT + +cd "$SCRIPT_DIR" +hatch run lab:start -e "$ENV" + +echo "Starting traffic (Ctrl+C to stop)..." +hatch run lab:generate diff --git a/n8n/tests/lab/traffic_generator.py b/n8n/tests/lab/traffic_generator.py new file mode 100644 index 0000000000000..be73b6df6e58c --- /dev/null +++ b/n8n/tests/lab/traffic_generator.py @@ -0,0 +1,260 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +"""n8n lab traffic generator. + +Brings up the standard n8n test environment via ``ddev env start --base``, +imports a richer set of workflows than the integration tests use, activates +them, and then drives a continuous, configurable traffic mix against the +running container so a real Datadog Agent can ship the resulting metrics. 
+""" + +from __future__ import annotations + +import asyncio +import random +import signal +import subprocess +import sys +import time +from pathlib import Path +from typing import Any + +import click +import httpx +import yaml +from rich.console import Console +from rich.table import Table + +ConfigDict = dict[str, Any] +LAB_DIR = Path(__file__).resolve().parent +WORKFLOWS_DIR = LAB_DIR / "workflows" +CONFIG_PATH = LAB_DIR / "config.yaml" + +CONTAINER = "n8n-test" +MAIN_BASE_URL = "http://localhost:5678" + +# Stable IDs that match the workflow JSON files. Kept here to drive the +# import/activate/restart loop without re-parsing the JSON. +LAB_WORKFLOW_IDS: list[str] = [ + "labFastSuccess", + "labSlowSuccess", + "labAlwaysFail", + "labFlaky", + "labLongChain", +] + +shutdown_event = asyncio.Event() +current_config: ConfigDict = {} + + +def _load_config(path: Path) -> tuple[ConfigDict, str]: + try: + with open(path) as f: + data = yaml.safe_load(f) or {} + except FileNotFoundError: + return current_config, f"Config file {path} not found; using current values." + except yaml.YAMLError as exc: + return current_config, f"Failed to parse {path}: {exc}; using current values." + + if not isinstance(data, dict): + return current_config, f"{path} must be a mapping at the top level; using current values." 
+ + return data, "" + + +def _docker_exec(*cmd: str, check: bool = True) -> subprocess.CompletedProcess: + return subprocess.run( + ["docker", "exec", CONTAINER, *cmd], + check=check, + capture_output=True, + text=True, + ) + + +def _docker_cp(src: Path, dest: str) -> None: + subprocess.check_call(["docker", "cp", str(src), f"{CONTAINER}:{dest}"]) + + +def _wait_for_endpoint(url: str, *, timeout: int = 90) -> None: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + try: + if httpx.get(url, timeout=2).status_code == 200: + return + except httpx.RequestError: + pass + time.sleep(2) + raise RuntimeError(f"Endpoint {url} never became reachable") + + +def _import_lab_workflows(console: Console) -> None: + """Copy the lab workflow files into the running container, import & activate them.""" + console.print("[bold cyan]Copying lab workflows into the container...[/bold cyan]") + _docker_exec("mkdir", "-p", "/lab/workflows") + for path in sorted(WORKFLOWS_DIR.glob("*.json")): + _docker_cp(path, f"/lab/workflows/{path.name}") + + console.print("[bold cyan]Importing & activating lab workflows...[/bold cyan]") + for path in sorted(WORKFLOWS_DIR.glob("*.json")): + result = _docker_exec("n8n", "import:workflow", f"--input=/lab/workflows/{path.name}", check=False) + if result.returncode != 0: + console.print(f"[bold red]Failed to import {path.name}:[/bold red]\n{result.stdout}\n{result.stderr}") + sys.exit(1) + for wf_id in LAB_WORKFLOW_IDS: + _docker_exec("n8n", "update:workflow", f"--id={wf_id}", "--active=true") + + console.print("[bold cyan]Restarting n8n so webhooks register...[/bold cyan]") + subprocess.check_call( + ["docker", "compose", "-f", str(LAB_DIR.parent / "docker" / "docker-compose.yaml"), "restart", "n8n"] + ) + _wait_for_endpoint(f"{MAIN_BASE_URL}/healthz") + console.print("[bold green]Lab workflows are live.[/bold green]") + + +def _signal_handler(_sig, _frame) -> None: + shutdown_event.set() + + +def _print_row(console: Console, ts: 
str, scenario: str, target: str, status: str, latency_ms: str) -> None: + table = Table(show_header=False, box=None, show_edge=False) + table.add_column("Timestamp", style="dim", width=20) + table.add_column("Scenario", width=10) + table.add_column("Endpoint", width=28) + table.add_column("Status", justify="right", width=14) + table.add_column("Latency (ms)", justify="right", width=14) + table.add_row(ts, scenario, target, status, latency_ms) + console.print(table) + + +async def _hit(client: httpx.AsyncClient, console: Console, scenario: str, path: str) -> None: + url = f"{MAIN_BASE_URL}{path}" + ts = time.strftime("%H:%M:%S") + start = time.perf_counter() + try: + resp = await client.get(url, timeout=10.0) + latency_ms = f"{(time.perf_counter() - start) * 1000:.0f}" + style = "green" if 200 <= resp.status_code < 400 else "red" + _print_row(console, ts, scenario, path, f"[{style}]{resp.status_code}[/]", latency_ms) + except httpx.TimeoutException: + _print_row(console, ts, scenario, path, "[bold yellow]TIMEOUT[/]", "") + except httpx.RequestError as exc: + _print_row(console, ts, scenario, path, f"[bold red]ERR[/] {type(exc).__name__}", "") + + +def _draws(probability: float) -> int: + """Return the number of times an event should fire this tick. + + ``probability`` is interpreted as expected count: ``2.5`` => 2 firings + a + 50% chance of a third. Values <= 1 act like a single Bernoulli trial. 
+ """ + whole = int(probability) + fractional = probability - whole + extra = 1 if random.random() < fractional else 0 + return whole + extra + + +async def _config_reloader(path: Path, console: Console) -> None: + global current_config + while not shutdown_event.is_set(): + new_config, error = _load_config(path) + if error: + console.print(f"[bold yellow]{error}[/bold yellow]") + elif new_config != current_config: + current_config = new_config + console.print(f"[bold cyan]Reloaded config from {path}[/bold cyan]") + try: + await asyncio.wait_for(shutdown_event.wait(), timeout=float(current_config.get("reload_interval", 5))) + except asyncio.TimeoutError: + pass + + +async def _run_traffic(console: Console) -> None: + global current_config + current_config, error = _load_config(CONFIG_PATH) + if error: + console.print(f"[bold red]{error}[/bold red]") + sys.exit(1) + + console.print(f"[dim]Traffic config: {CONFIG_PATH}\nEdit it while the lab runs to change the mix.[/dim]\n") + + reloader = asyncio.create_task(_config_reloader(CONFIG_PATH, console)) + async with httpx.AsyncClient() as client: + try: + while not shutdown_event.is_set(): + tasks = [] + for path, probability in (current_config.get("webhook_probabilities") or {}).items(): + for _ in range(_draws(float(probability))): + tasks.append(_hit(client, console, "webhook", path)) + for path, probability in (current_config.get("api_probabilities") or {}).items(): + for _ in range(_draws(float(probability))): + tasks.append(_hit(client, console, "api", path)) + if tasks: + await asyncio.gather(*tasks, return_exceptions=True) + try: + await asyncio.wait_for( + shutdown_event.wait(), + timeout=float(current_config.get("tick_seconds", 1.0)), + ) + except asyncio.TimeoutError: + pass + finally: + reloader.cancel() + try: + await reloader + except asyncio.CancelledError: + pass + + +@click.group() +def cli() -> None: + """n8n traffic lab commands.""" + + +@cli.command() +@click.option("-e", "--env", default="py3.13-2", 
help="ddev env name to start (matches hatch matrix entry).") +def start(env: str) -> None: + """Bring up the n8n test environment + agent and import lab workflows on top.""" + console = Console() + console.print(f"[bold cyan]Starting environment {env} via ddev (this also starts the Agent)...[/bold cyan]") + rc = subprocess.call(["ddev", "env", "start", "n8n", "--base", env, "-e", "DD_LOGS_ENABLED=true"]) + if rc != 0: + console.print(f"[bold red]ddev env start failed (exit {rc})[/bold red]") + sys.exit(rc) + + _wait_for_endpoint(f"{MAIN_BASE_URL}/healthz") + _import_lab_workflows(console) + console.print( + "\n[bold green]Lab is up.[/bold green] " + "Run [bold]hatch run lab:generate[/bold] to start traffic, " + "[bold]hatch run lab:stop[/bold] to tear down." + ) + + +@cli.command() +def generate() -> None: + """Drive a continuous, configurable traffic mix against the running lab.""" + console = Console() + signal.signal(signal.SIGINT, _signal_handler) + signal.signal(signal.SIGTERM, _signal_handler) + try: + asyncio.run(_run_traffic(console)) + except KeyboardInterrupt: + console.print("\n[bold yellow]Traffic stopped.[/bold yellow]") + + +@cli.command() +@click.option("-e", "--env", default="py3.13-2", help="ddev env name to stop.") +def stop(env: str) -> None: + """Tear down the lab environment.""" + console = Console() + console.print(f"[bold cyan]Stopping environment {env}...[/bold cyan]") + rc = subprocess.call(["ddev", "env", "stop", "n8n", env]) + if rc != 0: + console.print(f"[bold red]ddev env stop failed (exit {rc})[/bold red]") + sys.exit(rc) + console.print("[bold green]Lab stopped.[/bold green]") + + +if __name__ == "__main__": + cli() diff --git a/n8n/tests/lab/workflows/lab_chain.json b/n8n/tests/lab/workflows/lab_chain.json new file mode 100644 index 0000000000000..27c8279d63dd1 --- /dev/null +++ b/n8n/tests/lab/workflows/lab_chain.json @@ -0,0 +1,82 @@ +{ + "id": "labLongChain", + "versionId": "10000000-0000-0000-0000-000000000005", + "name": "Lab 
Long Chain", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "lab/chain", + "responseMode": "lastNode", + "options": {} + }, + "id": "11111111-0000-0000-0000-000000000005", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "lab-chain-aaaa-bbbb-cccc-000000000005" + }, + { + "parameters": { + "assignments": { + "assignments": [{"id": "1", "name": "step", "value": "one", "type": "string"}] + }, + "options": {} + }, + "id": "22222222-0000-0000-0000-000000000005", + "name": "Step 1", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [460, 300] + }, + { + "parameters": { + "assignments": { + "assignments": [{"id": "2", "name": "step", "value": "two", "type": "string"}] + }, + "options": {} + }, + "id": "33333333-0000-0000-0000-000000000005", + "name": "Step 2", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [680, 300] + }, + { + "parameters": { + "assignments": { + "assignments": [{"id": "3", "name": "step", "value": "three", "type": "string"}] + }, + "options": {} + }, + "id": "44444444-0000-0000-0000-000000000005", + "name": "Step 3", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [900, 300] + }, + { + "parameters": { + "assignments": { + "assignments": [{"id": "4", "name": "scenario", "value": "chain", "type": "string"}] + }, + "options": {} + }, + "id": "55555555-0000-0000-0000-000000000005", + "name": "Step 4", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [1120, 300] + } + ], + "connections": { + "Webhook": {"main": [[{"node": "Step 1", "type": "main", "index": 0}]]}, + "Step 1": {"main": [[{"node": "Step 2", "type": "main", "index": 0}]]}, + "Step 2": {"main": [[{"node": "Step 3", "type": "main", "index": 0}]]}, + "Step 3": {"main": [[{"node": "Step 4", "type": "main", "index": 0}]]} + }, + "active": false, + "settings": {"executionOrder": "v1"}, + "pinData": {} +} diff --git 
a/n8n/tests/lab/workflows/lab_fail.json b/n8n/tests/lab/workflows/lab_fail.json new file mode 100644 index 0000000000000..327cb511ced65 --- /dev/null +++ b/n8n/tests/lab/workflows/lab_fail.json @@ -0,0 +1,38 @@ +{ + "id": "labAlwaysFail", + "versionId": "10000000-0000-0000-0000-000000000003", + "name": "Lab Always Fail", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "lab/fail", + "responseMode": "lastNode", + "options": {} + }, + "id": "11111111-0000-0000-0000-000000000003", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "lab-fail-aaaa-bbbb-cccc-000000000003" + }, + { + "parameters": { + "language": "javaScript", + "jsCode": "throw new Error('intentional lab failure');" + }, + "id": "22222222-0000-0000-0000-000000000003", + "name": "Code", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [460, 300] + } + ], + "connections": { + "Webhook": {"main": [[{"node": "Code", "type": "main", "index": 0}]]} + }, + "active": false, + "settings": {"executionOrder": "v1"}, + "pinData": {} +} diff --git a/n8n/tests/lab/workflows/lab_fast.json b/n8n/tests/lab/workflows/lab_fast.json new file mode 100644 index 0000000000000..ebada5057e67b --- /dev/null +++ b/n8n/tests/lab/workflows/lab_fast.json @@ -0,0 +1,42 @@ +{ + "id": "labFastSuccess", + "versionId": "10000000-0000-0000-0000-000000000001", + "name": "Lab Fast Success", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "lab/fast", + "responseMode": "lastNode", + "options": {} + }, + "id": "11111111-0000-0000-0000-000000000001", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "lab-fast-aaaa-bbbb-cccc-000000000001" + }, + { + "parameters": { + "assignments": { + "assignments": [ + {"id": "1", "name": "scenario", "value": "fast", "type": "string"} + ] + }, + "options": {} + }, + "id": "22222222-0000-0000-0000-000000000001", + "name": 
"Set", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [460, 300] + } + ], + "connections": { + "Webhook": {"main": [[{"node": "Set", "type": "main", "index": 0}]]} + }, + "active": false, + "settings": {"executionOrder": "v1"}, + "pinData": {} +} diff --git a/n8n/tests/lab/workflows/lab_flaky.json b/n8n/tests/lab/workflows/lab_flaky.json new file mode 100644 index 0000000000000..2485f7646ecd1 --- /dev/null +++ b/n8n/tests/lab/workflows/lab_flaky.json @@ -0,0 +1,38 @@ +{ + "id": "labFlaky", + "versionId": "10000000-0000-0000-0000-000000000004", + "name": "Lab Flaky", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "lab/flaky", + "responseMode": "lastNode", + "options": {} + }, + "id": "11111111-0000-0000-0000-000000000004", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "lab-flaky-aaaa-bbbb-cccc-000000000004" + }, + { + "parameters": { + "language": "javaScript", + "jsCode": "if (Math.random() < 0.3) { throw new Error('flaky lab failure'); } return [{json: {ok: true, scenario: 'flaky'}}];" + }, + "id": "22222222-0000-0000-0000-000000000004", + "name": "Code", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [460, 300] + } + ], + "connections": { + "Webhook": {"main": [[{"node": "Code", "type": "main", "index": 0}]]} + }, + "active": false, + "settings": {"executionOrder": "v1"}, + "pinData": {} +} diff --git a/n8n/tests/lab/workflows/lab_slow.json b/n8n/tests/lab/workflows/lab_slow.json new file mode 100644 index 0000000000000..b8adbbcd2d204 --- /dev/null +++ b/n8n/tests/lab/workflows/lab_slow.json @@ -0,0 +1,54 @@ +{ + "id": "labSlowSuccess", + "versionId": "10000000-0000-0000-0000-000000000002", + "name": "Lab Slow Success", + "nodes": [ + { + "parameters": { + "httpMethod": "GET", + "path": "lab/slow", + "responseMode": "lastNode", + "options": {} + }, + "id": "11111111-0000-0000-0000-000000000002", + "name": "Webhook", + "type": 
"n8n-nodes-base.webhook", + "typeVersion": 2, + "position": [240, 300], + "webhookId": "lab-slow-aaaa-bbbb-cccc-000000000002" + }, + { + "parameters": { + "amount": 500, + "unit": "ms" + }, + "id": "22222222-0000-0000-0000-000000000002", + "name": "Wait", + "type": "n8n-nodes-base.wait", + "typeVersion": 1, + "position": [460, 300] + }, + { + "parameters": { + "assignments": { + "assignments": [ + {"id": "1", "name": "scenario", "value": "slow", "type": "string"} + ] + }, + "options": {} + }, + "id": "33333333-0000-0000-0000-000000000002", + "name": "Set", + "type": "n8n-nodes-base.set", + "typeVersion": 3.4, + "position": [680, 300] + } + ], + "connections": { + "Webhook": {"main": [[{"node": "Wait", "type": "main", "index": 0}]]}, + "Wait": {"main": [[{"node": "Set", "type": "main", "index": 0}]]} + }, + "active": false, + "settings": {"executionOrder": "v1"}, + "pinData": {} +} diff --git a/n8n/tests/test_e2e.py b/n8n/tests/test_e2e.py index 2571135ebce6a..6075e820d5fa9 100644 --- a/n8n/tests/test_e2e.py +++ b/n8n/tests/test_e2e.py @@ -1,13 +1,29 @@ # (C) Datadog, Inc. 2026-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) +from typing import Any, Callable + +import pytest + from datadog_checks.dev.utils import assert_service_checks +from . import common + -def test_check_n8n_e2e(dd_agent_check, instance): - aggregator = dd_agent_check(instance, rate=True) +@pytest.mark.e2e +def test_check_n8n_e2e( + dd_agent_check: Callable[..., Any], +): + aggregator = dd_agent_check(rate=True) - # Assert the readiness check metric is present with status_code tag - aggregator.assert_metric('n8n.readiness.check', value=1, tags=["status_code:200"], at_least=1) + aggregator.assert_metric('n8n.readiness.check', value=1, tags=['status_code:200', 'n8n_process:main'], at_least=1) + # Worker also exposes /healthz/readiness via QUEUE_HEALTH_CHECK_ACTIVE on its own port. 
+ aggregator.assert_metric('n8n.readiness.check', value=1, tags=['status_code:200', 'n8n_process:worker'], at_least=1) + common.drop_rare_event_metrics(aggregator) + aggregator.assert_metrics_using_metadata( + common.get_all_metadata_metrics(exclude_rare=True), + check_submission_type=True, + check_symmetric_inclusion=True, + ) assert_service_checks(aggregator) diff --git a/n8n/tests/test_integration.py b/n8n/tests/test_integration.py new file mode 100644 index 0000000000000..965df76703ebd --- /dev/null +++ b/n8n/tests/test_integration.py @@ -0,0 +1,58 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +from typing import Any, Callable + +import pytest + +from datadog_checks.base.stubs.aggregator import AggregatorStub +from datadog_checks.n8n import N8nCheck + +from . import common + +pytestmark = [pytest.mark.usefixtures('dd_environment'), pytest.mark.integration] + + +def _run_check_twice(instance: dict[str, Any], dd_run_check: Callable[[N8nCheck], Any]) -> N8nCheck: + check = N8nCheck('n8n', {}, [instance]) + # First run primes any one-shot/cached metrics; the second exercises the steady state. 
+ dd_run_check(check) + dd_run_check(check) + return check + + +@pytest.fixture +def warmed_main( + instance: dict[str, Any], + dd_run_check: Callable[[N8nCheck], Any], + aggregator: AggregatorStub, +) -> N8nCheck: + return _run_check_twice(instance, dd_run_check) + + +@pytest.fixture +def warmed_both( + instance: dict[str, Any], + worker_instance: dict[str, Any], + dd_run_check: Callable[[N8nCheck], Any], + aggregator: AggregatorStub, +) -> AggregatorStub: + """Run the check against both the main and worker /metrics endpoints into one aggregator.""" + _run_check_twice(instance, dd_run_check) + _run_check_twice(worker_instance, dd_run_check) + return aggregator + + +def test_all_metadata_metrics_emitted(warmed_both: AggregatorStub): + """Across main and worker, every metadata metric for this n8n version is emitted.""" + common.drop_rare_event_metrics(warmed_both) + warmed_both.assert_metrics_using_metadata( + common.get_all_metadata_metrics(exclude_rare=True), + check_submission_type=True, + check_symmetric_inclusion=True, + ) + + +def test_readiness_check_metric(warmed_main: N8nCheck, aggregator: AggregatorStub): + aggregator.assert_metric('n8n.readiness.check', value=1, tags=['status_code:200', 'n8n_process:main'], at_least=1) diff --git a/n8n/tests/test_unit.py b/n8n/tests/test_unit.py index cc3b314428044..0e3ac92985289 100644 --- a/n8n/tests/test_unit.py +++ b/n8n/tests/test_unit.py @@ -2,91 +2,121 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) +from typing import Any, Callable from unittest import mock -from datadog_checks.dev.utils import get_metadata_metrics +import pytest +from requests.exceptions import ConnectionError + +from datadog_checks.base.stubs.aggregator import AggregatorStub +from datadog_checks.base.stubs.datadog_agent import DatadogAgentStub from datadog_checks.n8n import N8nCheck from . 
import common -def test_unit_metrics(dd_run_check, instance, aggregator, mock_http_response): +def test_check_emits_metrics_as_in_metadata( + dd_run_check: Callable[[N8nCheck], Any], + aggregator: AggregatorStub, + mock_http_response: Callable[..., Any], +): mock_http_response(file_path=common.get_fixture_path('n8n.txt')) + instance: dict[str, Any] = {'openmetrics_endpoint': 'http://localhost:5678/metrics'} check = N8nCheck('n8n', {}, [instance]) - dd_run_check(check) + with mock.patch.object(N8nCheck, '_check_n8n_readiness', return_value=None): + dd_run_check(check) - for metric in common.TEST_METRICS: - aggregator.assert_metric(metric) - aggregator.assert_all_metrics_covered() - aggregator.assert_metrics_using_metadata(get_metadata_metrics()) + aggregator.assert_metrics_using_metadata( + common.get_openmetrics_metadata_metrics(major=2), + check_submission_type=True, + check_symmetric_inclusion=True, + ) -def test_metrics_custom_prefx(dd_run_check, aggregator, mock_http_response): +def test_metrics_custom_prefix( + dd_run_check: Callable[[N8nCheck], Any], + aggregator: AggregatorStub, + mock_http_response: Callable[..., Any], +): mock_http_response(file_path=common.get_fixture_path('n8n_custom.txt')) - instance = { + instance: dict[str, Any] = { 'openmetrics_endpoint': 'http://localhost:5678/metrics', 'raw_metric_prefix': 'test_', } check = N8nCheck('n8n', {}, [instance]) - dd_run_check(check) + with mock.patch.object(N8nCheck, '_check_n8n_readiness', return_value=None): + dd_run_check(check) - for metric in common.TEST_METRICS: - aggregator.assert_metric(metric) - aggregator.assert_all_metrics_covered() - aggregator.assert_metrics_using_metadata(get_metadata_metrics()) + aggregator.assert_metrics_using_metadata( + common.get_openmetrics_metadata_metrics(major=2), + check_submission_type=True, + check_symmetric_inclusion=True, + ) -def test_readiness_check_ready(aggregator, instance): +@pytest.fixture +def initialized_check(instance: dict[str, Any]) -> N8nCheck: + 
check = N8nCheck('n8n', {}, [instance]) + check.load_configuration_models() + return check + + +@pytest.mark.parametrize( + 'status_code, expected_value', + [ + pytest.param(200, 1, id='ready'), + pytest.param(503, 0, id='not_ready'), + ], +) +def test_readiness_check( + aggregator: AggregatorStub, + initialized_check: N8nCheck, + status_code: int, + expected_value: int, +): with mock.patch( 'requests.Session.get', - return_value=mock.Mock(ok=True, status_code=200), + return_value=mock.Mock(ok=expected_value == 1, status_code=status_code), ): - check = N8nCheck('n8n', {}, [instance]) - check._check_n8n_readiness() + initialized_check._check_n8n_readiness() - # Assert metric value is 1 (ready) with status_code:200 tag - aggregator.assert_metric('n8n.readiness.check', value=1, tags=['status_code:200']) + aggregator.assert_metric( + 'n8n.readiness.check', + value=expected_value, + tags=['n8n_process:main', f'status_code:{status_code}'], + ) -def test_readiness_check_not_ready(aggregator, instance): - with mock.patch( - 'requests.Session.get', - return_value=mock.Mock(ok=False, status_code=503), - ): - check = N8nCheck('n8n', {}, [instance]) - check._check_n8n_readiness() +def test_readiness_check_unreachable(aggregator: AggregatorStub, initialized_check: N8nCheck): + with mock.patch('requests.Session.get', side_effect=ConnectionError('boom')): + initialized_check._check_n8n_readiness() - # Assert metric value is 0 (not ready) with status_code:503 tag - aggregator.assert_metric('n8n.readiness.check', value=0, tags=['status_code:503']) + aggregator.assert_metric('n8n.readiness.check', value=0, tags=['n8n_process:main', 'status_code:none']) -def test_readiness_check_no_status_code(aggregator, instance): - with mock.patch( - 'requests.Session.get', - return_value=mock.Mock(ok=False, status_code=None), - ): - check = N8nCheck('n8n', {}, [instance]) - check._check_n8n_readiness() - - # Assert metric value is 0 (not ready) with status_code:null tag - 
aggregator.assert_metric('n8n.readiness.check', value=0, tags=['status_code:null']) +def test_readiness_uses_endpoint_host_not_metrics_path(initialized_check: N8nCheck): + """The readiness endpoint must be derived from the host, not appended to /metrics.""" + expected = f'http://{common.HOST}:{common.MAIN_PORT}/healthz/readiness' + assert initialized_check._readiness_endpoint() == expected -def test_version_metadata(datadog_agent, dd_run_check, mock_http_response, instance): - """ - Test version metadata collection from Prometheus metrics - """ +def test_version_metadata( + datadog_agent: DatadogAgentStub, + dd_run_check: Callable[[N8nCheck], Any], + mock_http_response: Callable[..., Any], + instance: dict[str, Any], +): mock_http_response(file_path=common.get_fixture_path('n8n.txt')) check = N8nCheck('n8n', {}, [instance]) check.check_id = 'n8n_test' - dd_run_check(check) - # Version from fixture: n8n_version_info{version="v1.117.2",major="1",minor="117",patch="2"} 1 + with mock.patch.object(N8nCheck, '_check_n8n_readiness', return_value=None): + dd_run_check(check) version_metadata = { 'version.scheme': 'semver', - 'version.major': '1', - 'version.minor': '117', - 'version.patch': '2', - 'version.raw': 'v1.117.2', + 'version.major': '2', + 'version.minor': '19', + 'version.patch': '5', + 'version.raw': 'v2.19.5', } datadog_agent.assert_metadata('n8n_test', version_metadata)