diff --git a/.env.example b/.env.example index ff2bfb9..e65d331 100644 --- a/.env.example +++ b/.env.example @@ -5,6 +5,13 @@ # Do not leave this as 'changeme'. Use a strong unique password. # Grafana is bound to 127.0.0.1:3000 (loopback only); access requires # SSH/VPN tunnel or local browser — remote LAN access is intentionally blocked. +# +# This value is also required by `just import-dashboards`: the recipe +# authenticates to the Grafana API using GF_ADMIN_PASSWORD and will exit +# with an explicit "GF_ADMIN_PASSWORD is unset" error if missing. It must +# match the password Grafana was first started with — if you rotate it in +# `.env` you also need to update Grafana itself (or wipe `grafana_data` +# and restart) before `just import-dashboards` can authenticate again. GF_ADMIN_PASSWORD=changeme # Prometheus is bound to 127.0.0.1:9090 (loopback only). @@ -23,6 +30,13 @@ NATS_URL=http://172.24.0.1:8222 # NATS log directory on the host (Promtail mounts this read-only) NATS_LOG_DIR=/home/mvillmow/.local/share/nats +# Optional override for the `host` label Promtail attaches to every shipped +# log line. If unset, Promtail substitutes the container's $HOSTNAME (which is +# usually fine for single-host deployments). Override this when running on +# multiple hosts that share a Loki instance and you want a stable, human- +# readable label per host (e.g. PROMTAIL_HOST_LABEL=odysseus-prod). 
+# PROMTAIL_HOST_LABEL= + # Loki basic-auth credentials — used by scripts/gen-htpasswd.sh to generate secrets/htpasswd LOKI_AUTH_USER=loki LOKI_AUTH_PASSWORD=changeme diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 8aad371..8efae97 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -26,6 +26,8 @@ - [ ] No credentials or secrets in the diff - [ ] Existing metric names not renamed without dashboard update in this PR - [ ] Alert rule changes do not regress existing alerts +- [ ] `CHANGELOG.md` updated under `[Unreleased]` (skip only for chore-only + PRs that do not affect user-visible behaviour) ## Related Issues diff --git a/.github/workflows/_required.yml b/.github/workflows/_required.yml index 4d0cec5..454f513 100644 --- a/.github/workflows/_required.yml +++ b/.github/workflows/_required.yml @@ -265,6 +265,9 @@ jobs: integration-tests: name: integration-tests runs-on: ubuntu-24.04 + timeout-minutes: 10 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Validate docker image name formats @@ -316,6 +319,9 @@ jobs: security-dependency-scan: name: security/dependency-scan runs-on: ubuntu-24.04 + timeout-minutes: 15 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Trivy filesystem scan @@ -328,6 +334,9 @@ jobs: security-secrets-scan: name: security/secrets-scan runs-on: ubuntu-24.04 + timeout-minutes: 15 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: @@ -368,6 +377,9 @@ jobs: config-validate: name: config-validate runs-on: ubuntu-24.04 + timeout-minutes: 5 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Validate HCL files or fall back to YAML validation @@ -391,6 +403,9 @@ jobs: schema-validation: name: 
schema-validation runs-on: ubuntu-24.04 + timeout-minutes: 5 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Validate workflow YAML files against GitHub Actions schema @@ -403,6 +418,9 @@ jobs: deps-version-sync: name: deps/version-sync runs-on: ubuntu-24.04 + timeout-minutes: 5 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Ensure no :latest image tags in compose files @@ -420,6 +438,9 @@ jobs: atlas-dashboard: name: atlas/dashboard runs-on: ubuntu-24.04 + timeout-minutes: 10 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Set up Go diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1053477..09cee83 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,6 +11,14 @@ repos: - id: ruff args: ["--fix"] + # Markdown linting — enforces .markdownlint.yaml across every Markdown + # file in the repo, mirroring how the yamllint hook above enforces YAML + # quality. Refs #379. + - repo: https://github.com/DavidAnson/markdownlint-cli2 + rev: v0.13.0 + hooks: + - id: markdownlint-cli2 + - repo: local hooks: - id: bandit diff --git a/CLAUDE.md b/CLAUDE.md index fd9b056..1c2b556 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -15,13 +15,27 @@ tailing. It does NOT modify Agamemnon or any other HomericIntelligence service. 
| Service | Image | Purpose | |-----------------|--------------------------------|--------------------------------------------------------| | Prometheus | prom/prometheus:v2.54.1 | Scrape and store metrics | +| Alertmanager | prom/alertmanager:v0.32.1 | Route Prometheus alerts to receivers | | Loki | grafana/loki:3.1.2 | Store and query log streams | | loki-proxy | nginx:1.27-alpine | Basic-auth proxy in front of Loki | | Promtail | grafana/promtail:3.1.2 | Tail container logs and ship to Loki | | Grafana | grafana/grafana:11.2.2 | Visualize metrics and logs | | argus-exporter | built from exporter/ | Convert HomericIntelligence APIs to Prometheus metrics | -All services run on the `argus` Docker network and are managed via `docker-compose.yml`. +### Network topology (two-network design) + +The compose stack defines two Docker bridge networks: + +- **`argus`** — public-facing bridge that prometheus, alertmanager, loki-proxy, + promtail, grafana, and argus-exporter share. Anything that needs to talk to + other services in the stack lives here. +- **`loki-internal`** — `internal: true` bridge with no egress. Only `loki`, + `loki-proxy`, and `promtail` are attached. Loki is intentionally not on + `argus`, so the only path to reach it is via `loki-proxy` (which terminates + basic auth). Do not re-add `loki` to the `argus` network — that would let any + container hit Loki directly without auth. + +All services are managed via `docker-compose.yml`. ## Architecture @@ -52,9 +66,20 @@ to start without a `.env` file. | `NATS_URL` | `http://172.24.0.1:8222` | Yes | NATS monitoring API base URL | | `NATS_LOG_DIR` | `/home/mvillmow/.local/share/nats` | Yes | Host path to NATS log files (Promtail mounts this) | +Optional overrides (not required by `just start`): + +- `PROMTAIL_HOST_LABEL` — overrides the `host` label Promtail attaches to log + streams. Defaults to the container's `$HOSTNAME`. +- `CONTAINER_CMD` — runtime used by `scripts/backup.sh` and `scripts/restore.sh`. 
+ Defaults to `docker` (auto-promoted to `podman` if `podman-compose` is on + `$PATH`). Justfile recipes pass `CONTAINER_CMD={{container_cmd}}` + automatically, so you rarely need to set it by hand. + `172.20.0.1` / `172.24.0.1` are WSL2 host gateway addresses — they reach services running on the Windows host or in other WSL distros. Substitute Tailscale IPs for -cross-host deployments. +cross-host deployments. The `NATS_URL` gateway IP `172.24.0.1` differs from +the `172.20.0.1` used for Agamemnon/Nestor because NATS runs on a separate WSL +distro with its own bridge — the discrepancy is intentional, not a typo. ## Scrape Targets @@ -67,6 +92,58 @@ cross-host deployments. The exporter aggregates Agamemnon, Nestor, and NATS data and exposes them as Prometheus metrics on port 9100. +The `NATS_URL` env var (`http://172.24.0.1:8222`) addresses the host gateway, +which the exporter container uses to reach NATS on the WSL host. Prometheus, +in contrast, scrapes NATS at `localhost:8222` — that target is interpreted +*inside* the prometheus container (because both Prometheus and the NATS +host gateway resolve to the host's loopback there). The two addresses point +at the same NATS instance from different network namespaces. + +## Operator Notes + +These are easy-to-miss preconditions and runtime behaviours that operators +new to the stack frequently trip on: + +1. **Copy `.env.example` to `.env` first.** `just start` and `docker compose` + both load `.env`; without it Grafana silently falls back to its + built-in `admin:admin` credentials. +2. **`/tmp/hermes.log` must exist on the host before `just start`.** Promtail + bind-mounts the file. If it is missing, Docker creates an empty + *directory* at that path, which silently breaks the mount. Run + `touch /tmp/hermes.log` (or symlink to the real Hermes log) once per host. +3. 
**Loki proxy htpasswd is generated automatically.** `just start` depends + on `just gen-htpasswd`, which writes `configs/nginx/htpasswd` from + `LOKI_AUTH_USER`/`LOKI_AUTH_PASSWORD` in `.env`. To rotate the password, + update `LOKI_AUTH_PASSWORD` in `.env`, then run `just gen-htpasswd && just restart`. +4. **All host ports are loopback-only.** Prometheus (`127.0.0.1:9090`), + Grafana (`127.0.0.1:3001`), Alertmanager (`127.0.0.1:9093`), and the + exporter (`127.0.0.1:9100`) only accept connections from the host. To + reach them from another machine use an SSH tunnel + (`ssh -L 3001:localhost:3001 host`) or a Tailscale-encrypted route — the + stack intentionally does not expose unauthenticated metric/log endpoints + to the LAN. +5. **`just test-scrape` requires the stack to be running.** After the host + port for Prometheus was removed, `test-scrape` runs the query *inside* + the prometheus container via `docker exec`. Use `just debug-prometheus` + and `just debug-loki` for ad-hoc inspection (these wrappers exec into + the respective containers). +6. **`just backup` / `just restore` need a running compose project.** The + restore script calls `docker compose stop` to quiesce services before + replacing volume data; on a cold host with no containers, the stop is a + no-op and the script still runs, but operators should expect to bring + the stack up at least once before relying on restore. +7. **`jq` is unavailable on `win-64`.** Conda-forge does not ship a `jq` + package for Windows; tasks like `just test-scrape` that pipe through `jq` + will fail there. Windows contributors should install `jq` via `winget` or + `choco` and put it on `$PATH`. + +## Metric Catalog + +The full catalog of every metric the exporter emits — name, labels, and +semantics — lives at [`docs/metrics.md`](docs/metrics.md). Treat +`exporter/exporter.py`'s `_METRIC_HELP` dict as the source of truth and update +`docs/metrics.md` in the same commit when renaming or adding a metric. 
+ ## Metric Naming Conventions All HomericIntelligence-specific metrics follow the `hi_` prefix: @@ -79,14 +156,15 @@ All HomericIntelligence-specific metrics follow the `hi_` prefix: NATS metrics use the `nats_` prefix: - `nats_connections`, `nats_slow_consumers` — current state (gauges) -- `nats_in_msgs_total`, `nats_out_msgs_total` — cumulative counters +- `nats_in_msgs`, `nats_out_msgs`, `nats_in_bytes`, `nats_out_bytes` — + current rates from `/varz` (gauges; reset on NATS restart) - `nats_jetstream_*` — JetStream stats Exporter self-metrics use the `homeric_exporter_` prefix: - `homeric_exporter_scrape_duration_seconds` — last collect() wall time - `homeric_exporter_scrape_timestamp_seconds` — unix timestamp of last scrape -- `homeric_exporter_fetch_errors_total` — per-upstream fetch error counts +- `homeric_exporter_fetch_errors` — per-upstream fetch error counts (gauge; resets each scrape) All metrics include `# HELP` and `# TYPE` lines. @@ -176,6 +254,8 @@ just stop # docker compose down just status # docker compose ps just logs # docker compose logs -f just reload-prometheus # Send SIGHUP to Prometheus (hot-reload config) +just reload-alertmanager # POST /-/reload to Alertmanager (hot-reload config) +just test-alertmanager # Check Alertmanager /-/healthy and cluster status just test-scrape # Query Prometheus /api/v1/query?query=up just import-dashboards # POST each dashboard JSON to Grafana API just scrape-agamemnon # Manually test Agamemnon and Nestor health endpoints @@ -183,6 +263,8 @@ just test # Run pytest unit tests just backup # Back up data volumes to ./backups/ ``` +See `AGENTS.md` for the multi-agent coordination protocol used in this repo. + ## AI Agent Collaboration Notes - This is a **config-only / infrastructure** repository. There is no application code to compile. 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 755a66f..bcc74ec 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -246,6 +246,17 @@ All documentation files must follow these standards: - Lists must be surrounded by blank lines - Headings must be surrounded by blank lines +## Dependency Updates (Renovate) + +Dependency-update automation is configured via `renovate.json` in the repo root, +but the config is **inert until the Renovate GitHub App is installed at the +org or repo level**. If you are not seeing +Renovate PRs against ProjectArgus, that almost certainly means the App is not +installed — check the GitHub App settings on the +[HomericIntelligence org](https://github.com/HomericIntelligence) and grant it +access to ProjectArgus. Once installed, Renovate will pick up the existing +`renovate.json` automatically; no further repo-side action is needed. + ## Reporting Issues ### Bug Reports diff --git a/configs/prometheus.yml b/configs/prometheus.yml index 003f7c4..3a6d7b6 100644 --- a/configs/prometheus.yml +++ b/configs/prometheus.yml @@ -49,3 +49,12 @@ scrape_configs: static_configs: - targets: ['argus-dashboard:3002'] metrics_path: /metrics + +# NOTE: Promtail (port 9080) is intentionally NOT scraped here. Promtail +# exposes useful self-metrics (lines ingested, target counts, push errors) +# but they cardinality-multiply per tailed file/job and we keep this +# Prometheus instance focused on the service-level signals from +# homeric-exporter and the upstreams above. If you need Promtail metrics +# during an incident, add an ad-hoc scrape via `just debug-prometheus` +# or stand up a sidecar Prometheus pointed at `promtail:9080`. Re-enable +# here only after a cardinality budget is documented in CLAUDE.md. 
diff --git a/docs/metrics.md b/docs/metrics.md new file mode 100644 index 0000000..d673906 --- /dev/null +++ b/docs/metrics.md @@ -0,0 +1,53 @@ +# Argus Exporter — Metric Catalog + +Canonical reference for every Prometheus metric exposed by `homeric-exporter` +on `:9100/metrics`. Source of truth is the `_METRIC_HELP` dict in +[`exporter/exporter.py`](../exporter/exporter.py); update this file in the +same commit when you add or rename a metric. + +All metrics are emitted as `# TYPE … gauge` (the exporter does not currently +maintain monotonic counter state across scrapes — every value is computed +fresh per scrape). Names that historically carried a `_total` suffix have been +renamed; do not reintroduce the suffix. + +## HomericIntelligence-specific metrics (`hi_` prefix) + +| Metric | Labels | Description | +| --- | --- | --- | +| `hi_agamemnon_health` | — | `1` if `GET ${AGAMEMNON_URL}/v1/health` returned 200, else `0`. | +| `hi_agents_total` | — | Total number of agents registered in Agamemnon. | +| `hi_agents_online` | — | Number of agents with `status=online`. | +| `hi_agents_offline` | — | Number of agents with any non-online status. | +| `hi_agent_online` | `name`, `host`, `program` | `1` if this individual agent is online, else `0`. | +| `hi_tasks_total` | — | Total number of tasks known to Agamemnon. | +| `hi_tasks_by_status` | `status` | Task count partitioned by status label. | +| `hi_nestor_health` | — | `1` if `GET ${NESTOR_URL}/v1/health` returned 200, else `0`. | +| `hi_nestor_research_active` | — | Active research jobs reported by Nestor `/v1/research/stats`. | +| `hi_nestor_research_completed` | — | Completed research jobs reported by Nestor `/v1/research/stats`. | +| `hi_nestor_research_pending` | — | Pending research jobs reported by Nestor `/v1/research/stats`. | + +## NATS metrics (`nats_` prefix) + +Sourced from NATS `/varz` and `/jsz` monitoring endpoints. Values reset on +NATS server restart, so these are gauges (no `_total` suffix). 
+ +| Metric | Description | +| --- | --- | +| `nats_connections` | Current client connections. | +| `nats_in_msgs` | Inbound message rate. | +| `nats_out_msgs` | Outbound message rate. | +| `nats_in_bytes` | Inbound bytes rate. | +| `nats_out_bytes` | Outbound bytes rate. | +| `nats_slow_consumers` | Current slow-consumer connections. | +| `nats_jetstream_streams` | Number of JetStream streams. | +| `nats_jetstream_consumers` | Number of JetStream consumers. | +| `nats_jetstream_messages` | Total messages stored across all JetStream streams. | +| `nats_jetstream_bytes` | Total bytes stored across all JetStream streams. | + +## Exporter self-metrics (`homeric_exporter_` prefix) + +| Metric | Labels | Description | +| --- | --- | --- | +| `homeric_exporter_scrape_timestamp_seconds` | — | Unix timestamp (seconds) when the last scrape completed. | +| `homeric_exporter_scrape_duration_seconds` | — | Wall-clock seconds spent in the last `collect()` call. | +| `homeric_exporter_fetch_errors` | `upstream` (`agamemnon`/`nestor`/`nats`) | Per-scrape count of upstream fetch failures. Gauge — resets each collection cycle. 
| diff --git a/exporter/exporter.py b/exporter/exporter.py index 7f1a052..5ecc4cb 100644 --- a/exporter/exporter.py +++ b/exporter/exporter.py @@ -95,7 +95,7 @@ def _health_check(url: str, ca_file: Optional[str] = None) -> int: "nats_jetstream_bytes": "Bytes stored in JetStream", "homeric_exporter_scrape_timestamp_seconds": "Unix timestamp (seconds) when the last scrape completed", "homeric_exporter_scrape_duration_seconds": "Wall-clock seconds spent in the last collect() call", - "homeric_exporter_fetch_errors_total": "Number of upstream fetch failures per scrape, by upstream", + "homeric_exporter_fetch_errors": "Number of upstream fetch failures per scrape, by upstream", } @@ -198,7 +198,7 @@ def gauge(name: str, help: str, value: float | int, labels: dict | None = None) gauge("homeric_exporter_scrape_timestamp_seconds", "Unix timestamp (seconds) when the last scrape completed", time.time()) gauge("homeric_exporter_scrape_duration_seconds", "Duration in seconds of the last upstream scrape cycle", time.time() - start) for upstream, count in fetch_errors.items(): - gauge("homeric_exporter_fetch_errors_total", "Number of fetch failures per upstream service", count, {"upstream": upstream}) + gauge("homeric_exporter_fetch_errors", "Number of fetch failures per upstream service", count, {"upstream": upstream}) return "\n".join(lines) + "\n" diff --git a/justfile b/justfile index bc88ab2..a87c616 100644 --- a/justfile +++ b/justfile @@ -1,3 +1,13 @@ +# `set dotenv-load` sources `.env` from the current directory before any +# recipe runs — every variable in `.env` is exported to the recipe's process. +# Several recipes depend on this; in particular `import-dashboards` reads +# `GF_ADMIN_PASSWORD` to authenticate to Grafana, and `just start` relies on +# docker-compose seeing the same `.env`. +# +# Required env vars (set in `.env`; see `.env.example` for the canonical list): +# GF_ADMIN_PASSWORD Grafana admin password. 
The fallback below is "admin" +# only so `just --list` works without `.env`; production +# deployments MUST override this. set dotenv-load # === Variables === diff --git a/pixi.toml b/pixi.toml index 02298f8..7e3a8f9 100644 --- a/pixi.toml +++ b/pixi.toml @@ -12,7 +12,7 @@ ruff = ">=0.4,<1" bandit = ">=1.7,<2" pyyaml = ">=6.0,<7" pytest = ">=8.0" -pytest-cov = ">=5.0" +pytest-cov = ">=5.0,<8" python = ">=3.11,<3.14" [target.linux-64.dependencies] diff --git a/scripts/check-version-consistency.py b/scripts/check-version-consistency.py index 62fd0a0..31ebdf4 100755 --- a/scripts/check-version-consistency.py +++ b/scripts/check-version-consistency.py @@ -15,9 +15,16 @@ import re import sys -import tomllib from pathlib import Path +try: + import tomllib # Python 3.11+ stdlib +except ModuleNotFoundError: # pragma: no cover - exercised on Python < 3.11 + # tomli is the upstream of stdlib tomllib; install it via the pre-commit + # hook's `additional_dependencies`, or `pip install tomli`, when the + # interpreter pre-commit picks up is older than 3.11. 
+ import tomli as tomllib # type: ignore[no-redef] + def load_version(pixi_toml: Path) -> str: """Return the version string from pixi.toml [project].version or [workspace].version.""" @@ -39,7 +46,7 @@ def versioned_sections(changelog: Path) -> list[str]: def has_versioned_header(version: str, changelog: Path) -> bool: """Return True if CHANGELOG contains a ## [<version>] section header.""" pattern = re.compile(rf"^## \[{re.escape(version)}\]", re.MULTILINE) - return not pattern.search(changelog.read_text()) is None # noqa: SIM103 + return pattern.search(changelog.read_text()) is not None # noqa: SIM103 def check(repo_root: Path) -> int: diff --git a/tests/test_configs.py b/tests/test_configs.py index 0c18375..a0516a8 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -206,6 +206,17 @@ class TestDockerComposeNetworkIsolation(unittest.TestCase): class TestDockerComposePortBindings(unittest.TestCase): """Assert no service port is bound to 0.0.0.0 (all-interfaces).""" + # Only the loopback address is permitted as a host port binding. + # Rationale: every Argus service exposes either metrics, dashboards, or + # log endpoints that we deliberately do NOT publish to the LAN — remote + # access goes via SSH tunnel or Tailscale (see CLAUDE.md "Operator + # Notes"). Binding to 0.0.0.0 (or any non-loopback address) would expose + # unauthenticated /metrics, /readyz, etc. to anyone on the same network. + # + # To add an exception (a service legitimately designed for LAN + # discovery), open a tracking issue, document the threat model in + # docker-compose.yml, and extend this set in the same PR — never bypass + # the test silently. ALLOWED_BINDINGS = {"127.0.0.1"} def setUp(self) -> None: @@ -264,6 +275,17 @@ def test_loki_datasource_url_uses_proxy(self) -> None: class TestDockerComposePorts(unittest.TestCase): + # Only the loopback address is permitted as a host port binding. 
+ # Rationale: every Argus service exposes either metrics, dashboards, or + # log endpoints that we deliberately do NOT publish to the LAN — remote + # access goes via SSH tunnel or Tailscale (see CLAUDE.md "Operator + # Notes"). Binding to 0.0.0.0 (or any non-loopback address) would expose + # unauthenticated /metrics, /readyz, etc. to anyone on the same network. + # + # To add an exception (a service legitimately designed for LAN + # discovery), open a tracking issue, document the threat model in + # docker-compose.yml, and extend this set in the same PR — never bypass + # the test silently. ALLOWED_BINDINGS = {"127.0.0.1"} def setUp(self) -> None: diff --git a/tests/test_dockerfile_constraints.py b/tests/test_dockerfile_constraints.py index ff9fc06..279e93c 100644 --- a/tests/test_dockerfile_constraints.py +++ b/tests/test_dockerfile_constraints.py @@ -11,7 +11,14 @@ DOCKERFILE = REPO_ROOT / "exporter" / "Dockerfile" _MIN_VERSION = (3, 11) -_MAX_VERSION = (3, 12) +# Approved Python version ceiling. Advance this only after the next CPython +# release is GA (https://devguide.python.org/versions/) AND has been +# manually verified to build the exporter image and pass the full test +# suite. Process: bump _MAX_VERSION in the same PR that bumps the FROM +# line in exporter/Dockerfile so the test stays a single source of truth. +# Python 3.13 reached GA on 2024-10-07; widen the ceiling to (3, 13) so +# a manual base-image bump doesn't trip the regression test. 
+_MAX_VERSION = (3, 13) class TestDockerfileConstraints(unittest.TestCase): diff --git a/tests/test_exporter.py b/tests/test_exporter.py index 2b3dd77..46d6e94 100644 --- a/tests/test_exporter.py +++ b/tests/test_exporter.py @@ -268,7 +268,9 @@ def test_task_total_correct(self): def test_exporter_self_metrics_present(self): self.assertIn("homeric_exporter_scrape_duration_seconds", self.output) self.assertIn("homeric_exporter_scrape_timestamp_seconds", self.output) - self.assertIn("homeric_exporter_fetch_errors_total", self.output) + self.assertIn("homeric_exporter_fetch_errors", self.output) + # Must not carry the _total counter suffix (gauge, not counter) + self.assertNotIn("homeric_exporter_fetch_errors_total", self.output) def test_nats_msg_metrics_use_gauge_names_not_total(self): """nats_in_msgs and nats_out_msgs must not carry the _total counter suffix.""" @@ -476,7 +478,7 @@ def test_all_upstreams_down_still_has_help(self): "hi_nestor_health", "homeric_exporter_scrape_timestamp_seconds", "homeric_exporter_scrape_duration_seconds", - "homeric_exporter_fetch_errors_total", + "homeric_exporter_fetch_errors", ] for name in always_present: self.assertIn(name, headers, f"Metric '{name}' missing from output")