From 233d31342a18e8ab8475d7eda3e4352cd8829a64 Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:40:16 -0700 Subject: [PATCH 01/18] fix(exporter): rename homeric_exporter_fetch_errors_total to homeric_exporter_fetch_errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The metric is a per-scrape gauge (resets each collection cycle), not a monotonic counter. The _total suffix was misleading tooling that auto-detects type from the suffix. Renames the metric, adds an assertion that the old name no longer appears, and refreshes the CLAUDE.md metric reference (also fixes the stale nats_*_total entries documented under cumulative counters — those are emitted as gauges from /varz). Closes #430 --- CLAUDE.md | 4 ++-- exporter/exporter.py | 4 ++-- tests/test_exporter.py | 6 ++++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index fd9b056..e7cb391 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -79,14 +79,14 @@ All HomericIntelligence-specific metrics follow the `hi_` prefix: NATS metrics use the `nats_` prefix: - `nats_connections`, `nats_slow_consumers` — current state (gauges) -- `nats_in_msgs_total`, `nats_out_msgs_total` — cumulative counters +- `nats_in_msgs`, `nats_out_msgs`, `nats_in_bytes`, `nats_out_bytes` — current rates from `/varz` (gauges; reset on NATS restart) - `nats_jetstream_*` — JetStream stats Exporter self-metrics use the `homeric_exporter_` prefix: - `homeric_exporter_scrape_duration_seconds` — last collect() wall time - `homeric_exporter_scrape_timestamp_seconds` — unix timestamp of last scrape -- `homeric_exporter_fetch_errors_total` — per-upstream fetch error counts +- `homeric_exporter_fetch_errors` — per-upstream fetch error counts (gauge; resets each scrape) All metrics include `# HELP` and `# TYPE` lines. 
diff --git a/exporter/exporter.py b/exporter/exporter.py index 7f1a052..5ecc4cb 100644 --- a/exporter/exporter.py +++ b/exporter/exporter.py @@ -95,7 +95,7 @@ def _health_check(url: str, ca_file: Optional[str] = None) -> int: "nats_jetstream_bytes": "Bytes stored in JetStream", "homeric_exporter_scrape_timestamp_seconds": "Unix timestamp (seconds) when the last scrape completed", "homeric_exporter_scrape_duration_seconds": "Wall-clock seconds spent in the last collect() call", - "homeric_exporter_fetch_errors_total": "Number of upstream fetch failures per scrape, by upstream", + "homeric_exporter_fetch_errors": "Number of upstream fetch failures per scrape, by upstream", } @@ -198,7 +198,7 @@ def gauge(name: str, help: str, value: float | int, labels: dict | None = None) gauge("homeric_exporter_scrape_timestamp_seconds", "Unix timestamp (seconds) when the last scrape completed", time.time()) gauge("homeric_exporter_scrape_duration_seconds", "Duration in seconds of the last upstream scrape cycle", time.time() - start) for upstream, count in fetch_errors.items(): - gauge("homeric_exporter_fetch_errors_total", "Number of fetch failures per upstream service", count, {"upstream": upstream}) + gauge("homeric_exporter_fetch_errors", "Number of fetch failures per upstream service", count, {"upstream": upstream}) return "\n".join(lines) + "\n" diff --git a/tests/test_exporter.py b/tests/test_exporter.py index 2b3dd77..46d6e94 100644 --- a/tests/test_exporter.py +++ b/tests/test_exporter.py @@ -268,7 +268,9 @@ def test_task_total_correct(self): def test_exporter_self_metrics_present(self): self.assertIn("homeric_exporter_scrape_duration_seconds", self.output) self.assertIn("homeric_exporter_scrape_timestamp_seconds", self.output) - self.assertIn("homeric_exporter_fetch_errors_total", self.output) + self.assertIn("homeric_exporter_fetch_errors", self.output) + # Must not carry the _total counter suffix (gauge, not counter) + 
self.assertNotIn("homeric_exporter_fetch_errors_total", self.output) def test_nats_msg_metrics_use_gauge_names_not_total(self): """nats_in_msgs and nats_out_msgs must not carry the _total counter suffix.""" @@ -476,7 +478,7 @@ def test_all_upstreams_down_still_has_help(self): "hi_nestor_health", "homeric_exporter_scrape_timestamp_seconds", "homeric_exporter_scrape_duration_seconds", - "homeric_exporter_fetch_errors_total", + "homeric_exporter_fetch_errors", ] for name in always_present: self.assertIn(name, headers, f"Metric '{name}' missing from output") From 735014eb1684ac856c063b037970a5ec4c377cdc Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:42:21 -0700 Subject: [PATCH 02/18] docs(env): document PROMTAIL_HOST_LABEL override in .env.example Closes #252 --- .env.example | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.env.example b/.env.example index ff2bfb9..61421d9 100644 --- a/.env.example +++ b/.env.example @@ -23,6 +23,13 @@ NATS_URL=http://172.24.0.1:8222 # NATS log directory on the host (Promtail mounts this read-only) NATS_LOG_DIR=/home/mvillmow/.local/share/nats +# Optional override for the `host` label Promtail attaches to every shipped +# log line. If unset, Promtail substitutes the container's $HOSTNAME (which is +# usually fine for single-host deployments). Override this when running on +# multiple hosts that share a Loki instance and you want a stable, human- +# readable label per host (e.g. PROMTAIL_HOST_LABEL=odysseus-prod). 
+# PROMTAIL_HOST_LABEL= + # Loki basic-auth credentials — used by scripts/gen-htpasswd.sh to generate secrets/htpasswd LOKI_AUTH_USER=loki LOKI_AUTH_PASSWORD=changeme From 677864187356db5d94965182620af03b2f505546 Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:42:26 -0700 Subject: [PATCH 03/18] docs(justfile): document set dotenv-load and required env vars in header Adds a header comment block explaining what `set dotenv-load` does, which recipes depend on the sourced env, and which vars must be set in .env (GF_ADMIN_PASSWORD). Closes #265 Closes #319 --- justfile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/justfile b/justfile index bc88ab2..a87c616 100644 --- a/justfile +++ b/justfile @@ -1,3 +1,13 @@ +# `set dotenv-load` sources `.env` from the current directory before any +# recipe runs — every variable in `.env` is exported to the recipe's process. +# Several recipes depend on this; in particular `import-dashboards` reads +# `GF_ADMIN_PASSWORD` to authenticate to Grafana, and `just start` relies on +# docker-compose seeing the same `.env`. +# +# Required env vars (set in `.env`; see `.env.example` for the canonical list): +# GF_ADMIN_PASSWORD Grafana admin password. The fallback below is "admin" +# only so `just --list` works without `.env`; production +# deployments MUST override this. set dotenv-load # === Variables === From c6df464583c2c4a74a5bc802e8c17ed800f4eb86 Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:42:39 -0700 Subject: [PATCH 04/18] docs(CLAUDE.md): expand stack table, env vars, and operator commands Single coordinated CLAUDE.md update covering several follow-ups: - Add Alertmanager to the Stack Components table; add reload-alertmanager and test-alertmanager to Common Commands. 
- Add a 'Network topology' section explaining the two-network (argus + loki-internal) design and warning against re-attaching loki to argus. - Document CONTAINER_CMD, PROMTAIL_HOST_LABEL in the env-var table. - Note that the NATS_URL gateway IP (172.24.0.1) intentionally differs from the Agamemnon/Nestor gateway (172.20.0.1). - Point at AGENTS.md from the Common Commands section so new contributors discover the multi-agent coordination protocol. Closes #175 Closes #216 Closes #223 Closes #252 Closes #258 Closes #336 Closes #356 Closes #382 --- CLAUDE.md | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index e7cb391..2749858 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -15,13 +15,27 @@ tailing. It does NOT modify Agamemnon or any other HomericIntelligence service. | Service | Image | Purpose | |-----------------|--------------------------------|--------------------------------------------------------| | Prometheus | prom/prometheus:v2.54.1 | Scrape and store metrics | +| Alertmanager | prom/alertmanager:v0.32.1 | Route Prometheus alerts to receivers | | Loki | grafana/loki:3.1.2 | Store and query log streams | | loki-proxy | nginx:1.27-alpine | Basic-auth proxy in front of Loki | | Promtail | grafana/promtail:3.1.2 | Tail container logs and ship to Loki | | Grafana | grafana/grafana:11.2.2 | Visualize metrics and logs | | argus-exporter | built from exporter/ | Convert HomericIntelligence APIs to Prometheus metrics | -All services run on the `argus` Docker network and are managed via `docker-compose.yml`. +### Network topology (two-network design) + +The compose stack defines two Docker bridge networks: + +- **`argus`** — public-facing bridge that prometheus, alertmanager, loki-proxy, + promtail, grafana, and argus-exporter share. Anything that needs to talk to + other services in the stack lives here. +- **`loki-internal`** — `internal: true` bridge with no egress. 
Only `loki`, + `loki-proxy`, and `promtail` are attached. Loki is intentionally not on + `argus`, so the only path to reach it is via `loki-proxy` (which terminates + basic auth). Do not re-add `loki` to the `argus` network — that would let any + container hit Loki directly without auth. + +All services are managed via `docker-compose.yml`. ## Architecture @@ -51,10 +65,14 @@ to start without a `.env` file. | `NESTOR_URL` | `http://172.20.0.1:8081` | Yes | Nestor API base URL | | `NATS_URL` | `http://172.24.0.1:8222` | Yes | NATS monitoring API base URL | | `NATS_LOG_DIR` | `/home/mvillmow/.local/share/nats` | Yes | Host path to NATS log files (Promtail mounts this) | +| `PROMTAIL_HOST_LABEL` | unset (Promtail uses `$HOSTNAME`) | No | Override the `host` label Promtail attaches to log streams | +| `CONTAINER_CMD` | `docker` (auto-set to `podman` if `podman-compose` is on `$PATH`) | No | Runtime used by `scripts/backup.sh` and `scripts/restore.sh`; the justfile recipes pass `CONTAINER_CMD={{container_cmd}}` so you usually don't set this manually | `172.20.0.1` / `172.24.0.1` are WSL2 host gateway addresses — they reach services running on the Windows host or in other WSL distros. Substitute Tailscale IPs for -cross-host deployments. +cross-host deployments. The `NATS_URL` gateway IP `172.24.0.1` differs from +the `172.20.0.1` used for Agamemnon/Nestor because NATS runs on a separate WSL +distro with its own bridge — the discrepancy is intentional, not a typo. 
## Scrape Targets @@ -176,6 +194,8 @@ just stop # docker compose down just status # docker compose ps just logs # docker compose logs -f just reload-prometheus # Send SIGHUP to Prometheus (hot-reload config) +just reload-alertmanager # POST /-/reload to Alertmanager (hot-reload config) +just test-alertmanager # Check Alertmanager /-/healthy and cluster status just test-scrape # Query Prometheus /api/v1/query?query=up just import-dashboards # POST each dashboard JSON to Grafana API just scrape-agamemnon # Manually test Agamemnon and Nestor health endpoints @@ -183,6 +203,8 @@ just test # Run pytest unit tests just backup # Back up data volumes to ./backups/ ``` +See `AGENTS.md` for the multi-agent coordination protocol used in this repo. + ## AI Agent Collaboration Notes - This is a **config-only / infrastructure** repository. There is no application code to compile. From 62f3d5d4a33bb76a654df14635f433de63348ec6 Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:43:26 -0700 Subject: [PATCH 05/18] docs(contributing): document Renovate GitHub App install requirement Closes #298 --- CONTRIBUTING.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 755a66f..bcc74ec 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -246,6 +246,17 @@ All documentation files must follow these standards: - Lists must be surrounded by blank lines - Headings must be surrounded by blank lines +## Dependency Updates (Renovate) + +Dependency-update automation is configured via `renovate.json` in the repo root, +but the config is **inert until the Renovate GitHub App is installed at the +org or repo level** ([github.com/apps/renovate](https://github.com/apps/renovate)). If you are not seeing +Renovate PRs against ProjectArgus, that almost certainly means the App is not +installed — check the GitHub App settings on the +[HomericIntelligence org](https://github.com/HomericIntelligence) and grant it +access to ProjectArgus. 
Once installed, Renovate will pick up the existing +`renovate.json` automatically; no further repo-side action is needed. + ## Reporting Issues ### Bug Reports From a3b2e7881ddf7b53678b7b2ba35c7f49e618b281 Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:44:47 -0700 Subject: [PATCH 06/18] docs(metrics): add canonical metric catalog at docs/metrics.md Builds a single reference table for every metric the exporter emits (name, labels, description). Cross-linked from CLAUDE.md so future reviewers see exactly what the exporter exposes without grepping exporter.py. Closes #423 Closes #427 --- CLAUDE.md | 7 +++++++ docs/metrics.md | 53 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 docs/metrics.md diff --git a/CLAUDE.md b/CLAUDE.md index 2749858..aee7ecc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -85,6 +85,13 @@ distro with its own bridge — the discrepancy is intentional, not a typo. The exporter aggregates Agamemnon, Nestor, and NATS data and exposes them as Prometheus metrics on port 9100. +## Metric Catalog + +The full catalog of every metric the exporter emits — name, labels, and +semantics — lives at [`docs/metrics.md`](docs/metrics.md). Treat +`exporter/exporter.py`'s `_METRIC_HELP` dict as the source of truth and update +`docs/metrics.md` in the same commit when renaming or adding a metric. + ## Metric Naming Conventions All HomericIntelligence-specific metrics follow the `hi_` prefix: diff --git a/docs/metrics.md b/docs/metrics.md new file mode 100644 index 0000000..d673906 --- /dev/null +++ b/docs/metrics.md @@ -0,0 +1,53 @@ +# Argus Exporter — Metric Catalog + +Canonical reference for every Prometheus metric exposed by `homeric-exporter` +on `:9100/metrics`. Source of truth is the `_METRIC_HELP` dict in +[`exporter/exporter.py`](../exporter/exporter.py); update this file in the +same commit when you add or rename a metric. 
+ +All metrics are emitted as `# TYPE … gauge` (the exporter does not currently +maintain monotonic counter state across scrapes — every value is computed +fresh per scrape). Exporter self-metrics and NATS metrics that historically carried a counter-style `_total` suffix have been +renamed; do not reintroduce the suffix on them. In `hi_agents_total` and `hi_tasks_total` below, `_total` means "total count" — those metrics are gauges as well. + +## HomericIntelligence-specific metrics (`hi_` prefix) + +| Metric | Labels | Description | +| --- | --- | --- | +| `hi_agamemnon_health` | — | `1` if `GET ${AGAMEMNON_URL}/v1/health` returned 200, else `0`. | +| `hi_agents_total` | — | Total number of agents registered in Agamemnon. | +| `hi_agents_online` | — | Number of agents with `status=online`. | +| `hi_agents_offline` | — | Number of agents with any non-online status. | +| `hi_agent_online` | `name`, `host`, `program` | `1` if this individual agent is online, else `0`. | +| `hi_tasks_total` | — | Total number of tasks known to Agamemnon. | +| `hi_tasks_by_status` | `status` | Task count partitioned by status label. | +| `hi_nestor_health` | — | `1` if `GET ${NESTOR_URL}/v1/health` returned 200, else `0`. | +| `hi_nestor_research_active` | — | Active research jobs reported by Nestor `/v1/research/stats`. | +| `hi_nestor_research_completed` | — | Completed research jobs reported by Nestor `/v1/research/stats`. | +| `hi_nestor_research_pending` | — | Pending research jobs reported by Nestor `/v1/research/stats`. | + +## NATS metrics (`nats_` prefix) + +Sourced from NATS `/varz` and `/jsz` monitoring endpoints. Values reset on +NATS server restart, so these are gauges (no `_total` suffix). + +| Metric | Description | +| --- | --- | +| `nats_connections` | Current client connections. | +| `nats_in_msgs` | Inbound messages received since NATS server start. | +| `nats_out_msgs` | Outbound messages sent since NATS server start. | +| `nats_in_bytes` | Inbound bytes received since NATS server start. | +| `nats_out_bytes` | Outbound bytes sent since NATS server start. | +| `nats_slow_consumers` | Current slow-consumer connections. | +| `nats_jetstream_streams` | Number of JetStream streams. 
| +| `nats_jetstream_consumers` | Number of JetStream consumers. | +| `nats_jetstream_messages` | Total messages stored across all JetStream streams. | +| `nats_jetstream_bytes` | Total bytes stored across all JetStream streams. | + +## Exporter self-metrics (`homeric_exporter_` prefix) + +| Metric | Labels | Description | +| --- | --- | --- | +| `homeric_exporter_scrape_timestamp_seconds` | — | Unix timestamp (seconds) when the last scrape completed. | +| `homeric_exporter_scrape_duration_seconds` | — | Wall-clock seconds spent in the last `collect()` call. | +| `homeric_exporter_fetch_errors` | `upstream` (`agamemnon`/`nestor`/`nats`) | Per-scrape count of upstream fetch failures. Gauge — resets each collection cycle. | From cfb5a4824c5357d664eacea0727b3754e1b834c9 Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:46:18 -0700 Subject: [PATCH 07/18] docs(pr-template): add CHANGELOG update item to validation checklist Closes #239 --- .github/PULL_REQUEST_TEMPLATE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 8aad371..8efae97 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -26,6 +26,8 @@ - [ ] No credentials or secrets in the diff - [ ] Existing metric names not renamed without dashboard update in this PR - [ ] Alert rule changes do not regress existing alerts +- [ ] `CHANGELOG.md` updated under `[Unreleased]` (skip only for chore-only + PRs that do not affect user-visible behaviour) ## Related Issues From 280dadae3eeba84e8daed00b5060e99afa850e2c Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:47:26 -0700 Subject: [PATCH 08/18] docs(CLAUDE.md): add Operator Notes section + NATS_URL/scrape clarification Surfaces the easily-missed operator preconditions and runtime gotchas that several 
follow-up issues asked to be documented: - Onboarding step: copy .env.example to .env (#181, #196). - /tmp/hermes.log must exist on the host before `just start` (#192). - htpasswd is auto-generated by `just start`; rotation steps documented (#228, #342). - All host ports are loopback-only; SSH/Tailscale tunneling is the supported remote-access pattern (#188, #199, #245). - `just test-scrape` runs inside the container; debug-prometheus and debug-loki are the entry points for ad-hoc inspection (#199). - Backup/restore expectations on cold hosts (#360). - jq unavailability on win-64 (#405). - NATS_URL gateway IP vs. Prometheus localhost target (#386). Closes #181 Closes #188 Closes #192 Closes #196 Closes #199 Closes #228 Closes #245 Closes #342 Closes #360 Closes #386 Closes #405 --- CLAUDE.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index aee7ecc..a3db535 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -85,6 +85,51 @@ distro with its own bridge — the discrepancy is intentional, not a typo. The exporter aggregates Agamemnon, Nestor, and NATS data and exposes them as Prometheus metrics on port 9100. +The `NATS_URL` env var (`http://172.24.0.1:8222`) addresses the host gateway, +which the exporter container uses to reach NATS on the WSL host. Prometheus, +in contrast, scrapes NATS at `localhost:8222` — that target is interpreted +*inside* the prometheus container (because both Prometheus and the NATS +host gateway resolve to the host's loopback there). The two addresses point +at the same NATS instance from different network namespaces. + +## Operator Notes + +These are easy-to-miss preconditions and runtime behaviours that operators +new to the stack frequently trip on: + +1. **Copy `.env.example` to `.env` first.** `just start` and `docker compose` + both load `.env`; without it Grafana silently falls back to its + built-in `admin:admin` credentials. +2. 
**`/tmp/hermes.log` must exist on the host before `just start`.** Promtail + bind-mounts the file. If it is missing, Docker creates an empty + *directory* at that path, which silently breaks the mount. Run + `touch /tmp/hermes.log` (or symlink to the real Hermes log) once per host. +3. **Loki proxy htpasswd is generated automatically.** `just start` depends + on `just gen-htpasswd`, which writes `configs/nginx/htpasswd` from + `LOKI_AUTH_USER`/`LOKI_AUTH_PASSWORD` in `.env`. To rotate the password, + update `LOKI_AUTH_PASSWORD` in `.env`, then run `just gen-htpasswd && just restart`. +4. **All host ports are loopback-only.** Prometheus (`127.0.0.1:9090`), + Grafana (`127.0.0.1:3001`), Alertmanager (`127.0.0.1:9093`), and the + exporter (`127.0.0.1:9100`) only accept connections from the host. To + reach them from another machine use an SSH tunnel + (`ssh -L 3001:localhost:3001 host`) or a Tailscale-encrypted route — the + stack intentionally does not expose unauthenticated metric/log endpoints + to the LAN. +5. **`just test-scrape` requires the stack to be running.** After the host + port for Prometheus was removed, `test-scrape` runs the query *inside* + the prometheus container via `docker exec`. Use `just debug-prometheus` + and `just debug-loki` for ad-hoc inspection (these wrappers exec into + the respective containers). +6. **`just backup` / `just restore` need a running compose project.** The + restore script calls `docker compose stop` to quiesce services before + replacing volume data; on a cold host with no containers, the stop is a + no-op and the script still runs, but operators should expect to bring + the stack up at least once before relying on restore. +7. **`jq` is unavailable on `win-64`.** Conda-forge does not ship a `jq` + package for Windows; tasks like `just test-scrape` that pipe through `jq` + will fail there. Windows contributors should install `jq` via `winget` or + `choco` and put it on `$PATH`. 
+ ## Metric Catalog The full catalog of every metric the exporter emits — name, labels, and From 6828023c8604f70600516ee7ae699d217715c71a Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:48:32 -0700 Subject: [PATCH 09/18] ci(_required): add timeout-minutes and least-privilege permissions to remaining jobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Brings the remaining unhardened jobs up to the same baseline as unit-tests (which got timeout-minutes + permissions in #113): - integration-tests, security-dependency-scan, security-secrets-scan, config-validate, schema-validation, deps-version-sync, atlas-dashboard all now declare an explicit timeout-minutes and 'permissions: contents: read' (read-only is sufficient — none of these jobs need write access to the repo, packages, or actions). Closes #275 --- .github/workflows/_required.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.github/workflows/_required.yml b/.github/workflows/_required.yml index 4d0cec5..454f513 100644 --- a/.github/workflows/_required.yml +++ b/.github/workflows/_required.yml @@ -265,6 +265,9 @@ jobs: integration-tests: name: integration-tests runs-on: ubuntu-24.04 + timeout-minutes: 10 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Validate docker image name formats @@ -316,6 +319,9 @@ jobs: security-dependency-scan: name: security/dependency-scan runs-on: ubuntu-24.04 + timeout-minutes: 15 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Trivy filesystem scan @@ -328,6 +334,9 @@ jobs: security-secrets-scan: name: security/secrets-scan runs-on: ubuntu-24.04 + timeout-minutes: 15 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: @@ 
-368,6 +377,9 @@ jobs: config-validate: name: config-validate runs-on: ubuntu-24.04 + timeout-minutes: 5 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Validate HCL files or fall back to YAML validation @@ -391,6 +403,9 @@ jobs: schema-validation: name: schema-validation runs-on: ubuntu-24.04 + timeout-minutes: 5 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Validate workflow YAML files against GitHub Actions schema @@ -403,6 +418,9 @@ jobs: deps-version-sync: name: deps/version-sync runs-on: ubuntu-24.04 + timeout-minutes: 5 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Ensure no :latest image tags in compose files @@ -420,6 +438,9 @@ jobs: atlas-dashboard: name: atlas/dashboard runs-on: ubuntu-24.04 + timeout-minutes: 10 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Set up Go From dd5368877379070eb5b045665074dbb277ea6f38 Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:48:58 -0700 Subject: [PATCH 10/18] docs(tests): add ALLOWED_BINDINGS rationale to TestDockerComposePort* classes Closes #323 --- tests/test_configs.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_configs.py b/tests/test_configs.py index 0c18375..a0516a8 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -206,6 +206,17 @@ class TestDockerComposeNetworkIsolation(unittest.TestCase): class TestDockerComposePortBindings(unittest.TestCase): """Assert no service port is bound to 0.0.0.0 (all-interfaces).""" + # Only the loopback address is permitted as a host port binding. 
+ # Rationale: every Argus service exposes either metrics, dashboards, or + # log endpoints that we deliberately do NOT publish to the LAN — remote + # access goes via SSH tunnel or Tailscale (see CLAUDE.md "Operator + # Notes"). Binding to 0.0.0.0 (or any non-loopback address) would expose + # unauthenticated /metrics, /readyz, etc. to anyone on the same network. + # + # To add an exception (a service legitimately designed for LAN + # discovery), open a tracking issue, document the threat model in + # docker-compose.yml, and extend this set in the same PR — never bypass + # the test silently. ALLOWED_BINDINGS = {"127.0.0.1"} def setUp(self) -> None: @@ -264,6 +275,17 @@ def test_loki_datasource_url_uses_proxy(self) -> None: class TestDockerComposePorts(unittest.TestCase): + # Only the loopback address is permitted as a host port binding. + # Rationale: every Argus service exposes either metrics, dashboards, or + # log endpoints that we deliberately do NOT publish to the LAN — remote + # access goes via SSH tunnel or Tailscale (see CLAUDE.md "Operator + # Notes"). Binding to 0.0.0.0 (or any non-loopback address) would expose + # unauthenticated /metrics, /readyz, etc. to anyone on the same network. + # + # To add an exception (a service legitimately designed for LAN + # discovery), open a tracking issue, document the threat model in + # docker-compose.yml, and extend this set in the same PR — never bypass + # the test silently. 
ALLOWED_BINDINGS = {"127.0.0.1"} def setUp(self) -> None: From 27fef65515468748b9092533e8f0693069b4e50a Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:50:24 -0700 Subject: [PATCH 11/18] docs(env): expand GF_ADMIN_PASSWORD comment with import-dashboards context Closes #261 --- .env.example | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.env.example b/.env.example index 61421d9..e65d331 100644 --- a/.env.example +++ b/.env.example @@ -5,6 +5,13 @@ # Do not leave this as 'changeme'. Use a strong unique password. # Grafana is bound to 127.0.0.1:3000 (loopback only); access requires # SSH/VPN tunnel or local browser — remote LAN access is intentionally blocked. +# +# This value is also required by `just import-dashboards`: the recipe +# authenticates to the Grafana API using GF_ADMIN_PASSWORD and will exit +# with an explicit "GF_ADMIN_PASSWORD is unset" error if missing. It must +# match the password Grafana was first started with — if you rotate it in +# `.env` you also need to update Grafana itself (or wipe `grafana_data` +# and restart) before `just import-dashboards` can authenticate again. GF_ADMIN_PASSWORD=changeme # Prometheus is bound to 127.0.0.1:9090 (loopback only). From e1a595e7a057498fb24d263e5f89a2c91018dfb1 Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:50:29 -0700 Subject: [PATCH 12/18] build(pixi): cap pytest-cov at <8 to avoid silent breakage on major bumps pytest-cov 7.x and earlier ship coverage.py 6.x/7.x respectively, which have changed default branch-coverage semantics across major versions. Capping below 8.0 keeps `pixi update` from silently swapping in a breaking version. Lower bound stays at 5.0. 
Closes #280 --- pixi.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pixi.toml b/pixi.toml index 02298f8..7e3a8f9 100644 --- a/pixi.toml +++ b/pixi.toml @@ -12,7 +12,7 @@ ruff = ">=0.4,<1" bandit = ">=1.7,<2" pyyaml = ">=6.0,<7" pytest = ">=8.0" -pytest-cov = ">=5.0" +pytest-cov = ">=5.0,<8" python = ">=3.11,<3.14" [target.linux-64.dependencies] From 5ebc4d6d67886ea014c78aa9564a0269eaf8255e Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:50:35 -0700 Subject: [PATCH 13/18] test(dockerfile): widen approved Python ceiling to 3.13 with policy comment Python 3.13 reached GA in October 2024, so the (3, 12) cap is now unnecessarily conservative. Documents the upgrade process in a comment so future maintainers know advancing the ceiling is an intentional review gate, not a magic number. Closes #312 --- tests/test_dockerfile_constraints.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_dockerfile_constraints.py b/tests/test_dockerfile_constraints.py index ff9fc06..279e93c 100644 --- a/tests/test_dockerfile_constraints.py +++ b/tests/test_dockerfile_constraints.py @@ -11,7 +11,14 @@ DOCKERFILE = REPO_ROOT / "exporter" / "Dockerfile" _MIN_VERSION = (3, 11) -_MAX_VERSION = (3, 12) +# Approved Python version ceiling. Advance this only after the next CPython +# release is GA (https://devguide.python.org/versions/) AND has been +# manually verified to build the exporter image and pass the full test +# suite. Process: bump _MAX_VERSION in the same PR that bumps the FROM +# line in exporter/Dockerfile so the test stays a single source of truth. +# Python 3.13 reached GA on 2024-10-07; widen the ceiling to (3, 13) so +# a manual base-image bump doesn't trip the regression test. 
+_MAX_VERSION = (3, 13) class TestDockerfileConstraints(unittest.TestCase): From 84fb433ef90430cd51d2c7db595ad83d1cb459bf Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:51:14 -0700 Subject: [PATCH 14/18] fix(scripts): fall back to tomli when stdlib tomllib is unavailable tomllib was added to the stdlib in Python 3.11. The script previously hard-imported it, which would crash with ModuleNotFoundError under any pre-commit environment provisioned with an older interpreter. Falls back to the upstream tomli package so the script can run under Python 3.10 when tomli is on the path. Closes #402 --- scripts/check-version-consistency.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/check-version-consistency.py b/scripts/check-version-consistency.py index 62fd0a0..75b4704 100755 --- a/scripts/check-version-consistency.py +++ b/scripts/check-version-consistency.py @@ -15,9 +15,16 @@ import re import sys -import tomllib from pathlib import Path +try: + import tomllib # Python 3.11+ stdlib +except ModuleNotFoundError: # pragma: no cover - exercised on Python < 3.11 + # tomli is the upstream of stdlib tomllib; install it via the pre-commit + # hook's `additional_dependencies`, or `pip install tomli`, when the + # interpreter pre-commit picks up is older than 3.11. 
+ import tomli as tomllib # type: ignore[no-redef] + def load_version(pixi_toml: Path) -> str: """Return the version string from pixi.toml [project].version or [workspace].version.""" From 9bb46f42b65967562d2f78a5ee444a87666a9497 Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:52:24 -0700 Subject: [PATCH 15/18] docs(prometheus): document the deliberate Promtail scrape omission Closes #259 --- configs/prometheus.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/configs/prometheus.yml b/configs/prometheus.yml index 003f7c4..292ecef 100644 --- a/configs/prometheus.yml +++ b/configs/prometheus.yml @@ -49,3 +49,12 @@ scrape_configs: static_configs: - targets: ['argus-dashboard:3002'] metrics_path: /metrics + + # NOTE: Promtail (port 9080) is intentionally NOT scraped here. Promtail + # exposes useful self-metrics (lines ingested, target counts, push errors) + # but they cardinality-multiply per tailed file/job and we keep this + # Prometheus instance focused on the service-level signals from + # homeric-exporter and the upstreams above. If you need Promtail metrics + # during an incident, add an ad-hoc scrape via `just debug-prometheus` + # or stand up a sidecar Prometheus pointed at `promtail:9080`. Re-enable + # here only after a cardinality budget is documented in CLAUDE.md. From 5e47a9394c9c4410dcff990bd1145493ec150e46 Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:53:54 -0700 Subject: [PATCH 16/18] docs(CLAUDE.md): satisfy markdownlint MD060/MD013 in env-vars and metrics sections Two of the optional env-var rows had cell content too long to align in the existing pipe table, tripping MD060/table-column-style. Move them out of the table into a follow-up bullet list. Wrap the long nats_*_bytes line under 120 cols. Cleans up so the new markdownlint pre-commit hook can land green. 
--- CLAUDE.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index a3db535..1c2b556 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -65,8 +65,15 @@ to start without a `.env` file. | `NESTOR_URL` | `http://172.20.0.1:8081` | Yes | Nestor API base URL | | `NATS_URL` | `http://172.24.0.1:8222` | Yes | NATS monitoring API base URL | | `NATS_LOG_DIR` | `/home/mvillmow/.local/share/nats` | Yes | Host path to NATS log files (Promtail mounts this) | -| `PROMTAIL_HOST_LABEL` | unset (Promtail uses `$HOSTNAME`) | No | Override the `host` label Promtail attaches to log streams | -| `CONTAINER_CMD` | `docker` (auto-set to `podman` if `podman-compose` is on `$PATH`) | No | Runtime used by `scripts/backup.sh` and `scripts/restore.sh`; the justfile recipes pass `CONTAINER_CMD={{container_cmd}}` so you usually don't set this manually | + +Optional overrides (not required by `just start`): + +- `PROMTAIL_HOST_LABEL` — overrides the `host` label Promtail attaches to log + streams. Defaults to the container's `$HOSTNAME`. +- `CONTAINER_CMD` — runtime used by `scripts/backup.sh` and `scripts/restore.sh`. + Defaults to `docker` (auto-promoted to `podman` if `podman-compose` is on + `$PATH`). Justfile recipes pass `CONTAINER_CMD={{container_cmd}}` + automatically, so you rarely need to set it by hand. `172.20.0.1` / `172.24.0.1` are WSL2 host gateway addresses — they reach services running on the Windows host or in other WSL distros. 
Substitute Tailscale IPs for @@ -149,7 +156,8 @@ All HomericIntelligence-specific metrics follow the `hi_` prefix: NATS metrics use the `nats_` prefix: - `nats_connections`, `nats_slow_consumers` — current state (gauges) -- `nats_in_msgs`, `nats_out_msgs`, `nats_in_bytes`, `nats_out_bytes` — current rates from `/varz` (gauges; reset on NATS restart) +- `nats_in_msgs`, `nats_out_msgs`, `nats_in_bytes`, `nats_out_bytes` — + current rates from `/varz` (gauges; reset on NATS restart) - `nats_jetstream_*` — JetStream stats Exporter self-metrics use the `homeric_exporter_` prefix: From 32a42cfb86183803e3cfb0ad0b7f4a5ee50f59c2 Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:53:59 -0700 Subject: [PATCH 17/18] chore(pre-commit): add markdownlint-cli2 hook for Markdown linting Mirrors the yamllint hook pattern: enforces .markdownlint.yaml (already in the repo) on every commit so Markdown quality follows the same guard-rail as YAML and Python. Pinned to v0.13.0. Closes #379 --- .pre-commit-config.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1053477..09cee83 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,6 +11,14 @@ repos: - id: ruff args: ["--fix"] + # Markdown linting — enforces .markdownlint.yaml across every Markdown + # file in the repo, mirroring how the yamllint hook above enforces YAML + # quality. Refs #379. 
+ - repo: https://github.com/DavidAnson/markdownlint-cli2 + rev: v0.13.0 + hooks: + - id: markdownlint-cli2 + - repo: local hooks: - id: bandit From cf66cfe704506900f267f98167e27f0658ea98df Mon Sep 17 00:00:00 2001 From: HomericIntelligence Agent <4211002+mvillmow@users.noreply.github.com> Date: Mon, 11 May 2026 19:55:10 -0700 Subject: [PATCH 18/18] style: ruff SIM103 fix in check-version-consistency.py + dedent prometheus.yml comment - Ruff's auto-fix flips `not X is None` to `X is not None` (SIM103); the existing `# noqa: SIM103` is vestigial but is kept on the rewritten line by this patch. - yamllint flagged the new Promtail comment block in prometheus.yml as comments-indentation; dedent to column 0 since it's a top-level note about the scrape_configs list as a whole, not a comment on a list item. --- configs/prometheus.yml | 16 ++++++++-------- scripts/check-version-consistency.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/configs/prometheus.yml b/configs/prometheus.yml index 292ecef..3a6d7b6 100644 --- a/configs/prometheus.yml +++ b/configs/prometheus.yml @@ -50,11 +50,11 @@ scrape_configs: - targets: ['argus-dashboard:3002'] metrics_path: /metrics - # NOTE: Promtail (port 9080) is intentionally NOT scraped here. Promtail - # exposes useful self-metrics (lines ingested, target counts, push errors) - # but they cardinality-multiply per tailed file/job and we keep this - # Prometheus instance focused on the service-level signals from - # homeric-exporter and the upstreams above. If you need Promtail metrics - # during an incident, add an ad-hoc scrape via `just debug-prometheus` - # or stand up a sidecar Prometheus pointed at `promtail:9080`. Re-enable - # here only after a cardinality budget is documented in CLAUDE.md. +# NOTE: Promtail (port 9080) is intentionally NOT scraped here.
Promtail +# exposes useful self-metrics (lines ingested, target counts, push errors) +# but they cardinality-multiply per tailed file/job and we keep this +# Prometheus instance focused on the service-level signals from +# homeric-exporter and the upstreams above. If you need Promtail metrics +# during an incident, add an ad-hoc scrape via `just debug-prometheus` +# or stand up a sidecar Prometheus pointed at `promtail:9080`. Re-enable +# here only after a cardinality budget is documented in CLAUDE.md. diff --git a/scripts/check-version-consistency.py b/scripts/check-version-consistency.py index 75b4704..31ebdf4 100755 --- a/scripts/check-version-consistency.py +++ b/scripts/check-version-consistency.py @@ -46,7 +46,7 @@ def versioned_sections(changelog: Path) -> list[str]: def has_versioned_header(version: str, changelog: Path) -> bool: """Return True if CHANGELOG contains a ## [<version>] section header.""" pattern = re.compile(rf"^## \[{re.escape(version)}\]", re.MULTILINE) - return not pattern.search(changelog.read_text()) is None # noqa: SIM103 + return pattern.search(changelog.read_text()) is not None # noqa: SIM103 def check(repo_root: Path) -> int: