Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions certs/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Generated certificates and private keys — never commit these
# Private keys (CA key and one key per service)
*.key
# Certificates (CA certificate and one certificate per service)
*.crt
# Certificate signing requests (intermediate artifacts of the signing step)
*.csr
# CA serial-number files written by `openssl x509 -CAcreateserial`
*.srl
77 changes: 77 additions & 0 deletions certs/gen-certs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env bash
# Generate a self-signed CA and per-service TLS certificates for the argus stack.
# Run once before `just start`. Certificates are written to this directory.
#
# Usage: gen-certs.sh [--force]
#
# Re-running is idempotent: an existing CA and existing service key/cert pairs
# are skipped unless --force is passed. Service certificates are always
# re-issued when the CA is (re)generated in the same run, because certificates
# signed by a previous CA would not verify against the new ca.crt.
#
# Exit status: 0 on success, 1 on a runtime/precondition failure, 2 on bad usage.
set -euo pipefail

# Associative arrays (used for the SAN table below) need bash >= 4; fail with a
# readable message instead of a cryptic `declare: -A: invalid option`.
if (( BASH_VERSINFO[0] < 4 )); then
  echo "[error] bash >= 4 is required; found ${BASH_VERSION}" >&2
  exit 1
fi

if ! command -v openssl >/dev/null 2>&1; then
  echo "[error] openssl not found in PATH" >&2
  exit 1
fi

CERTS_DIR="$(cd "$(dirname -- "$0")" && pwd)"

# Accept only the documented flag. Treating *any* argument as "force" would let
# a typo or `--help` silently overwrite the CA and every service certificate.
FORCE=""
case "${1:-}" in
  "") ;;
  --force) FORCE="--force" ;;
  *)
    printf 'Usage: %s [--force]\n' "${0##*/}" >&2
    exit 2
    ;;
esac

readonly SERVICES=(prometheus loki grafana promtail argus-exporter)
readonly DAYS=3650

# Subject Alternative Names per service (hostname inside Docker + localhost)
declare -A SANS
SANS[prometheus]="DNS:prometheus,DNS:argus-prometheus,DNS:localhost,IP:127.0.0.1"
SANS[loki]="DNS:loki,DNS:argus-loki,DNS:localhost,IP:127.0.0.1"
SANS[grafana]="DNS:grafana,DNS:argus-grafana,DNS:localhost,IP:127.0.0.1"
SANS[promtail]="DNS:promtail,DNS:argus-promtail,DNS:localhost,IP:127.0.0.1"
SANS[argus-exporter]="DNS:argus-exporter,DNS:localhost,IP:127.0.0.1"

cd "$CERTS_DIR"

# Scratch space for OpenSSL config files and CSRs. The EXIT trap removes it on
# every exit path, so a failed openssl call leaves no temp files behind.
TMP_DIR="$(mktemp -d)"
trap 'rm -rf -- "$TMP_DIR"' EXIT

# Non-empty when every service certificate must be re-issued: either --force
# was given, or the CA is generated below.
reissue_all="$FORCE"

# ── CA ─────────────────────────────────────────────────────────────────────────
if [[ -f ca.crt && -z "$FORCE" ]]; then
  echo "[skip] CA already exists (pass --force to regenerate)"
else
  echo "[gen] Generating CA key and certificate..."
  openssl genrsa -out ca.key 4096
  openssl req -new -x509 -days "$DAYS" -key ca.key -out ca.crt \
    -subj "/CN=argus-local-ca/O=ProjectArgus/OU=HomericIntelligence"
  echo "[ok] CA generated: ca.crt"
  # Certificates issued by a previous CA no longer chain to this ca.crt.
  reissue_all="yes"
fi

# ── Per-service certs ──────────────────────────────────────────────────────────
for svc in "${SERVICES[@]}"; do
  # A certificate without its private key is unusable, so require both files
  # before skipping.
  if [[ -f "${svc}.crt" && -f "${svc}.key" && -z "$reissue_all" ]]; then
    echo "[skip] ${svc}.crt already exists"
    continue
  fi

  # Signing needs the CA private key; report its absence clearly rather than
  # letting openssl fail halfway through with a file error.
  if [[ ! -f ca.key ]]; then
    echo "[error] ca.key is missing; cannot sign ${svc}.crt (run with --force to regenerate the CA)" >&2
    exit 1
  fi

  echo "[gen] Generating cert for ${svc}..."
  openssl genrsa -out "${svc}.key" 2048

  # OpenSSL config carrying the subject and the SAN extension. The same file
  # is used for the CSR (-config) and for signing (-extfile), because
  # `openssl x509 -req` does not copy extensions from the CSR by default.
  san_ext="${TMP_DIR}/${svc}.cnf"
  csr="${TMP_DIR}/${svc}.csr"
  cat > "$san_ext" <<EOF
[req]
distinguished_name = req_distinguished_name
req_extensions = v3_req
prompt = no

[req_distinguished_name]
CN = ${svc}
O = ProjectArgus

[v3_req]
subjectAltName = ${SANS[$svc]}
EOF

  openssl req -new -key "${svc}.key" -out "$csr" -config "$san_ext"
  openssl x509 -req -days "$DAYS" \
    -in "$csr" \
    -CA ca.crt -CAkey ca.key -CAcreateserial \
    -out "${svc}.crt" \
    -extfile "$san_ext" -extensions v3_req
  rm -f -- "$san_ext" "$csr"
  echo "[ok] ${svc}.crt generated"
done

echo ""
echo "Certificate generation complete."
echo "Files written to: $CERTS_DIR"
echo ""
echo "Next steps:"
echo " 1. Run 'just start' to bring up the stack with TLS enabled."
echo " 2. Trust ca.crt in your browser/OS to avoid certificate warnings."
echo " 3. See docs/tls-setup.md for full runbook."
14 changes: 13 additions & 1 deletion configs/grafana/datasources.yml
Original file line number Diff line number Diff line change
@@ -1,16 +1,22 @@
apiVersion: 1

Check warning on line 1 in configs/grafana/datasources.yml

View workflow job for this annotation

GitHub Actions / Validate configs

1:1 [document-start] missing document start "---"

datasources:
- name: Prometheus
type: prometheus
uid: prometheus
access: proxy
url: http://prometheus:9090
url: https://prometheus:9090
isDefault: true
editable: false
jsonData:
httpMethod: POST
prometheusType: Prometheus
tlsSkipVerify: false
tlsAuthWithCACert: true
secureJsonData:
tlsCACert: |
# Mount the CA cert content here, or use Grafana's provisioning
# secret injection. See docs/tls-setup.md for options.

- name: Loki
type: loki
Expand All @@ -21,3 +27,9 @@
editable: false
jsonData:
maxLines: 1000
tlsSkipVerify: false
tlsAuthWithCACert: true
secureJsonData:
tlsCACert: |
# Mount the CA cert content here, or use Grafana's provisioning
# secret injection. See docs/tls-setup.md for options.
3 changes: 3 additions & 0 deletions configs/loki.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
auth_enabled: false

Check warning on line 1 in configs/loki.yml

View workflow job for this annotation

GitHub Actions / Validate configs

1:1 [document-start] missing document start "---"

server:
http_listen_port: 3100
grpc_listen_port: 9096
http_tls_config:
cert_file: /etc/loki/tls/loki.crt
key_file: /etc/loki/tls/loki.key

common:
instance_addr: 127.0.0.1
Expand Down
11 changes: 10 additions & 1 deletion configs/prometheus.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
global:

Check warning on line 1 in configs/prometheus.yml

View workflow job for this annotation

GitHub Actions / Validate configs

1:1 [document-start] missing document start "---"
scrape_interval: 15s
evaluation_interval: 15s

Expand All @@ -10,6 +10,12 @@
rule_files:
- /etc/prometheus/rules/*.yml

# TLS configuration for the Prometheus HTTPS server.
# Cert files are mounted by docker-compose from certs/.
tls_server_config:
cert_file: /etc/prometheus/tls/prometheus.crt
key_file: /etc/prometheus/tls/prometheus.key

scrape_configs:
# Custom exporter: converts Agamemnon + NATS + Nestor JSON APIs to Prometheus metrics format
# Runs as argus-exporter sidecar on port 9100
Expand All @@ -23,8 +29,11 @@
static_configs:
- targets: ['jetstream-consumer:9101']

# Prometheus self-monitoring
# Prometheus self-monitoring (over HTTPS since tls_server_config is active)
- job_name: 'prometheus'
scheme: https
tls_config:
ca_file: /etc/prometheus/tls/ca.crt
static_configs:
- targets: ['localhost:9090']

Expand Down
4 changes: 3 additions & 1 deletion configs/promtail.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
server:

Check warning on line 1 in configs/promtail.yml

View workflow job for this annotation

GitHub Actions / Validate configs

1:1 [document-start] missing document start "---"
http_listen_port: 9080
grpc_listen_port: 0

Expand All @@ -6,7 +6,9 @@
filename: /run/promtail/positions.yaml

clients:
- url: http://loki:3100/loki/api/v1/push
- url: https://loki:3100/loki/api/v1/push
tls_config:
ca_file: /etc/promtail/tls/ca.crt

scrape_configs:
# Tail syslog for host-level events (available in WSL2)
Expand Down
27 changes: 20 additions & 7 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
networks:

Check warning on line 1 in docker-compose.yml

View workflow job for this annotation

GitHub Actions / Validate configs

1:1 [document-start] missing document start "---"
argus:
driver: bridge
loki-internal:
Expand All @@ -23,18 +23,16 @@
volumes:
- ./configs/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./rules:/etc/prometheus/rules:ro
- ./certs/prometheus.crt:/etc/prometheus/tls/prometheus.crt:ro
- ./certs/prometheus.key:/etc/prometheus/tls/prometheus.key:ro
- ./certs/ca.crt:/etc/prometheus/tls/ca.crt:ro
- prometheus_data:/prometheus
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/ready"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
- "--web.config.file=/etc/prometheus/prometheus.yml"
networks:
- argus
deploy:
Expand All @@ -59,6 +57,9 @@
max-file: "3"
volumes:
- ./configs/loki.yml:/etc/loki/local-config.yaml:ro
- ./certs/loki.crt:/etc/loki/tls/loki.crt:ro
- ./certs/loki.key:/etc/loki/tls/loki.key:ro
- ./certs/ca.crt:/etc/loki/tls/ca.crt:ro
- loki_data:/loki
command: -config.file=/etc/loki/local-config.yaml
healthcheck:
Expand Down Expand Up @@ -112,6 +113,7 @@
max-file: "3"
volumes:
- ./configs/promtail.yml:/etc/promtail/config.yml:ro
- ./certs/ca.crt:/etc/promtail/tls/ca.crt:ro
- /var/log:/var/log:ro
- ${NATS_LOG_DIR:-/home/mvillmow/.local/share/nats}:/logs/nats:ro
- /tmp/hermes.log:/logs/hermes.log:ro
Expand Down Expand Up @@ -169,7 +171,8 @@
volumes:
- ./dashboards:/var/lib/grafana/dashboards:ro
- ./configs/grafana:/etc/grafana/provisioning:ro
- ./configs/grafana/alerting.yml:/etc/grafana/provisioning/alerting/alerting.yml:ro
- ./certs/grafana.crt:/etc/grafana/tls/grafana.crt:ro
- ./certs/grafana.key:/etc/grafana/tls/grafana.key:ro
- grafana_data:/var/lib/grafana
environment:
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
Expand All @@ -178,6 +181,9 @@
GF_ANALYTICS_REPORTING_ENABLED: "false"
GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
GF_ANALYTICS_CHECK_FOR_PLUGIN_UPDATES: "false"
GF_SERVER_PROTOCOL: https
GF_SERVER_CERT_FILE: /etc/grafana/tls/grafana.crt
GF_SERVER_CERT_KEY: /etc/grafana/tls/grafana.key
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:3000/api/health"]
interval: 30s
Expand Down Expand Up @@ -213,11 +219,18 @@
- "127.0.0.1:${EXPORTER_PORT:-9100}:9100"
volumes:
- ./exporter/exporter.py:/exporter.py:ro
- ./certs/ca.crt:/certs/ca.crt:ro
command: python /exporter.py
environment:
AGAMEMNON_URL: ${AGAMEMNON_URL:-http://172.20.0.1:8080}
NESTOR_URL: ${NESTOR_URL:-http://172.20.0.1:8081}
NATS_URL: ${NATS_URL:-http://172.24.0.1:8222}
# Set *_TLS_CA to /certs/ca.crt when upstream services serve HTTPS
# with our self-signed CA (or a Tailscale-issued cert).
# Leave empty while upstream services remain HTTP-only.
AGAMEMNON_TLS_CA: ""
NESTOR_TLS_CA: ""
NATS_TLS_CA: ""
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:9100/health"]
interval: 30s
Expand Down
140 changes: 140 additions & 0 deletions docs/tls-setup.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# TLS Setup Runbook — ProjectArgus

This document describes how to enable and maintain TLS for all inter-service communication in the argus observability stack.

## Overview

The stack uses a two-tier TLS strategy:

| Tier | Path | Mechanism |
|------|------|-----------|
| 1 (high priority) | exporter → Agamemnon/NATS/Nestor | Tailscale transport encryption (cross-host WSL2 boundary) |
| 2 (best practice) | Docker-internal services | Self-signed CA + per-service certificates |

## Quick Start

### 1. Generate certificates

```bash
just gen-certs
```

This runs `certs/gen-certs.sh`, which creates:
- `certs/ca.crt` / `certs/ca.key` — local Certificate Authority

Check failure on line 23 in docs/tls-setup.md

View workflow job for this annotation

GitHub Actions / markdownlint

Lists should be surrounded by blank lines

docs/tls-setup.md:23 MD032/blanks-around-lists Lists should be surrounded by blank lines [Context: "- `certs/ca.crt` / `certs/ca.k..."] https://github.com/DavidAnson/markdownlint/blob/v0.37.4/doc/md032.md
- `certs/<service>.crt` / `certs/<service>.key` — one cert per service

Certificates are valid for 10 years. Generated files in `certs/` (`*.crt`, `*.key`, `*.csr`, `*.srl`) are
git-ignored — private keys must never be committed.

Check failure on line 26 in docs/tls-setup.md

View workflow job for this annotation

GitHub Actions / markdownlint

Line length

docs/tls-setup.md:26:121 MD013/line-length Line length [Expected: 120; Actual: 138] https://github.com/DavidAnson/markdownlint/blob/v0.37.4/doc/md013.md

### 2. Start the stack

```bash
just start
```

All services mount their certificates from `certs/` via the volumes defined in `docker-compose.yml`.

### 3. Verify

```bash
just test-scrape # Prometheus queries over HTTPS
just reload-prometheus # Prometheus reload over HTTPS
just import-dashboards # Grafana API calls over HTTPS
```

Open Grafana at `https://localhost:3001`. Your browser will warn that the certificate was issued by an untrusted
(private) CA; add `certs/ca.crt` to your OS/browser trust store to suppress the warning.

Check failure on line 44 in docs/tls-setup.md

View workflow job for this annotation

GitHub Actions / markdownlint

Line length

docs/tls-setup.md:44:121 MD013/line-length Line length [Expected: 120; Actual: 174] https://github.com/DavidAnson/markdownlint/blob/v0.37.4/doc/md013.md

## Tier 1: Cross-Host Paths (Exporter → Agamemnon / Nestor / NATS)

The exporter reaches Agamemnon (`172.20.0.1:8080`), Nestor (`172.20.0.1:8081`), and NATS (`172.24.0.1:8222`)
across the WSL2 host gateway. These paths cross a network boundary and carry the highest risk.

Check failure on line 48 in docs/tls-setup.md

View workflow job for this annotation

GitHub Actions / markdownlint

Line length

docs/tls-setup.md:48:121 MD013/line-length Line length [Expected: 120; Actual: 172] https://github.com/DavidAnson/markdownlint/blob/v0.37.4/doc/md013.md

**Recommended approach: Tailscale**

Route these URLs through Tailscale IPs instead of raw gateway IPs. Tailscale encrypts the hop end-to-end and sidesteps the self-signed certificate distribution problem for external services.

Check failure on line 52 in docs/tls-setup.md

View workflow job for this annotation

GitHub Actions / markdownlint

Line length

docs/tls-setup.md:52:121 MD013/line-length Line length [Expected: 120; Actual: 190] https://github.com/DavidAnson/markdownlint/blob/v0.37.4/doc/md013.md

Update `docker-compose.yml`:
```yaml

Check failure on line 55 in docs/tls-setup.md

View workflow job for this annotation

GitHub Actions / markdownlint

Fenced code blocks should be surrounded by blank lines

docs/tls-setup.md:55 MD031/blanks-around-fences Fenced code blocks should be surrounded by blank lines [Context: "```yaml"] https://github.com/DavidAnson/markdownlint/blob/v0.37.4/doc/md031.md
AGAMEMNON_URL: "https://<tailscale-ip-of-agamemnon-host>:8080"
NESTOR_URL: "https://<tailscale-ip-of-nestor-host>:8081"
NATS_URL: "https://<tailscale-ip-of-nats-host>:8222"
```

If Agamemnon, Nestor, or NATS serve HTTPS with a certificate issued by our self-signed CA, also set:
```yaml

Check failure on line 62 in docs/tls-setup.md

View workflow job for this annotation

GitHub Actions / markdownlint

Fenced code blocks should be surrounded by blank lines

docs/tls-setup.md:62 MD031/blanks-around-fences Fenced code blocks should be surrounded by blank lines [Context: "```yaml"] https://github.com/DavidAnson/markdownlint/blob/v0.37.4/doc/md031.md
AGAMEMNON_TLS_CA: "/certs/ca.crt"
NESTOR_TLS_CA: "/certs/ca.crt"
NATS_TLS_CA: "/certs/ca.crt"
```

The CA file `/certs/ca.crt` is already mounted in the `argus-exporter` container.

**Fallback: Plain HTTP (current default)**

The default `AGAMEMNON_TLS_CA=""` / `NESTOR_TLS_CA=""` / `NATS_TLS_CA=""` preserves backward compatibility — the exporter uses plain HTTP as long as the upstream services don't serve HTTPS. This avoids `SSL_ERROR_RX_RECORD_TOO_LONG` errors when `https://` is pointed at an HTTP-only endpoint.

Check failure on line 72 in docs/tls-setup.md

View workflow job for this annotation

GitHub Actions / markdownlint

Line length

docs/tls-setup.md:72:121 MD013/line-length Line length [Expected: 120; Actual: 292] https://github.com/DavidAnson/markdownlint/blob/v0.37.4/doc/md013.md

## Tier 2: Docker-Internal Paths

| Service | Certificate | Mounted at |
|---------|-------------|------------|
| Prometheus | `certs/prometheus.{crt,key}` | `/etc/prometheus/tls/` |
| Loki | `certs/loki.{crt,key}` | `/etc/loki/tls/` |
| Grafana | `certs/grafana.{crt,key}` | `/etc/grafana/tls/` |
| Promtail (client) | `certs/ca.crt` | `/etc/promtail/tls/` |

### Grafana CA cert for datasources

Grafana provisioning (`configs/grafana/datasources.yml`) includes `tlsAuthWithCACert: true` and a `secureJsonData.tlsCACert` placeholder. To inject the actual CA cert at startup, either:

Check failure on line 85 in docs/tls-setup.md

View workflow job for this annotation

GitHub Actions / markdownlint

Line length

docs/tls-setup.md:85:121 MD013/line-length Line length [Expected: 120; Actual: 186] https://github.com/DavidAnson/markdownlint/blob/v0.37.4/doc/md013.md

**Option A — Env var injection (recommended for Docker)**

Add to `docker-compose.yml` under `grafana.environment`:
```yaml

Check failure on line 90 in docs/tls-setup.md

View workflow job for this annotation

GitHub Actions / markdownlint

Fenced code blocks should be surrounded by blank lines

docs/tls-setup.md:90 MD031/blanks-around-fences Fenced code blocks should be surrounded by blank lines [Context: "```yaml"] https://github.com/DavidAnson/markdownlint/blob/v0.37.4/doc/md031.md
GF_DATASOURCE_PROMETHEUS_JSONDATA_TLSCACERT: |
<contents of certs/ca.crt>
```

Or use a startup script that patches the provisioning file:
```bash
sed -i "s|# Mount the CA cert content here.*|$(cat certs/ca.crt | sed 's/^/ /')|" \
configs/grafana/datasources.yml
```

**Option B — Grafana UI**

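After startup, navigate to each datasource in the Grafana UI and paste the CA cert content into the
"TLS CA Certificate" field. Export the datasource JSON and check it in. Note that the provisioned datasources in
`configs/grafana/datasources.yml` are declared with `editable: false`; set `editable: true` there first, otherwise
the UI will not let you save the change.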

## Certificate Rotation

1. Remove existing certificates: `rm certs/*.crt certs/*.key certs/*.srl`
2. Regenerate: `just gen-certs`
3. Restart the stack: `just stop && just start`

Or regenerate without removing first (force mode):
```bash
bash certs/gen-certs.sh --force
just stop && just start
```

## Troubleshooting

### `SSL_ERROR_RX_RECORD_TOO_LONG`

This means `https://` was used against a service that is still serving plain HTTP. Check:
1. Is the target service configured with TLS? (Prometheus `tls_server_config`, Loki `http_tls_config`, etc.)
2. Are the certificates mounted correctly? Check `docker compose logs <service>` for TLS init errors.
3. Did `just gen-certs` complete without errors?

### Certificate not trusted in browser

Add `certs/ca.crt` to your OS trust store:
- **Ubuntu/Debian**: `sudo cp certs/ca.crt /usr/local/share/ca-certificates/argus-ca.crt && sudo update-ca-certificates`
- **macOS**: `sudo security add-trusted-cert -d -r trustRoot -k /Library/Keychains/System.keychain certs/ca.crt`
- **Windows**: Import via Certificate Manager (`certmgr.msc`) → Trusted Root Certification Authorities

### Promtail push failures after TLS

Check that Loki is serving HTTPS and that `certs/ca.crt` is present in the container:
```bash
just logs promtail
just logs loki
docker exec argus-promtail ls /etc/promtail/tls/
```
Loading
Loading