From 0695776ec98d3ebc54850041ad28a012746d0bdc Mon Sep 17 00:00:00 2001 From: nexus Date: Fri, 29 May 2026 13:08:06 +0800 Subject: [PATCH] feat(ami): single-instance AMI / appliance form factor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the Packer template, install / first-boot / harden scripts, prod-shape config yamls, and systemd units that bake a Nexus Gateway single-instance appliance image (AWS Marketplace target, with on-prem VMware / KVM / bare- metal reuse planned via the same install logic). What's in the image: - PostgreSQL 16, Valkey 9 with valkey-search, NATS JetStream - The four Nexus Go services (nexus-hub, control-plane, ai-gateway, compliance-proxy) + control-plane-ui Vite dist - nginx HTTPS reverse proxy with /api/, /oauth/, /authserver/, /healthz and SPA fallback - Per-instance secrets, MITM CA, self-signed TLS cert (with IP SANs) and randomised admin password all generated at first boot - Marketplace per-instance-uniqueness invariant honoured (admin password is random per launch, NOT shared across launches) Fixes discovered iterating Packer builds in us-east-1: AWS / Packer: 1. ami_description must be pure ASCII (em dash rejected at end of build, AMI auto-deregistered) systemd ordering: 2. nexus-first-boot.service: drop Before=postgresql.service (deadlocks systemctl start postgresql called from inside the unit) 3. valkey.service: use RuntimeDirectory= instead of ReadWritePaths=/var/run/valkey (the dir is on tmpfs, wiped at boot) 15. first-boot.sh kicks nexus-* + nginx at the tail of its run, to clear sticky "Dependency failed" cascades from the boot race 21. same kick now includes nginx (which fails its ExecStartPre nginx -t before first-boot-ca writes the cert) Valkey + valkey-search: 4. install -m 0755 libsearch.so (Valkey rejects 0644 — no exec bit) 5. Valkey bumped to 9.0.4 (valkey-search 1.2.0 requires >= 9.0.1) Runtime quota / iteration: 6. 
vCPU limit guidance in operator doc (Standard family bucket 16) first-boot idempotency: 7. first-boot-{secrets,ca,db}.sh skip cleanly if state already exists 8. first-boot-db.sh: `EXISTING_URL=\$(grep ... || true)` so pipefail does not abort the script when no DATABASE_URL line is present yet Node / Prisma: 9. first-boot-db.sh: prepend /opt/nexus/node/bin to PATH so npx shebangs resolve `node` 10. drop removed-in-Prisma-7 `--skip-generate` flag from `prisma db push` 11. CREATE ROLE nexus WITH SUPERUSER so the seed can DISABLE TRIGGER ALL on system RI triggers (PG is 127.0.0.1 + SCRAM only) Per-instance config: 12. first-boot.sh stamps publicURL into the four yamls from IMDSv2 (public-ipv4 -> local-ipv4 -> hostname fallback) 13. control-plane.config.yaml has authServer.issuer = env override, first-boot writes AUTH_SERVER_ISSUER to match publicURL 14. compliance-proxy.config.yaml now has the mq / registry / auth blocks that the new validators require 17. first-boot appends https:///auth/callback to cp-ui OAuthClient.redirectUris so the SPA's PKCE flow lands cleanly Auth surface end-to-end: 16. nginx /oauth/ -> control-plane:3001 (PKCE authorize/token endpoints) 18. nginx /authserver/ -> control-plane:3001 (IDP list + password POST) 19. 
cert is regenerated with subjectAltName IPs and added to the system CA trust bundle so the JWT verifier can fetch JWKS over HTTPS at the public IP without skipping verification Docs: - docs/developers/architecture/cross-cutting/deployment/ ami-appliance-architecture.md - full design rationale - docs/operators/ops/ami-build.md - operator-facing build steps + common failure modes - docs/developers/architecture/README.md - trigger-table row for nexus-ami/** -> the new arch doc - top-level README - Deployment section pointing at the AMI form factor - nexus-ami/README.md - quick build / test commands Verified by: - Eleven Packer builds — final build is clean - Two fresh t3.medium launches reach all-9-units-active with no manual intervention on first boot - End-to-end OAuth flow: /oauth/authorize 302 -> /login -> POST /authserver/password 200 -> /oauth/token 200 -> bearer token verifies via JWKS -> /api/admin/me 200 -> /api/admin/me/permissions returns 151 admin actions Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 17 ++ Makefile | 14 +- README.md | 11 + docs/developers/architecture/README.md | 6 + .../deployment/ami-appliance-architecture.md | 260 ++++++++++++++++++ docs/operators/README.md | 1 + docs/operators/ops/ami-build.md | 81 ++++++ nexus-ami/README.md | 143 ++++++++++ .../artifacts/configs/ai-gateway.config.yaml | 87 ++++++ .../configs/compliance-proxy.config.yaml | 175 ++++++++++++ .../configs/control-plane.config.yaml | 97 +++++++ .../artifacts/configs/nexus-hub.config.yaml | 97 +++++++ nexus-ami/artifacts/configs/nginx-nexus.conf | 87 ++++++ nexus-ami/artifacts/systemd/nats.service | 25 ++ .../systemd/nexus-control-plane.service | 25 ++ .../systemd/nexus-first-boot.service | 26 ++ .../artifacts/systemd/nexus-gateway.service | 25 ++ nexus-ami/artifacts/systemd/nexus-hub.service | 25 ++ .../artifacts/systemd/nexus-proxy.service | 25 ++ nexus-ami/artifacts/systemd/valkey.service | 32 +++ nexus-ami/build.sh | 119 ++++++++ nexus-ami/nexus.pkr.hcl | 
109 ++++++++ nexus-ami/scripts/first-boot-ca.sh | 83 ++++++ nexus-ami/scripts/first-boot-db.sh | 186 +++++++++++++ nexus-ami/scripts/first-boot-secrets.sh | 72 +++++ nexus-ami/scripts/first-boot.sh | 114 ++++++++ nexus-ami/scripts/harden.sh | 78 ++++++ nexus-ami/scripts/install-nats.sh | 57 ++++ nexus-ami/scripts/install-node-prisma.sh | 54 ++++ nexus-ami/scripts/install-postgres.sh | 23 ++ nexus-ami/scripts/install-valkey.sh | 215 +++++++++++++++ nexus-ami/scripts/install.sh | 184 +++++++++++++ nexus-ami/scripts/set-admin-password.js | 30 ++ 33 files changed, 2582 insertions(+), 1 deletion(-) create mode 100644 docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md create mode 100644 docs/operators/ops/ami-build.md create mode 100644 nexus-ami/README.md create mode 100644 nexus-ami/artifacts/configs/ai-gateway.config.yaml create mode 100644 nexus-ami/artifacts/configs/compliance-proxy.config.yaml create mode 100644 nexus-ami/artifacts/configs/control-plane.config.yaml create mode 100644 nexus-ami/artifacts/configs/nexus-hub.config.yaml create mode 100644 nexus-ami/artifacts/configs/nginx-nexus.conf create mode 100644 nexus-ami/artifacts/systemd/nats.service create mode 100644 nexus-ami/artifacts/systemd/nexus-control-plane.service create mode 100644 nexus-ami/artifacts/systemd/nexus-first-boot.service create mode 100644 nexus-ami/artifacts/systemd/nexus-gateway.service create mode 100644 nexus-ami/artifacts/systemd/nexus-hub.service create mode 100644 nexus-ami/artifacts/systemd/nexus-proxy.service create mode 100644 nexus-ami/artifacts/systemd/valkey.service create mode 100755 nexus-ami/build.sh create mode 100644 nexus-ami/nexus.pkr.hcl create mode 100755 nexus-ami/scripts/first-boot-ca.sh create mode 100755 nexus-ami/scripts/first-boot-db.sh create mode 100755 nexus-ami/scripts/first-boot-secrets.sh create mode 100755 nexus-ami/scripts/first-boot.sh create mode 100755 nexus-ami/scripts/harden.sh create mode 100755 
nexus-ami/scripts/install-nats.sh create mode 100755 nexus-ami/scripts/install-node-prisma.sh create mode 100755 nexus-ami/scripts/install-postgres.sh create mode 100755 nexus-ami/scripts/install-valkey.sh create mode 100755 nexus-ami/scripts/install.sh create mode 100644 nexus-ami/scripts/set-admin-password.js diff --git a/.gitignore b/.gitignore index 295d8d37..905585fe 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,23 @@ node_modules/ dist/ +# nexus-ami staging artifacts populated by nexus-ami/build.sh — the binaries, +# UI dist, and Prisma bundle are derived from the source tree on every build +# and should never be committed. Architecture: +# docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +nexus-ami/artifacts/bin/ +nexus-ami/artifacts/ui-dist/ +nexus-ami/artifacts/prisma/ +nexus-ami/artifacts/scripts/ +nexus-ami/artifacts.tar.gz +# Build-cycle log + tarball rotations created by ./build.sh during iteration. +nexus-ami/artifacts.tar.gz.* +nexus-ami/build.log* +# Packer leaves a manifest + crash log in the working dir. +nexus-ami/packer_cache/ +nexus-ami/manifest.json +nexus-ami/crash.log + # Wails dashboard build artefacts (regenerated by `wails build` / # `wails dev`; bindings are auto-generated from the Go bridge). # Wails build outputs. 
Everything is generated EXCEPT: diff --git a/Makefile b/Makefile index 71ae8d7c..bda5b8d1 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,8 @@ compliance-proxy-build compliance-proxy-test \ agent-build agent-test \ agent-build-macos agent-package-macos agent-clean-macos \ - agent-build-windows agent-package-windows agent-clean-windows + agent-build-windows agent-package-windows agent-clean-windows \ + ami-build ami-stage # ── Build output convention ──────────────────────────────────────────── # All Go service binaries land in dist/bin// so they @@ -132,3 +133,14 @@ agent-package-windows: agent-build-windows agent-clean-windows: rm -rf dist/windows + +# ── AMI / appliance build (E-OSS marketplace) ──────────────────────── +# Wraps Go binaries + UI dist + Prisma bundle + Packer build into one +# invocation. Architecture: +# docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md + +ami-build: + bash nexus-ami/build.sh + +ami-stage: + bash nexus-ami/build.sh --skip-packer diff --git a/README.md b/README.md index 61f3a9a6..d545f96b 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,17 @@ The lateral dotted arrow is the **attestation handoff**: the Agent always egress --- +## Deployment + +| Form factor | How | Status | +|---|---|---| +| **AWS Marketplace AMI / single-instance appliance** | `cd nexus-ami && ./build.sh` — bakes binaries + UI + Prisma + nginx + Postgres + Valkey + NATS into one AL2023 image via Packer | [`nexus-ami/README.md`](./nexus-ami/README.md) for build steps, [`docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md`](./docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md) for design | +| **Local development** | docker-compose + `./scripts/dev-start.sh` (Postgres + Valkey + NATS) and per-service `go run ./cmd//` | See **Quick start** below | +| **VMware / KVM image / bare-metal appliance** | Reuses the same `install.sh` + `harden.sh` from `nexus-ami/scripts/` 
under a different Packer builder | Future | +| **Container / Kubernetes** | Out of scope for the appliance form factor — separate product line | Future | + +--- + ## Quick start (local development) ### Prerequisites diff --git a/docs/developers/architecture/README.md b/docs/developers/architecture/README.md index 9cc3866a..baf94391 100644 --- a/docs/developers/architecture/README.md +++ b/docs/developers/architecture/README.md @@ -168,6 +168,12 @@ If you are about to edit code in an area that is genuinely **not** covered by an | i18n keys (`t('namespace:section.key')`), locale files (`packages/*/src/i18n/locales/**`), `packages/ui-shared/src/i18n/**` | `docs/developers/architecture/cross-cutting/ui/ui-i18n-architecture.md` | | `useApi` / `useApiMutation` hooks + queryKey shape, `shellRouteConfig.tsx` / `Sidebar.tsx` IA, `packages/ui-shared/**` cross-bundle components | `docs/developers/architecture/cross-cutting/ui/ui-shell-architecture.md` | +## Cross-cutting — deployment + +| Editing area / file glob | Read FIRST | +|---|---| +| `nexus-ami/**` — Packer template, install / first-boot / harden scripts, prod-shape `*.config.yaml`, systemd unit files for the AMI / bare-metal appliance form factor | `docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md` | + ## Adding a new arch doc When you ship a new `docs/developers/architecture/**/*-architecture.md`: diff --git a/docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md b/docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md new file mode 100644 index 00000000..473f2862 --- /dev/null +++ b/docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md @@ -0,0 +1,260 @@ +--- +updated: 2026-05-28 +--- + +# AMI / appliance deployment architecture + +Single-box deployment form factor for Nexus Gateway. 
Packages **all** runtime +dependencies (PostgreSQL 16, Valkey 9 with `valkey-search`, NATS JetStream, +4 Go services, the React UI, and an nginx reverse proxy) into one disk image +managed by systemd. The same artifacts ship as: + +| Target | Wrapped by | +|---|---| +| **AWS Marketplace AMI** | `nexus-ami/nexus.pkr.hcl` (Packer + Amazon Linux 2023) | +| **VMware / KVM image** | future — same `install.sh`, different Packer builder | +| **Bare-metal appliance** | future — same `install.sh` invoked from a kickstart / preseed | + +This doc is the architecture source of truth for **everything under +`nexus-ami/`**. Any change to a config file, systemd unit, install script, +or first-boot script in that directory MUST update this doc in the same +commit (Code/Doc Lockstep — see `.cursor/rules/code-doc-lockstep.mdc`). + +## 1. Why one form factor for AMI + bare-metal + +Two distribution channels share the same install logic: + +- **Cloud appliance** — AWS Marketplace AMI (initial target). Customer hits + "Launch", gets a working single-instance Nexus in ~5 minutes. +- **On-prem appliance** — pre-installed disk image / ISO for hardware + shipped to customer sites (future). Same systemd-managed services, same + first-boot secret generation. + +Containerised / Kubernetes deployment is **out of scope** for this doc. If +the project later ships a Helm chart or container Marketplace listing, that +is a separate architecture (`-container-architecture.md`) with its +own dependency wiring (RDS / ElastiCache / managed MQ). + +## 2. Boot sequence (every fresh instance / fresh hardware) + +``` +1. cloud-init / kickstart → network + ec2-user / nexus shell login +2. firewalld → open 443, 3128, 22; close everything else +3. 
nexus-first-boot.service → oneshot, gated by /etc/nexus/.initialized + ├─ first-boot-secrets.sh → generate 5 [MUST MATCH] secrets, write + │ /etc/nexus/{nexus-hub,control-plane, + │ ai-gateway,compliance-proxy}.env + ├─ first-boot-ca.sh → generate compliance-proxy MITM CA at + │ /etc/compliance-proxy/{ca.crt,ca.key} + └─ first-boot-db.sh → start postgresql, wait, prisma db push, + prisma db seed, randomise admin password, + write /var/log/nexus/admin-credentials.txt + and /etc/motd +4. postgresql.service → After=nexus-first-boot +5. valkey.service → After=nexus-first-boot +6. nats.service → After=nexus-first-boot +7. nexus-hub.service → After=postgresql valkey nats +8. nexus-control-plane.service → After=nexus-hub +9. nexus-gateway.service → After=nexus-hub +10. nexus-proxy.service → After=nexus-hub +11. nginx.service → After=nexus-control-plane (reverse proxy) +``` + +`/etc/nexus/.initialized` is the idempotency marker. Removing it triggers a +fresh init on next boot (destructive — generates new secrets, re-seeds DB). +Customers should never touch it. + +## 3. 
Filesystem layout + +| Path | Owner | Mode | Contents | +|---|---|---|---| +| `/opt/nexus/bin/` | root:root | 0755 | 4 Go service binaries (immutable, part of AMI) | +| `/opt/nexus/ui/` | root:root | 0755 | Vite-built UI dist (immutable, part of AMI) | +| `/opt/nexus/prisma/` | root:root | 0755 | Prisma schema + seed (immutable, part of AMI) | +| `/etc/nexus/` | root:nexus | 0750 | 4 prod-shape `*.config.yaml` + 4 `*.env` + nginx-nexus.conf + `.initialized` marker | +| `/etc/compliance-proxy/` | root:nexus | 0750 | MITM CA cert + key (generated first-boot) | +| `/var/lib/nexus/` | nexus:nexus | 0750 | Service runtime state (agent CA dir, NDJSON spool, file-backed alerting state) | +| `/var/lib/postgresql/data/` | postgres:postgres | 0700 | PostgreSQL data directory (AL2023 dnf default) | +| `/var/lib/valkey/` | valkey:valkey | 0750 | Valkey AOF + RDB | +| `/var/lib/nats/` | nats:nats | 0750 | NATS JetStream file store | +| `/var/log/nexus/` | nexus:nexus | 0750 | Service log files (rotated by logrotate); also holds `admin-credentials.txt` (mode 0640, root:nexus) | + +## 4. Secret generation (`first-boot-secrets.sh`) + +Five environment variables MUST be unique-per-instance and identical across +the four services that share them (see `.env.example` `[MUST MATCH]` tags): + +| Env var | Used by | Generation | +|---|---|---| +| `INTERNAL_SERVICE_TOKEN` | all 4 | `openssl rand -hex 32` | +| `ADMIN_KEY_HMAC_SECRET` | control-plane, ai-gateway | `openssl rand -hex 32` | +| `CREDENTIAL_ENCRYPTION_KEY` | control-plane, ai-gateway | `openssl rand -hex 32` (AES-256, 64 hex chars) | +| `COMPLIANCE_PROXY_API_TOKEN` | control-plane, compliance-proxy | `openssl rand -hex 32` | +| `AI_GATEWAY_API_TOKEN` | ai-gateway only | `openssl rand -hex 32` | + +Each is written to the appropriate per-service `.env` file under `/etc/nexus/` +which the systemd unit picks up via `EnvironmentFile=`. File mode `0640`, +owner `root:nexus` (services run as `nexus` and read; only root can rewrite). 
+ +`DATABASE_URL`, `REDIS_ADDRS`, `NATS_URL`, `NEXUS_HUB_URL`, +`AUTH_SERVER_URL`, `AUTH_SERVER_JWKS_URL`, `AUTH_SERVER_ISSUER`, +`AI_GATEWAY_URL`, `COMPLIANCE_PROXY_URL`, `COMPLIANCE_PROXY_RUNTIME_URL` — +all bind to `localhost` with fixed ports (see §6), baked into the per-service +`.env` files at first boot. + +## 5. Database initialisation (`first-boot-db.sh`) + +1. `systemctl start postgresql` (synchronous via `--wait`). +2. `psql` create role `nexus` with a per-instance random password; create + database `nexus_gateway` owned by `nexus`. +3. Write the matching `DATABASE_URL=postgresql://nexus:@localhost:5432/nexus_gateway?sslmode=disable` + into every `*.env` file under `/etc/nexus/`. +4. `cd /opt/nexus/prisma && npx prisma db push` to materialise + the schema (no migration history table — fresh instance, no upgrade path + to preserve). +5. `npx tsx seed/seed.ts` to load baseline rows (organisations, IAM, + roles, default settings — see `tools/db-migrate/seed/seed.ts`). +6. Generate a 24-character random admin password, hash it with the same + scrypt parameters the seed uses (`tools/db-migrate/seed/lib.ts` + `hashPassword()` — N=16384, r=8, p=1, salt=32, key=64), and + `UPDATE "NexusUser" SET "passwordHash" = $1 WHERE email = 'admin@nexus.ai'`. +7. Write the plaintext password + login URL + warning to + `/var/log/nexus/admin-credentials.txt` (mode 0640, root:nexus) and append + a one-screen summary to `/etc/motd` so the operator sees it on first SSH. + +`admin@nexus.ai` is the only seeded user whose password is randomised per instance. All +other seeded users (alice / bob / carol / diana etc., listed in +`packages/control-plane-ui/README.md`) keep their dev-time passwords from +the seed and are documented as "demo accounts — disable for production" +in the operator-facing docs. + +## 6. Port map (all bound to `localhost` except nginx + compliance-proxy) + +| Port | Service | Binding | Exposed via firewall? 
| +|---|---|---|---| +| 5432 | PostgreSQL | localhost:5432 | no | +| 6379 | Valkey | localhost:6379 | no | +| 4222 | NATS client | localhost:4222 | no | +| 8222 | NATS HTTP monitoring | localhost:8222 | no | +| 3060 | Nexus Hub | localhost:3060 | no | +| 3001 | Control Plane API | localhost:3001 | no (nginx proxies `/api/*`) | +| 3050 | AI Gateway | 0.0.0.0:3050 | **yes** (SDK clients hit this directly) | +| 3040 | Compliance Proxy runtime API | localhost:3040 | no | +| 3128 | Compliance Proxy CONNECT | 0.0.0.0:3128 | **yes** (network-proxied apps) | +| 9090 | Prometheus metrics | localhost:9090 | no | +| 443 | nginx (UI + `/api/*` reverse proxy) | 0.0.0.0:443 | **yes** | +| 22 | sshd | 0.0.0.0:22 | yes (Marketplace standard) | + +The compliance-proxy CA file path (`/etc/compliance-proxy/ca.crt`, +`/etc/compliance-proxy/ca.key`) is hardcoded into the prod-shape config +because the path is also baked into the systemd unit's `ReadWritePaths` and +into the `first-boot-ca.sh` generator — three places must agree. + +## 7. Hardening (`harden.sh`) + +Runs as the **last** Packer provisioner (after `install.sh`). Standard +AWS Marketplace AMI cleanup; without this the AMI fails the Self-Service +Scan and is rejected on submission. 
+ +| Action | Why | +|---|---| +| `rm -f /root/.ssh/authorized_keys /home/*/.ssh/authorized_keys` | No shared SSH keys (customers BYO) | +| `rm -f /etc/ssh/ssh_host_*` | Regenerated on first boot — no shared host keys across instances | +| `sed -i sshd_config` (PasswordAuthentication=no, PermitRootLogin=no) | Hard requirement for AWS Marketplace | +| `passwd -l root` | Lock root password | +| `find / -name authorized_keys -delete` | Recursive scrub | +| `rm -rf /var/lib/postgresql/data/* /var/lib/valkey/* /var/lib/nats/*` | Clear any pg/valkey/nats state accumulated during install validation | +| `truncate -s 0 /etc/machine-id` | Regenerated on first boot | +| `cloud-init clean --logs` | Fresh cloud-init state | +| `dnf clean all` | Shrink AMI size | +| `find /var/log -type f -exec truncate -s 0 {} \;` | No leaked build-time logs | +| `dd if=/dev/zero of=/zerofile && rm /zerofile && sync` | Free-space zeroing — EBS snapshot dedupes better | + +## 8. AMI build pipeline (`nexus-ami/build.sh` → `nexus.pkr.hcl`) + +``` +make build-all → dist/bin// (4 Go binaries) +make control-plane-ui-build → packages/control-plane-ui/dist/ +build.sh stages → nexus-ami/artifacts/ → flatten + copy + tar +packer init . && packer build → AMI ID in us-east-1 +``` + +Packer steps: + +1. Launch an `m5.4xlarge` builder instance (16 vCPU / 64 GB) from the + latest Amazon Linux 2023 AMI. **Must be `m5.4xlarge` (or larger), not + `t3.2xlarge`** — valkey-search 1.x vendors gRPC + Protobuf + Abseil + + ICU as submodules; template-heavy parallel C++ compile is heap-hungry + per translation unit. Empirically, `t3.2xlarge` (32 GB) is OOM-killed + silently mid-ICU-compile after ~11 minutes (kernel OOM-killer kills sshd + before the script can write stderr — no trace in Packer build logs); + 64 GB clears the failure mode. 2026-05-28 build evidence. +1a. 
**Linker = lld, not GNU ld.** `install-valkey.sh` installs `lld${ver}` + alongside `clang${ver}` and exports `LDFLAGS=-fuse-ld=lld` before + invoking valkey-search's `./build.sh`. Reason: valkey-search compiles + with `-flto`, and linking `libsearch.so` requires LTO bitcode handling. + GNU ld delegates LTO to the LLVMgold.so plugin, but AL2023's `clang20` + package **omits** LLVMgold.so (verified 2026-05-28: link failed with + `cannot open /usr/lib64/llvm20/lib64/LLVMgold.so`). lld is LLVM's + native linker and handles LTO bitcode directly without a plugin. +2. `file` provisioner uploads `nexus-ami/artifacts.tar.gz` (single file, + ~120 MB) to `/tmp/nexus-artifacts.tar.gz`. We deliberately do NOT upload + `artifacts/` as a directory — Packer's file provisioner uses recursive + SCP under the hood, which silently drops individual files on slow links + (a problem we hit on China → us-east-1 at ~250 KB/s). A single-file + transfer is atomic and fails loudly. +3. `shell` provisioner runs `scripts/install.sh`. The script first extracts + the tarball to `/tmp/nexus/`, then (~10 minutes total) installs + Postgres, builds Valkey from source, installs NATS, installs Node + + Prisma, places binaries + configs + systemd units. +4. `shell` provisioner runs `scripts/harden.sh` (~30 seconds). +5. Packer snapshots the EBS root volume → registers the AMI. + +Total build time: 15–20 minutes per region (on good links; ++5–10 minutes for the cross-Pacific tarball upload from China). + +## 9. Instance sizing recommendation (Marketplace listing) + +| Tier | Instance type | When | +|---|---|---| +| Minimum | `t3.large` (2 vCPU / 8 GB) | PoC, ≤ 100 traffic events/hour | +| Recommended | `t3.xlarge` (4 vCPU / 16 GB) | Small production, ≤ 10k events/hour | +| Performance | `m5.2xlarge` (8 vCPU / 32 GB) | Production, ≤ 100k events/hour | + +Root volume: **≥ 30 GiB** (Postgres + Valkey + NATS file store + log +retention). Marketplace listing should state this requirement explicitly. + +## 10. 
Out of scope (intentionally) + +- **HA / multi-instance** — by design single-instance. Customers wanting HA + use the Kubernetes / container deployment form factor (separate listing). +- **Schema migration across versions** — pre-GA policy is "fresh install + on every AMI version bump"; customers re-launch a new AMI and re-load + their data via the admin API. Documented as an evaluation product in + the Marketplace listing. +- **External SSO** — AMI ships with the embedded auth server bound to + `localhost`; OIDC federation requires the customer to edit + `/etc/nexus/control-plane.config.yaml` `authServer:` block and restart + the service. +- **TLS termination on a real domain** — AMI ships nginx with a self-signed + cert generated at first boot; documented as "replace with your domain's + cert in `/etc/nexus/tls.{crt,key}` and restart nginx". +- **Agent fleet enrollment from this AMI** — works, but the agent's + bootstrap URL needs to be reachable from the agent host; this is a + network-topology concern documented in the user-facing deployment guide, + not an AMI-side decision. + +## 11. Memory anchors + +- `[[ami_first_boot_5_secrets]]` — five `[MUST MATCH]` secrets must be + written before any Nexus service starts, or services 401 each other. +- `[[ami_random_admin_password_marketplace_safe]]` — random per-instance + admin password is the cheapest defence against the AWS Marketplace + default-credentials finding category. + +## 12. Related docs + +- `.env.example` — canonical env var contract (the AMI honours every + `[MUST MATCH]` tag). +- `docs/developers/architecture/cross-cutting/foundation/configuration-architecture.md` — 4-layer config model the AMI plugs into at L2 (yaml) + L3 (env). +- `nexus-ami/README.md` — operator-facing build / test / publish runbook. 
diff --git a/docs/operators/README.md b/docs/operators/README.md index 67ed22c7..a092ea59 100644 --- a/docs/operators/README.md +++ b/docs/operators/README.md @@ -6,6 +6,7 @@ Documentation for running Nexus Gateway in production. - [`ops/`](./ops/) — deployment and operations guides: - [`deployment.md`](./ops/deployment.md) — bring-up and topologies. + - [`ami-build.md`](./ops/ami-build.md) — build the single-instance appliance AMI (AWS Marketplace). - [`ec2-single-node.md`](./ops/ec2-single-node.md) — a single-node deployment. - [`install-test-env.md`](./ops/install-test-env.md) — a single-host test or staging install. - [`backup-dr.md`](./ops/backup-dr.md) — backup and disaster recovery. diff --git a/docs/operators/ops/ami-build.md b/docs/operators/ops/ami-build.md new file mode 100644 index 00000000..c845af22 --- /dev/null +++ b/docs/operators/ops/ami-build.md @@ -0,0 +1,81 @@ +--- +updated: 2026-05-29 +--- + +# AMI build (single-instance appliance) + +How to build the AWS Marketplace AMI / single-instance appliance image. The +source-of-truth for everything in this guide is [`nexus-ami/README.md`](../../../nexus-ami/README.md); +the design rationale is captured in +[`docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md`](../../developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md). + +## When to use this + +- Cutting a release for the AWS Marketplace listing. +- Producing an image for an on-prem customer that wants a single-VM install. +- Smoke-testing a Packer / install-script change before publishing. + +## Prerequisites + +- Go 1.25+, Node 20+, Packer 1.10+. +- AWS credentials with `AWS_PROFILE=` exporting EC2 + S3 + IAM + permissions in `us-east-1`. +- An EC2 key pair on the target account if you intend to launch + instances (`t3.medium` or larger) from the AMI after build. +- vCPU headroom: the build runs on an `m5.4xlarge` (16 vCPU). 
If the + Standard-family quota is 16 and another instance is already running, stop + it or request a quota bump first (otherwise Packer fails immediately with + `VcpuLimitExceeded`). + +## Build + +```bash +cd nexus-ami +./build.sh # full pipeline (compile + stage + packer build, ~55 min) +./build.sh --skip-packer # CI dry-run — stage only, skip the EC2 launch +``` + +A successful build prints the new AMI id (e.g. `ami-0xxxxxxxx`) and a +snapshot id. + +## After the build + +1. **Share with the Marketplace scanner** (account `679593333241`): + + ```bash + aws ec2 modify-image-attribute --image-id \ + --launch-permission "Add=[{UserId=679593333241}]" \ + --profile --region us-east-1 + aws ec2 modify-snapshot-attribute --snapshot-id \ + --create-volume-permission "Add=[{UserId=679593333241}]" \ + --profile --region us-east-1 + ``` + +2. **Trigger the AMI scan** in Partner Central → AMI Management Portal. + +3. **Test the AMI**: + + ```bash + aws ec2 run-instances --image-id --instance-type t3.medium \ + --key-name --associate-public-ip-address \ + --profile --region us-east-1 + # SSH in, then: sudo cat /var/log/nexus/admin-credentials.txt + ``` + + Two instances launched from the same AMI MUST have different admin + passwords — that is the most important first-boot invariant. + +## Common failure modes + +| Symptom | Root cause | Fix | +|---|---|---| +| `VcpuLimitExceeded` immediately at `packer build` | Standard-family quota hit because another instance is running | Stop or terminate it, or request a quota raise | +| `Script disconnected unexpectedly` mid-Valkey compile | Build host OOM-killed sshd | Default is `m5.4xlarge`; do not lower | +| `InvalidParameterValue: Character sets beyond ASCII are not supported` at `Modifying attributes on AMI` | Non-ASCII in `ami_description` (e.g. 
em dash) | Keep `nexus.pkr.hcl` `ami_description` ASCII-only | +| First-boot completes but 4 nexus-* services stay `inactive` | Boot-order race — nexus-* tried to start before postgres was up | Already handled by `first-boot.sh`'s tail `kick` block | + +## Iteration cadence + +Plan a **monthly rebuild** to absorb AL2023 + Postgres + Valkey + NATS CVE +patches. `./build.sh` is the single command; wire it into a CI cron once +the AMI is stabilised. diff --git a/nexus-ami/README.md b/nexus-ami/README.md new file mode 100644 index 00000000..d960dffe --- /dev/null +++ b/nexus-ami/README.md @@ -0,0 +1,143 @@ +# Nexus Gateway — AMI / appliance build + +Single-instance, all-in-one Nexus Gateway packaged as an AWS Marketplace AMI. +Same artifacts are the foundation for the future on-prem appliance form factor +(bare-metal / VMware / KVM disk images). + +> **Source of truth for everything in this directory:** +> [`docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md`](../docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md). +> Read it first before changing scripts, configs, or systemd units in this tree. 
+ +## What's in the AMI + +| Layer | Component | Source | +|---|---|---| +| Runtime deps | PostgreSQL 16 | `dnf install postgresql16-server` (AL2023 default) | +| Runtime deps | Valkey 9 + `valkey-search` module | `scripts/install-valkey.sh` (source compile) | +| Runtime deps | NATS Server 2 (JetStream) | `scripts/install-nats.sh` (official binary) | +| Runtime deps | Node.js 20 + Prisma + tsx | `scripts/install-node-prisma.sh` (first-boot only) | +| Runtime deps | nginx | `dnf install nginx` | +| Nexus | Hub binary (3060) | `make nexus-hub-build` → `dist/bin/nexus-hub/nexus-hub` | +| Nexus | Control Plane binary (3001) | `make control-plane-build` | +| Nexus | AI Gateway binary (3050) | `make ai-gateway-build` | +| Nexus | Compliance Proxy binary (3128) | `make compliance-proxy-build` | +| Nexus | Control Plane UI dist | `make control-plane-ui-build` → `packages/control-plane-ui/dist/` | +| Nexus | DB schema + seed | `tools/db-migrate/{schema.prisma, seed/}` | +| Nexus | 4 prod-shape `*.config.yaml` | `artifacts/configs/` | +| Nexus | 7 systemd units | `artifacts/systemd/` | + +## Quick build + +```bash +# Prerequisites: Go 1.25+, Node 20+, Packer 1.10+, AWS credentials. +cd nexus-ami +./build.sh # full pipeline: compile + stage + packer build +./build.sh --skip-packer # stop after staging (CI dry-run) +``` + +The full pipeline takes 20–30 minutes: + +1. `make build-all` — Go binaries (≈ 2 min) +2. `make control-plane-ui-build` — Vite UI dist (≈ 30 s) +3. Stage `artifacts/{bin,ui-dist,prisma}` (≈ 5 s) +4. `packer build` — launches an `m5.4xlarge`, runs `install.sh` (Valkey + source compile is the long pole) + `harden.sh`, snapshots the AMI + (≈ 15–20 min) + +Output: a registered AMI ID in your AWS account (region per +`nexus.pkr.hcl` `aws_region` variable, default `us-east-1`). + +## Test a fresh AMI manually + +```bash +# 1. Launch a t3.xlarge from the AMI you just built. Wait for it to boot. +# 2. 
SSH in with your EC2 key pair: +ssh -i ~/.ssh/your-key.pem ec2-user@<instance-public-ip> + +# 3. Read the per-instance admin credentials: +sudo cat /var/log/nexus/admin-credentials.txt + +# 4. Verify all 9 Nexus-related units are green: +systemctl status nexus-first-boot postgresql valkey nats \ + nexus-hub nexus-control-plane nexus-gateway nexus-proxy nginx + +# 5. Open https://<instance-public-ip>/ in a browser (accept the self-signed cert), +# log in with the credentials from step 3. + +# 6. Launch a SECOND instance from the same AMI and confirm +# /var/log/nexus/admin-credentials.txt contains a DIFFERENT password. +# Per-instance secret uniqueness is the most important first-boot invariant. +``` + +## Self-Service AMI Scan iteration + +Run AWS's Self-Service Scan from the Partner Central → Marketplace +Management Portal. Expect 2–3 rebuild cycles before the scan returns +zero findings. Common first-build hits the scan catches: + +- A package update landed a new CVE — `dnf update -y` is in `install.sh` + so the rebuild self-fixes; just re-run `packer build`. +- An overlooked `authorized_keys` file — re-run `harden.sh` (already + hardened with recursive `find / -name authorized_keys -delete`). +- SSH config not strict enough — `harden.sh` already enforces + `PasswordAuthentication=no`, `PermitRootLogin=no`, + `PermitEmptyPasswords=no`. If the scanner cites a new sshd directive, + add it to `harden.sh`. 
+ +## Directory layout + +``` +nexus-ami/ +├── README.md ← this file +├── nexus.pkr.hcl ← Packer template +├── build.sh ← orchestrator (compile → stage → packer) +├── artifacts/ ← Packer file-provisioner source +│ ├── bin/ ← populated by build.sh (gitignored) +│ ├── ui-dist/ ← populated by build.sh (gitignored) +│ ├── prisma/ ← populated by build.sh (gitignored) +│ ├── configs/ +│ │ ├── nexus-hub.config.yaml +│ │ ├── control-plane.config.yaml +│ │ ├── ai-gateway.config.yaml +│ │ ├── compliance-proxy.config.yaml +│ │ └── nginx-nexus.conf +│ └── systemd/ +│ ├── nexus-first-boot.service +│ ├── valkey.service +│ ├── nats.service +│ ├── nexus-hub.service +│ ├── nexus-control-plane.service +│ ├── nexus-gateway.service +│ └── nexus-proxy.service +└── scripts/ + ├── install.sh ← orchestrator (runs at Packer time) + ├── install-postgres.sh + ├── install-valkey.sh + ├── install-nats.sh + ├── install-node-prisma.sh + ├── first-boot.sh ← orchestrator (runs once per instance) + ├── first-boot-secrets.sh + ├── first-boot-ca.sh + ├── first-boot-db.sh + ├── set-admin-password.js ← Node helper, deployed to /opt/nexus/prisma/ + └── harden.sh ← Marketplace cleanup (LAST provisioner) +``` + +## What's intentionally NOT here + +- **Multi-instance HA / Kubernetes manifests** — the appliance form factor + is single-instance by design. Container / K8s deployment is a separate + product line with its own architecture doc. +- **Schema migration across Nexus versions** — pre-GA policy. Customers + re-launch a new AMI version and re-create their workloads through the + admin API. Documented in the Marketplace listing as an evaluation + product. +- **Real TLS certificate provisioning** — first-boot generates a self-signed + cert at `/etc/nexus/tls.{crt,key}`. Operators replace with a real cert + and `systemctl reload nginx`. + +## Maintenance cadence + +Plan a **monthly rebuild** to absorb AL2023 + Postgres + Valkey + NATS +CVE patches. 
`build.sh` is the single command; wire it into a CI cron +once the AMI is stabilised. diff --git a/nexus-ami/artifacts/configs/ai-gateway.config.yaml b/nexus-ami/artifacts/configs/ai-gateway.config.yaml new file mode 100644 index 00000000..701c07e9 --- /dev/null +++ b/nexus-ami/artifacts/configs/ai-gateway.config.yaml @@ -0,0 +1,87 @@ +# Nexus AI Gateway — prod-shape config for AMI / appliance form factor. +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# Secrets + DATABASE_URL loaded from /etc/nexus/ai-gateway.env at boot. + +server: + port: 3050 + readTimeout: "30s" + writeTimeout: "360s" + +database: + url: "" # env DATABASE_URL + +redis: + mode: standalone + addrs: ["127.0.0.1:6379"] + username: "" + password: "" # env REDIS_PASSWORD + db: 0 + sentinel: + masterName: "" + username: "" + password: "" + cluster: + maxRedirects: 8 + routeRandomly: false + readOnly: false + tls: + enabled: false + insecureSkipVerify: false + caFile: "" + certFile: "" + keyFile: "" + serverName: "" + poolSize: 200 + minIdleConns: 50 + maxRetries: 3 + dialTimeout: 5s + readTimeout: 3s + writeTimeout: 3s + poolTimeout: 4s + +auth: + hmacSecret: "" # env ADMIN_KEY_HMAC_SECRET + credentialMasterKey: "" # env CREDENTIAL_ENCRYPTION_KEY (64 hex chars) + credentialKeyMap: "" + internalServiceToken: "" # env INTERNAL_SERVICE_TOKEN + +log: + level: "info" + format: "json" + file: "/var/log/nexus/ai-gateway.log" + +registry: + nexusHubUrl: "http://127.0.0.1:3060" + +mq: + driver: "nats" + nats: + url: "nats://127.0.0.1:4222" + +cors: + enabled: false + allowedOrigins: [] + allowedMethods: ["GET", "POST", "OPTIONS"] + allowedHeaders: ["Content-Type", "Authorization", "x-nexus-virtual-key", "x-request-id"] + maxAgeSec: 600 + +cache: + enabled: true + ttl: 5m + prefix: "ai-gw:" + broker: true + +otel: + endpoint: "" + serviceName: "nexus-ai-gateway" + +observability: + latencyDetail: true + +routing: + defaultRetryPolicy: + 
maxAttemptsPerTarget: 1 + retryOn: ["network", "timeout", "429", "5xx"] + backoffInitial: 250ms + backoffMax: 5s + backoffJitter: 0.2 diff --git a/nexus-ami/artifacts/configs/compliance-proxy.config.yaml b/nexus-ami/artifacts/configs/compliance-proxy.config.yaml new file mode 100644 index 00000000..01bdcb27 --- /dev/null +++ b/nexus-ami/artifacts/configs/compliance-proxy.config.yaml @@ -0,0 +1,175 @@ +# Nexus Compliance Proxy — prod-shape config for AMI / appliance form factor. +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# CA cert/key generated by first-boot-ca.sh. +# DATABASE_URL + COMPLIANCE_PROXY_API_TOKEN loaded from /etc/nexus/compliance-proxy.env at boot. + +listener: + address: ":3128" + +ca: + certPath: "/etc/compliance-proxy/ca.crt" + keyPath: "/etc/compliance-proxy/ca.key" + +database: + url: "" # env DATABASE_URL + +redis: + mode: standalone + addrs: ["127.0.0.1:6379"] + username: "" + password: "" # env REDIS_PASSWORD + db: 0 + sentinel: + masterName: "" + username: "" + password: "" + cluster: + maxRedirects: 8 + routeRandomly: false + readOnly: false + tls: + enabled: false + insecureSkipVerify: false + caFile: "" + certFile: "" + keyFile: "" + serverName: "" + poolSize: 200 + minIdleConns: 50 + maxRetries: 3 + dialTimeout: 5s + readTimeout: 3s + writeTimeout: 3s + poolTimeout: 4s + +accessControl: + sourceIpAllowlist: + - "10.0.0.0/8" + - "172.16.0.0/12" + - "192.168.0.0/16" + domainAllowlist: + - "api.openai.com:443" + - "*.openai.com:443" + - "api.anthropic.com:443" + - "*.anthropic.com:443" + - "generativelanguage.googleapis.com:443" + - "aistudio.google.com:443" + - "api.deepseek.com:443" + - "api.x.ai:443" + - "api.moonshot.cn:443" + - "open.bigmodel.cn:443" + - "api.minimax.chat:443" + - "copilot-proxy.githubusercontent.com:443" + internalNetworkExceptions: [] + +connections: + maxConcurrentTunnels: 10000 + maxStreamsPerConnection: 100 + idleTimeout: "300s" + shutdownGracePeriod: "30s" + 
+upstream: + maxConnsPerHost: 100 + idleConnTimeout: "90s" + dialTimeout: "10s" + +limits: + requestBodyLimit: "10MB" + responseBodyLimit: "10MB" + sseBufferLimit: "8MB" + +log: + level: "info" + format: "json" + file: "/var/log/nexus/compliance-proxy.log" + +metrics: + address: ":9090" + +runtimeApi: + listenAddress: "127.0.0.1:3040" + +# Required by the config validator (added 2026-05-28 after the upstream merge +# made these top-level checks). All three blocks point at the co-located Hub / +# NATS / internal-service-token — same pattern as control-plane.config.yaml. +mq: + driver: "nats" + nats: + url: "nats://127.0.0.1:4222" + +registry: + nexusHubUrl: "http://127.0.0.1:3060" + +auth: + internalServiceToken: "" # env INTERNAL_SERVICE_TOKEN (set by first-boot-secrets; [MUST MATCH] across services) + +compliance: + enabled: true + perHookTimeoutMs: 5000 + totalTimeoutMs: 15000 + parallelHooks: false + checkpointChars: 500 + redactionRulesPath: "" + rejectResponse: + defaultLevel: 1 + contactInfo: "Contact administrator" + hooks: + - implementationId: "keyword-filter" + name: "Default Keyword Filter" + priority: 10 + enabled: false + stage: "request" + failBehavior: "fail-open" + timeoutMs: 5000 + applicableIngress: "ALL" + config: + patterns: [] + caseSensitive: false + - implementationId: "pii-detector" + name: "Default PII Detector" + priority: 20 + enabled: false + stage: "request" + failBehavior: "fail-open" + timeoutMs: 5000 + applicableIngress: "ALL" + config: + types: ["email", "phone", "ssn", "credit_card"] + action: "reject_hard" + +alerting: + enabled: true + evalIntervalSec: 30 + webhook: + url: "" + headers: {} + timeoutSec: 10 + cooldown: + fireMinutes: 5 + resolveMinutes: 5 + persistenceDir: "/var/lib/nexus/alerting" + +audit: + enabled: true + batch: + size: 10 + flushIntervalMs: 500 + channelBufferSize: 1000 + adaptiveFlush: false + flushIntervalMinMs: 500 + flushIntervalMaxMs: 10000 + adaptiveBatchSize: false + batchSizeMin: 10 + batchSizeMax: 
500 + ndjson: + enabled: true + dir: "/var/lib/nexus/audit-spool" + maxFileSizeMB: 100 + maxTotalSizeMB: 1000 + pinning: + exemptions: [] + autoExempt: + enabled: true + failureThreshold: 3 + windowSeconds: 3600 + exemptionDurationSeconds: 86400 diff --git a/nexus-ami/artifacts/configs/control-plane.config.yaml b/nexus-ami/artifacts/configs/control-plane.config.yaml new file mode 100644 index 00000000..da000c66 --- /dev/null +++ b/nexus-ami/artifacts/configs/control-plane.config.yaml @@ -0,0 +1,97 @@ +# Nexus Control Plane — prod-shape config for AMI / appliance form factor. +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# Secrets + DATABASE_URL loaded from /etc/nexus/control-plane.env at boot. + +server: + port: 3001 + shutdownTimeout: "10s" + +database: + url: "" # env DATABASE_URL + maxConns: 25 + minConns: 5 + maxConnLifetime: "300s" + +redis: + mode: standalone + addrs: ["127.0.0.1:6379"] + username: "" + password: "" # env REDIS_PASSWORD + db: 0 + sentinel: + masterName: "" + username: "" + password: "" + cluster: + maxRedirects: 8 + routeRandomly: false + readOnly: false + tls: + enabled: false + insecureSkipVerify: false + caFile: "" + certFile: "" + keyFile: "" + serverName: "" + poolSize: 200 + minIdleConns: 50 + maxRetries: 3 + dialTimeout: 5s + readTimeout: 3s + writeTimeout: 3s + poolTimeout: 4s + +log: + level: "info" + format: "json" + file: "/var/log/nexus/control-plane.log" + +bff: + complianceProxyUrl: "http://127.0.0.1:3040" + aiGatewayUrl: "http://127.0.0.1:3050" + complianceProxyRuntimeUrl: "http://127.0.0.1:3040" + complianceProxyApiToken: "" # env COMPLIANCE_PROXY_API_TOKEN + +registry: + nexusHubUrl: "http://127.0.0.1:3060" + +auth: + internalServiceToken: "" # env INTERNAL_SERVICE_TOKEN + +crypto: + encryptionKey: "" # env CREDENTIAL_ENCRYPTION_KEY (64 hex chars) + encryptionPassphrase: "" + encryptionSalt: "" + credentialKeyMap: "" + production: true + +retention: + auditLogDays: 90 + 
adminAuditLogDays: 365 + metricRollupDays: 365 + agentAuditDays: 90 + +agent: + caDir: "/var/lib/nexus/agentca" + +otel: + endpoint: "" + serviceName: "nexus-control-plane" + +scheduler: + enabled: true + +mq: + driver: "nats" + nats: + url: "nats://127.0.0.1:4222" + +# OIDC issuer for tokens minted by this Control Plane. The issuer claim must +# match the URL clients reach the CP on so JWKS fetch + iss-claim validation +# work. Issuer value is per-instance — first-boot stamps AUTH_SERVER_ISSUER +# into /etc/nexus/control-plane.env using the detected public IP. The empty +# string here is a placeholder so the env override hook fires; see +# configuration-architecture.md L3 > L2. +authServer: + issuer: "" # env AUTH_SERVER_ISSUER + keystoreDir: "/var/lib/nexus/authkeys" diff --git a/nexus-ami/artifacts/configs/nexus-hub.config.yaml b/nexus-ami/artifacts/configs/nexus-hub.config.yaml new file mode 100644 index 00000000..a44ecf27 --- /dev/null +++ b/nexus-ami/artifacts/configs/nexus-hub.config.yaml @@ -0,0 +1,97 @@ +# Nexus Hub — prod-shape config for AMI / appliance form factor. +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# +# All secrets and infra URLs are blank here — first-boot generates per-instance +# values into /etc/nexus/nexus-hub.env which systemd loads via EnvironmentFile=. +# Env values override every blank field below (L3 > L2 per configuration-architecture.md). 
+ +server: + port: 3060 + readTimeout: 30s + writeTimeout: 30s + shutdownTimeout: 15s + +database: + url: "" # env DATABASE_URL + maxConns: 20 + minConns: 5 + +redis: + mode: standalone + addrs: ["127.0.0.1:6379"] + username: "" + password: "" # env REDIS_PASSWORD + db: 0 + sentinel: + masterName: "" + username: "" + password: "" + cluster: + maxRedirects: 8 + routeRandomly: false + readOnly: false + tls: + enabled: false + insecureSkipVerify: false + caFile: "" + certFile: "" + keyFile: "" + serverName: "" + poolSize: 200 + minIdleConns: 50 + maxRetries: 3 + dialTimeout: 5s + readTimeout: 3s + writeTimeout: 3s + poolTimeout: 4s + +mq: + driver: "nats" + nats: + url: "nats://127.0.0.1:4222" + +consumers: + enabled: true + batchSize: 100 + flushInterval: 5s + siem: + enabled: false + url: "" + headers: {} + format: "json" + batchSize: 200 + flushInterval: 5s + eventTypes: [] + +scheduler: + enabled: true + driftCheckInterval: 60s + identityEnrichInterval: 5m + enableAgentRollup: false + +auth: + internalServiceToken: "" # env INTERNAL_SERVICE_TOKEN + +authServer: + url: "http://127.0.0.1:3001" + jwksURL: "http://127.0.0.1:3001/.well-known/jwks.json" + issuer: "http://127.0.0.1:3001" + +agentCA: + certFile: "" + keyFile: "" + dir: "/var/lib/nexus/agentca" + +otel: + enabled: false + endpoint: "" + +log: + level: "info" + format: "json" + file: "/var/log/nexus/nexus-hub.log" + +hub: + id: "hub-appliance-1" + advertiseAddr: "http://127.0.0.1:3060" + allowedOrigins: [] diff --git a/nexus-ami/artifacts/configs/nginx-nexus.conf b/nexus-ami/artifacts/configs/nginx-nexus.conf new file mode 100644 index 00000000..30005a73 --- /dev/null +++ b/nexus-ami/artifacts/configs/nginx-nexus.conf @@ -0,0 +1,87 @@ +# Nexus Gateway — nginx reverse proxy for the AMI / appliance form factor. +# Serves the Vite-built React UI on :443 and proxies /api/* to the Control +# Plane on localhost:3001. 
Architecture: +# docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# +# Operator note: the self-signed cert at /etc/nexus/tls.{crt,key} is generated +# by first-boot-ca.sh on first launch. For production, replace those two files +# with a cert signed for your hostname and `systemctl reload nginx`. + +server { + listen 80 default_server; + server_name _; + return 301 https://$host$request_uri; +} + +server { + listen 443 ssl default_server; + server_name _; + + ssl_certificate /etc/nexus/tls.crt; + ssl_certificate_key /etc/nexus/tls.key; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers HIGH:!aNULL:!MD5; + + root /opt/nexus/ui; + index index.html; + + client_max_body_size 32m; + + # Vite SPA fallback — every unmatched path serves index.html so the + # client-side router takes over. + location / { + try_files $uri $uri/ /index.html; + } + + # Admin API + auth-server endpoints (both live in the control-plane + # binary on :3001). + location /api/ { + proxy_pass http://127.0.0.1:3001; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 300s; + proxy_send_timeout 300s; + } + + location /.well-known/ { + proxy_pass http://127.0.0.1:3001; + proxy_set_header Host $host; + } + + # OAuth/OIDC auth-server endpoints (authorization endpoint, token + # endpoint, userinfo, revoke, logout — all live in the control-plane + # binary). Without this location block /oauth/authorize falls through + # to the SPA `try_files` handler, the SPA can't process the PKCE + # parameters, bounces to /login, /login bounces to /oauth/authorize, + # infinite loop. Hit on 2026-05-29 first-user-test of build #10. 
+ location /oauth/ { + proxy_pass http://127.0.0.1:3001; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # Auth-server pre-bearer endpoints (`/authserver/idps`, + # `/authserver/password`, `/authserver/idp/<id>/start`). The SPA's LoginPage calls + # these to list IDPs and post password credentials. Without the proxy + # they return the SPA index.html (200 with HTML body), which the SPA's + # JSON parser misreads as "loadProvidersFailed" — surfaced to the + # operator as "Unable to load sign-in methods". Hit on 2026-05-29. + location /authserver/ { + proxy_pass http://127.0.0.1:3001; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # Health endpoint exposed for ELB / customer monitoring without + # authentication. 
+ location = /healthz { + proxy_pass http://127.0.0.1:3001/api/healthz; + proxy_set_header Host $host; + } +} diff --git a/nexus-ami/artifacts/systemd/nats.service b/nexus-ami/artifacts/systemd/nats.service new file mode 100644 index 00000000..b8b7bb99 --- /dev/null +++ b/nexus-ami/artifacts/systemd/nats.service @@ -0,0 +1,25 @@ +[Unit] +Description=NATS Server with JetStream (event streaming + Hub coordination MQ) +Documentation=https://docs.nats.io/ +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=nats +Group=nats +ExecStart=/usr/local/bin/nats-server --config /etc/nats/nats-server.conf +ExecReload=/bin/kill -HUP $MAINPID +Restart=on-failure +RestartSec=5 +TimeoutStopSec=30 + +NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=strict +ProtectHome=yes +ReadWritePaths=/var/lib/nats /var/log/nats +LimitNOFILE=65535 + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/artifacts/systemd/nexus-control-plane.service b/nexus-ami/artifacts/systemd/nexus-control-plane.service new file mode 100644 index 00000000..bde4a977 --- /dev/null +++ b/nexus-ami/artifacts/systemd/nexus-control-plane.service @@ -0,0 +1,25 @@ +[Unit] +Description=Nexus Control Plane (admin API / BFF, IAM, SSO, analytics) +Documentation=https://github.com/AlphaBitCore/nexus-gateway +After=network-online.target nexus-hub.service +Requires=nexus-hub.service +Wants=network-online.target + +[Service] +Type=simple +User=nexus +Group=nexus +EnvironmentFile=/etc/nexus/control-plane.env +ExecStart=/opt/nexus/bin/control-plane -config /etc/nexus/control-plane.config.yaml +WorkingDirectory=/var/lib/nexus +Restart=on-failure +RestartSec=5 + +NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=strict +ProtectHome=yes +ReadWritePaths=/var/lib/nexus /var/log/nexus + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/artifacts/systemd/nexus-first-boot.service b/nexus-ami/artifacts/systemd/nexus-first-boot.service new file mode 100644 index 00000000..6470f709 
--- /dev/null +++ b/nexus-ami/artifacts/systemd/nexus-first-boot.service @@ -0,0 +1,26 @@ +[Unit] +Description=Nexus Gateway first-boot initialization (generates per-instance secrets, MITM CA, DB schema + seed, random admin password) +Documentation=https://github.com/AlphaBitCore/nexus-gateway +# NOTE: this unit does NOT set Requires=postgresql.service AND it does NOT set +# Before=postgresql.service. first-boot-db.sh calls `systemctl start +# postgresql` from inside this unit; either dependency would deadlock +# (systemd refuses to activate postgresql until first-boot finishes, +# first-boot blocks waiting for postgresql to come up, TimeoutStartSec +# kills first-boot 300 s later). Both keywords share the same hazard — +# Before= alone deadlocks even without Requires=. Hit on 2026-05-28 +# first-launch test, fixed by dropping postgresql.service from Before=. +After=network-online.target +Wants=network-online.target +Before=nexus-hub.service nexus-control-plane.service nexus-gateway.service nexus-proxy.service +ConditionPathExists=!/etc/nexus/.initialized + +[Service] +Type=oneshot +ExecStart=/usr/local/sbin/nexus-first-boot +RemainAfterExit=yes +TimeoutStartSec=300 +StandardOutput=journal+console +StandardError=journal+console + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/artifacts/systemd/nexus-gateway.service b/nexus-ami/artifacts/systemd/nexus-gateway.service new file mode 100644 index 00000000..87cd576a --- /dev/null +++ b/nexus-ami/artifacts/systemd/nexus-gateway.service @@ -0,0 +1,25 @@ +[Unit] +Description=Nexus AI Gateway (/v1 SDK traffic, provider adapters, routing, quota) +Documentation=https://github.com/AlphaBitCore/nexus-gateway +After=network-online.target nexus-hub.service +Requires=nexus-hub.service +Wants=network-online.target + +[Service] +Type=simple +User=nexus +Group=nexus +EnvironmentFile=/etc/nexus/ai-gateway.env +ExecStart=/opt/nexus/bin/ai-gateway -config /etc/nexus/ai-gateway.config.yaml +WorkingDirectory=/var/lib/nexus 
+Restart=on-failure +RestartSec=5 + +NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=strict +ProtectHome=yes +ReadWritePaths=/var/lib/nexus /var/log/nexus + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/artifacts/systemd/nexus-hub.service b/nexus-ami/artifacts/systemd/nexus-hub.service new file mode 100644 index 00000000..afc7c15d --- /dev/null +++ b/nexus-ami/artifacts/systemd/nexus-hub.service @@ -0,0 +1,25 @@ +[Unit] +Description=Nexus Hub (control-plane kernel: Thing Registry, Device Shadow, config sync, agent CA) +Documentation=https://github.com/AlphaBitCore/nexus-gateway +After=network-online.target nexus-first-boot.service postgresql.service valkey.service nats.service +Requires=nexus-first-boot.service postgresql.service valkey.service nats.service +Wants=network-online.target + +[Service] +Type=simple +User=nexus +Group=nexus +EnvironmentFile=/etc/nexus/nexus-hub.env +ExecStart=/opt/nexus/bin/nexus-hub -config /etc/nexus/nexus-hub.config.yaml +WorkingDirectory=/var/lib/nexus +Restart=on-failure +RestartSec=5 + +NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=strict +ProtectHome=yes +ReadWritePaths=/var/lib/nexus /var/log/nexus + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/artifacts/systemd/nexus-proxy.service b/nexus-ami/artifacts/systemd/nexus-proxy.service new file mode 100644 index 00000000..87cc49ff --- /dev/null +++ b/nexus-ami/artifacts/systemd/nexus-proxy.service @@ -0,0 +1,25 @@ +[Unit] +Description=Nexus Compliance Proxy (transparent TLS proxy, MITM, compliance pipeline) +Documentation=https://github.com/AlphaBitCore/nexus-gateway +After=network-online.target nexus-hub.service +Requires=nexus-hub.service +Wants=network-online.target + +[Service] +Type=simple +User=nexus +Group=nexus +EnvironmentFile=/etc/nexus/compliance-proxy.env +ExecStart=/opt/nexus/bin/compliance-proxy -config /etc/nexus/compliance-proxy.config.yaml +WorkingDirectory=/var/lib/nexus +Restart=on-failure +RestartSec=5 + 
+NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=strict +ProtectHome=yes +ReadWritePaths=/var/lib/nexus /var/log/nexus /etc/compliance-proxy + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/artifacts/systemd/valkey.service b/nexus-ami/artifacts/systemd/valkey.service new file mode 100644 index 00000000..9a000e32 --- /dev/null +++ b/nexus-ami/artifacts/systemd/valkey.service @@ -0,0 +1,32 @@ +[Unit] +Description=Valkey 8 (Redis-wire-compatible cache with valkey-search module) +Documentation=https://valkey.io/ +After=network-online.target +Wants=network-online.target + +[Service] +Type=notify +User=valkey +Group=valkey +ExecStart=/usr/local/bin/valkey-server /etc/valkey/valkey.conf --supervised systemd +ExecStop=/usr/local/bin/valkey-cli -h 127.0.0.1 -p 6379 shutdown nosave +Restart=on-failure +RestartSec=5 +TimeoutStopSec=30 + +NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=strict +ProtectHome=yes +# /run is tmpfs and is wiped on every boot — the /var/run/valkey directory +# install-valkey.sh creates at AMI build time does NOT survive AMI snapshot + +# fresh boot. Without RuntimeDirectory= here, systemd's namespace setup fails +# 226/NAMESPACE because ReadWritePaths can't bind a missing directory. Hit on +# 2026-05-28 first-launch test. +RuntimeDirectory=valkey +RuntimeDirectoryMode=0750 +ReadWritePaths=/var/lib/valkey /var/log/valkey +LimitNOFILE=65535 + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/build.sh b/nexus-ami/build.sh new file mode 100755 index 00000000..e1a8996b --- /dev/null +++ b/nexus-ami/build.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +# build.sh — staging wrapper. Compiles all Nexus binaries + UI dist + bundles +# the Prisma schema, then invokes `packer build`. 
+# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# +# Usage: +# cd nexus-ami +# ./build.sh # full pipeline (binaries + UI + packer) +# ./build.sh --skip-packer # stage artifacts only; don't run packer (for CI dry-run) +# ./build.sh --stage-only # alias for --skip-packer +# +# Prerequisites: +# - Go 1.25+ (`make build-all` driver) +# - Node 20+ (`make control-plane-ui-build`) +# - Packer 1.10+ (https://www.packer.io/) unless --skip-packer +# - AWS credentials in environment (AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY +# or AWS_PROFILE) unless --skip-packer + +set -euo pipefail + +SKIP_PACKER=false +for arg in "$@"; do + case "$arg" in + --skip-packer|--stage-only) SKIP_PACKER=true ;; + -h|--help) + sed -n '2,18p' "$0" + exit 0 + ;; + *) echo "ERROR: unknown flag $arg" >&2; exit 1 ;; + esac +done + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +ARTIFACTS_DIR="$SCRIPT_DIR/artifacts" + +echo "==> [build] cleaning previous staging dirs..." +rm -rf "$ARTIFACTS_DIR/bin" "$ARTIFACTS_DIR/ui-dist" "$ARTIFACTS_DIR/prisma" "$ARTIFACTS_DIR/scripts" +rm -f "$SCRIPT_DIR/artifacts.tar.gz" +mkdir -p "$ARTIFACTS_DIR/bin" "$ARTIFACTS_DIR/ui-dist" "$ARTIFACTS_DIR/prisma" + +# ─── 1. Build Go binaries ────────────────────────────────────────────────── + +echo "==> [build] compiling Nexus Go binaries (make build-all)..." +cd "$REPO_ROOT" +GOOS=linux GOARCH=amd64 CGO_ENABLED=0 make \ + nexus-hub-build control-plane-build ai-gateway-build compliance-proxy-build + +for svc in nexus-hub control-plane ai-gateway compliance-proxy; do + src="$REPO_ROOT/dist/bin/$svc/$svc" + [ -x "$src" ] || { echo "ERROR: missing $src" >&2; exit 1; } + cp "$src" "$ARTIFACTS_DIR/bin/$svc" +done + +# ─── 2. Build Control Plane UI Vite dist ─────────────────────────────────── + +echo "==> [build] building Control Plane UI (Vite)..." 
+cd "$REPO_ROOT" +make control-plane-ui-build + +ui_dist="$REPO_ROOT/packages/control-plane-ui/dist" +[ -d "$ui_dist" ] || { echo "ERROR: missing UI dist at $ui_dist" >&2; exit 1; } +cp -r "$ui_dist"/. "$ARTIFACTS_DIR/ui-dist/" + +# ─── 3. Bundle Prisma schema + seed ──────────────────────────────────────── + +echo "==> [build] bundling Prisma schema + seed..." +cd "$REPO_ROOT/tools/db-migrate" +cp schema.prisma "$ARTIFACTS_DIR/prisma/" +cp package.json package-lock.json "$ARTIFACTS_DIR/prisma/" +cp -r seed "$ARTIFACTS_DIR/prisma/seed" +cp prisma.config.ts "$ARTIFACTS_DIR/prisma/" +cp -r migrations "$ARTIFACTS_DIR/prisma/migrations" 2>/dev/null || true + +# ─── 3b. Bundle scripts/ into artifacts/scripts/ ─────────────────────────── +# Packer's file provisioner needs the destination dir to exist before scp can +# upload into it. Bundling scripts/ as a subdir of artifacts/ means one +# `file` provisioner uploads everything in one shot (see nexus.pkr.hcl). + +echo "==> [build] bundling scripts/ into artifacts/scripts/..." +cp -r "$SCRIPT_DIR/scripts" "$ARTIFACTS_DIR/scripts" + +# ─── 4. Show what we staged ──────────────────────────────────────────────── + +echo "==> [build] artifact tree:" +( cd "$ARTIFACTS_DIR" && find . -maxdepth 3 -type d -print ) | sed 's|^| |' + +# ─── 4b. Compress artifacts/ → artifacts.tar.gz ──────────────────────────── +# Packer's file provisioner uses recursive SCP. For our 234 MB payload over +# slow links (e.g., China → us-east-1), SCP silently drops individual files +# on transient connection blips — leading to "missing binary" errors at +# install.sh time with no upload-side error message. Tarballing makes the +# transfer atomic (one file → succeed or fail as a whole) AND faster +# (gzipped Go binaries compress to ~40-50% of their uncompressed size). + +TARBALL="$SCRIPT_DIR/artifacts.tar.gz" +echo "==> [build] compressing artifacts/ → artifacts.tar.gz ..." +rm -f "$TARBALL" +tar -C "$ARTIFACTS_DIR" -czf "$TARBALL" . 
+echo "==> [build] tarball: $(du -h "$TARBALL" | awk '{print $1}') (vs $(du -sh "$ARTIFACTS_DIR" | awk '{print $1}') uncompressed)" + +# ─── 5. packer build ─────────────────────────────────────────────────────── + +if $SKIP_PACKER; then + echo "==> [build] --skip-packer: stopping here. Run 'cd $SCRIPT_DIR && packer init . && packer build nexus.pkr.hcl' yourself." + exit 0 +fi + +if ! command -v packer >/dev/null 2>&1; then + echo "ERROR: packer is not installed (https://www.packer.io/downloads). Pass --skip-packer to stop after staging." >&2 + exit 1 +fi + +cd "$SCRIPT_DIR" +echo "==> [build] packer init ..." +packer init . +echo "==> [build] packer build ..." +packer build nexus.pkr.hcl diff --git a/nexus-ami/nexus.pkr.hcl b/nexus-ami/nexus.pkr.hcl new file mode 100644 index 00000000..0483f90f --- /dev/null +++ b/nexus-ami/nexus.pkr.hcl @@ -0,0 +1,109 @@ +# Nexus Gateway — Packer template for the AMI / appliance form factor. +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# +# Build: cd nexus-ami && packer init . && packer build nexus.pkr.hcl +# Variables: pass via -var "nexus_version=0.1.0" or set the PKR_VAR_nexus_version env var. + +packer { + required_plugins { + amazon = { + version = ">= 1.3.0" + source = "github.com/hashicorp/amazon" + } + } +} + +variable "nexus_version" { + type = string + default = "0.1.0" +} + +variable "aws_region" { + type = string + default = "us-east-1" +} + +variable "instance_type" { + type = string + # m5.4xlarge (16 vCPU / 64 GB) needed because valkey-search 1.x vendors + # gRPC + Protobuf + Abseil + ICU as submodules; template-heavy parallel C++ + # compile is heap-hungry per translation unit. A 2026-05-28 build on + # t3.2xlarge (32 GB) was OOM-killed silently mid-ICU-compile after 11 + # minutes — kernel OOM-killer leaves no trace in build logs (sshd dies + # before the script can write stderr). 64 GB gives the headroom the 32 GB + # tier was supposed to but no longer does. 
+ default = "m5.4xlarge" +} + +variable "root_volume_size_gb" { + type = number + default = 30 # Postgres + Valkey + NATS file store + log headroom. +} + +source "amazon-ebs" "nexus" { + region = var.aws_region + instance_type = var.instance_type + + ami_name = "nexus-gateway-${var.nexus_version}-{{timestamp}}" + # ami_description is ASCII-only: AWS ModifyImageAttribute rejects non-ASCII + # (we hit this on 2026-05-28: em dash U+2014 → InvalidParameterValue; the + # AMI was deregistered and the snapshot deleted at the end of the build). + ami_description = "Nexus Gateway ${var.nexus_version} - single-instance AI traffic gateway appliance (OSS, Apache 2.0)" + + source_ami_filter { + filters = { + name = "al2023-ami-2023.*-x86_64" + virtualization-type = "hvm" + root-device-type = "ebs" + } + owners = ["amazon"] + most_recent = true + } + + ssh_username = "ec2-user" + + launch_block_device_mappings { + device_name = "/dev/xvda" + volume_size = var.root_volume_size_gb + volume_type = "gp3" + delete_on_termination = true + } + + ami_block_device_mappings { + device_name = "/dev/xvda" + volume_size = var.root_volume_size_gb + volume_type = "gp3" + } + + tags = { + Name = "nexus-gateway-${var.nexus_version}" + Product = "Nexus Gateway" + Version = var.nexus_version + BuildToolchain = "packer+al2023" + } +} + +build { + name = "nexus-gateway-ami" + sources = ["source.amazon-ebs.nexus"] + + # Upload artifacts.tar.gz (built by build.sh) as a single file. We avoid + # uploading artifacts/ as a directory because Packer's file provisioner uses + # recursive SCP — over slow links it silently drops individual files when + # the connection blips, causing "missing binary" errors at install.sh time. + # A single-file SCP is atomic: either the whole tarball lands or the + # transfer errors loudly. install.sh extracts the tarball before doing + # anything else. 
+ provisioner "file" { + source = "artifacts.tar.gz" + destination = "/tmp/nexus-artifacts.tar.gz" + } + + provisioner "shell" { + execute_command = "sudo -E bash '{{.Path}}'" + scripts = [ + "scripts/install.sh", + "scripts/harden.sh", + ] + } +} diff --git a/nexus-ami/scripts/first-boot-ca.sh b/nexus-ami/scripts/first-boot-ca.sh new file mode 100755 index 00000000..0dba2184 --- /dev/null +++ b/nexus-ami/scripts/first-boot-ca.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# first-boot-ca.sh — generate per-instance certificates. +# +# Two CAs / keypairs are produced: +# 1. /etc/compliance-proxy/{ca.crt,ca.key} — Compliance Proxy MITM CA +# used to mint leaf certs for upstream provider domains. +# 2. /etc/nexus/{tls.crt,tls.key} — nginx HTTPS cert (self-signed, +# CN=nexus-gateway). The operator is expected to replace this with a +# real cert signed for their hostname in production. +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md + +set -euo pipefail + +PROXY_CA_DIR=/etc/compliance-proxy +NEXUS_DIR=/etc/nexus + +# Idempotent — re-issuing the MITM CA invalidates every agent's trust store +# entry (operators have to redistribute the new ca.crt). Re-issuing the nginx +# cert is harmless but pointless. +if [ -f "$PROXY_CA_DIR/ca.crt" ] && [ -f "$NEXUS_DIR/tls.crt" ]; then + echo "[first-boot-ca] CAs already present; skipping (idempotent)." + exit 0 +fi + +echo "[first-boot-ca] generating Compliance Proxy MITM CA..." + +# ECDSA P-256 — small + fast leaf signing; matches dev CA shape used by +# packages/compliance-proxy/dev-certs/. 
+openssl ecparam -genkey -name prime256v1 -noout -out "$PROXY_CA_DIR/ca.key" +openssl req -x509 -new -nodes -key "$PROXY_CA_DIR/ca.key" -sha256 -days 3650 \ + -subj "/CN=Nexus Compliance Proxy CA/O=Nexus Gateway" \ + -out "$PROXY_CA_DIR/ca.crt" + +chmod 0640 "$PROXY_CA_DIR/ca.crt" "$PROXY_CA_DIR/ca.key" +chown root:nexus "$PROXY_CA_DIR/ca.crt" "$PROXY_CA_DIR/ca.key" + +echo "[first-boot-ca] generating nginx HTTPS self-signed cert..." + +# Detect the instance's reachable IPs so the cert SAN covers everything Go's +# default TLS client will check against. Without IP SANs, Go's HTTPS client +# rejects `https:///.well-known/jwks.json` with x509: cannot validate +# certificate for because it doesn't contain any IP SANs — tokens are +# issued correctly at /oauth/token but cannot be verified at /api/admin/me, +# the SPA bounces back to /login on every login attempt. Hit on 2026-05-29. +TOKEN=$(curl -fsS -X PUT "http://169.254.169.254/latest/api/token" \ + -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" -m 3 2>/dev/null || true) +PUBLIC_IP="" +LOCAL_IP="" +if [ -n "$TOKEN" ]; then + PUBLIC_IP=$(curl -fsS -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/public-ipv4 -m 3 2>/dev/null || true) + LOCAL_IP=$(curl -fsS -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/local-ipv4 -m 3 2>/dev/null || true) +fi +SAN="IP:127.0.0.1,DNS:nexus-gateway,DNS:localhost" +[ -n "$PUBLIC_IP" ] && SAN="IP:${PUBLIC_IP},${SAN}" +[ -n "$LOCAL_IP" ] && [ "$LOCAL_IP" != "$PUBLIC_IP" ] && SAN="${SAN},IP:${LOCAL_IP}" +echo "[first-boot-ca] cert SAN: ${SAN}" + +openssl req -x509 -nodes -newkey rsa:2048 -days 365 \ + -subj "/CN=nexus-gateway/O=Nexus Gateway" \ + -addext "subjectAltName=${SAN}" \ + -keyout "$NEXUS_DIR/tls.key" \ + -out "$NEXUS_DIR/tls.crt" 2>/dev/null + +chmod 0640 "$NEXUS_DIR/tls.crt" "$NEXUS_DIR/tls.key" +chown root:nexus "$NEXUS_DIR/tls.crt" "$NEXUS_DIR/tls.key" + + +# Install the nginx self-signed cert into the system 
CA trust store. Without +# this, Go's default HTTP client (used by the JWT verifier's JWKS fetcher in +# the control-plane) rejects the self-signed cert with x509 "unknown +# authority" — tokens are issued correctly at /oauth/token but cannot be +# verified at /api/admin/me, the SPA bounces back to /login on every login +# attempt. Hit on 2026-05-29 first-user-test of build #10. Acceptable: the +# anchor is per-instance and only ever signs this appliance's own hostname. +echo "[first-boot-ca] trusting self-signed nginx cert in the system CA bundle..." +install -o root -g root -m 0644 "$NEXUS_DIR/tls.crt" \ + /etc/pki/ca-trust/source/anchors/nexus-gateway.crt +update-ca-trust + +echo "[first-boot-ca] complete (proxy CA + nginx self-signed cert + system CA anchor)." diff --git a/nexus-ami/scripts/first-boot-db.sh b/nexus-ami/scripts/first-boot-db.sh new file mode 100755 index 00000000..21bd53c7 --- /dev/null +++ b/nexus-ami/scripts/first-boot-db.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# first-boot-db.sh — initialise PostgreSQL, materialise schema, seed baseline +# rows, randomise the admin password, and surface credentials to the operator. +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# Hashing matches tools/db-migrate/seed/lib.ts hashPassword(): scrypt N=16384, +# r=8, p=1, salt=32B, key=64B, format "salt_hex:hash_hex". + +set -euo pipefail + +# Put the bundled Node 20 on PATH so `npx` (whose shebang is +# `#!/usr/bin/env node`) can resolve `node`. Without this the script aborts +# at the first prisma call with `/usr/bin/env: 'node': No such file or +# directory` because systemd starts this unit with the system PATH that does +# not include /opt/nexus/node/bin. Hit on 2026-05-28 first-launch test of +# build #8. 
+export PATH=/opt/nexus/node/bin:$PATH + +PRISMA_DIR=/opt/nexus/prisma +ADMIN_CREDS=/var/log/nexus/admin-credentials.txt + +# Source the per-service env file written by first-boot-secrets.sh — the seed +# requires CREDENTIAL_ENCRYPTION_KEY (re-encrypts seeded credential rows) and +# ADMIN_KEY_HMAC_SECRET (re-hashes seeded VK lookup keys). Both live in +# control-plane.env which has the union of secrets for the two services that +# need them. +# shellcheck disable=SC1091 +. /etc/nexus/control-plane.env +export CREDENTIAL_ENCRYPTION_KEY ADMIN_KEY_HMAC_SECRET INTERNAL_SERVICE_TOKEN COMPLIANCE_PROXY_API_TOKEN + +DB_NAME=nexus_gateway +DB_USER=nexus +DB_PASSWORD=$(openssl rand -hex 24) +ADMIN_PASSWORD=$(openssl rand -base64 18 | tr -d '/+=' | cut -c1-20) +PGDATA=/var/lib/pgsql/data + +# ─── initdb on first launch (install.sh deferred this so harden.sh's wipe +# leaves a clean snapshot; see install-postgres.sh for the why). ──────── +if [ ! -f "$PGDATA/PG_VERSION" ]; then + echo "[first-boot-db] initialising PostgreSQL data directory..." + /usr/bin/postgresql-setup --initdb + + echo "[first-boot-db] enforcing localhost-only + scram-sha-256 auth..." + sed -i "s/^#listen_addresses.*/listen_addresses = '127.0.0.1'/" "$PGDATA/postgresql.conf" + sed -i "s/^listen_addresses.*/listen_addresses = '127.0.0.1'/" "$PGDATA/postgresql.conf" + sed -i "s/^#password_encryption.*/password_encryption = scram-sha-256/" "$PGDATA/postgresql.conf" + sed -i "s/^password_encryption.*/password_encryption = scram-sha-256/" "$PGDATA/postgresql.conf" + + cat > "$PGDATA/pg_hba.conf" <<'PGHBA' +# Nexus appliance — localhost-only, scram-sha-256 for nexus user, peer for postgres OS user. +local all postgres peer +local all all scram-sha-256 +host all all 127.0.0.1/32 scram-sha-256 +host all all ::1/128 scram-sha-256 +PGHBA + chown postgres:postgres "$PGDATA/pg_hba.conf" + chmod 0600 "$PGDATA/pg_hba.conf" +fi + +echo "[first-boot-db] starting PostgreSQL..." 
+systemctl start postgresql
+
+# Wait up to ~10 s for PostgreSQL to accept connections (startup is async-ish
+# on some AL2023 builds), then fail loudly instead of falling through to psql.
+for i in 1 2 3 4 5 6 7 8 9 10; do
+  sudo -u postgres pg_isready -q -h /var/run/postgresql && break
+  echo "[first-boot-db] waiting for PostgreSQL... ($i/10)"
+  sleep 1
+done
+sudo -u postgres pg_isready -q -h /var/run/postgresql || { echo "[first-boot-db] ERROR: PostgreSQL is not accepting connections after 10 s" >&2; exit 1; }
+
+# If a previous DATABASE_URL was already stamped into an env file, reuse the
+# password it encodes — the role already exists in PG with that password, and
+# rotating it here would break that consistency. Otherwise this is the first
+# run and we generate a fresh DB_PASSWORD above.
+#
+# `|| true` is load-bearing: on a fresh boot the env file exists (written by
+# first-boot-secrets) but contains NO DATABASE_URL line yet, so grep returns 1.
+# Under `set -euo pipefail` that fails the command substitution and `set -e`
+# kills the whole script BEFORE we ever reach the role-creation block. Hit on
+# 2026-05-28 first-launch test of build #8.
+EXISTING_URL=$(grep -h '^DATABASE_URL=' /etc/nexus/control-plane.env 2>/dev/null | tail -1 | sed 's/^DATABASE_URL=//' || true)
+if [ -n "$EXISTING_URL" ]; then
+  echo "[first-boot-db] reusing prior DATABASE_URL from /etc/nexus/control-plane.env (idempotent)."
+  DATABASE_URL="$EXISTING_URL"
+  DB_PASSWORD=$(echo "$DATABASE_URL" | sed -E "s|.*://$DB_USER:([^@]+)@.*|\1|")
+fi
+
+echo "[first-boot-db] ensuring role and database exist (idempotent)..."
+# SUPERUSER is required because seed/data/seed-baseline.sql is a pg_dump that
+# uses `ALTER TABLE ... DISABLE TRIGGER ALL` to load FK-related rows out of
+# topological order.
Postgres only lets SUPERUSER touch the system-generated +# RI_ConstraintTrigger_* triggers — without it the seed aborts with +# permission denied: "RI_ConstraintTrigger_a_NNNN" is a system trigger +# Acceptable for this appliance: Postgres binds 127.0.0.1 only (see the +# listen_addresses tweak above) and pg_hba.conf forces scram-sha-256, so the +# attack surface is local processes only — same boundary as the rest of the +# appliance. Hit on 2026-05-28 first-launch test of build #8. +sudo -u postgres psql -v ON_ERROR_STOP=1 <> "$envfile" +done + +echo "[first-boot-db] materialising schema via prisma db push..." +cd "$PRISMA_DIR" +# --skip-generate was removed in newer Prisma CLI; client generation is now a +# separate explicit call below. Hit on 2026-05-28 first-launch test of build #8: +# "! unknown or unexpected option: --skip-generate". --accept-data-loss alone is +# enough — on a fresh DB there is no data to lose, but Prisma requires the flag +# to push without an interactive y/n prompt. +DATABASE_URL="$DATABASE_URL" /opt/nexus/node/bin/npx prisma db push --accept-data-loss + +echo "[first-boot-db] generating Prisma client (required by seed)..." +DATABASE_URL="$DATABASE_URL" /opt/nexus/node/bin/npx prisma generate + +echo "[first-boot-db] loading baseline seed (organisations, IAM, roles)..." +DATABASE_URL="$DATABASE_URL" /opt/nexus/node/bin/npx tsx seed/seed.ts + +echo "[first-boot-db] randomising admin@nexus.ai password..." +NEW_ADMIN_HASH=$(NEW_PASSWORD="$ADMIN_PASSWORD" /opt/nexus/node/bin/node "$PRISMA_DIR/set-admin-password.js") +DATABASE_URL="$DATABASE_URL" /opt/nexus/node/bin/npx prisma db execute --stdin < "$ADMIN_CREDS" </ +Username: admin@nexus.ai +Password: $ADMIN_PASSWORD + +IMPORTANT +--------- +1. This file is mode 0640, root:nexus — root or members of the 'nexus' group + can read it. Remove this file once you have changed the admin password + from the UI: sudo rm $ADMIN_CREDS +2. The TLS certificate at /etc/nexus/tls.crt is SELF-SIGNED. 
Replace it with + a cert signed for your hostname before exposing the appliance publicly, + then run: sudo systemctl reload nginx +3. The Compliance Proxy MITM CA at /etc/compliance-proxy/ca.crt must be + distributed to every device that egresses through the proxy on port 3128. +4. Demo accounts (alice@/bob@/carol@/diana@nexus.ai) ship with documented + dev passwords — disable them from the UI before opening this instance + to external traffic. + +For full operator documentation see: + https://github.com/AlphaBitCore/nexus-gateway/blob/main/docs/operators/ +================================================================================ +EOF +chmod 0640 "$ADMIN_CREDS" +chown root:nexus "$ADMIN_CREDS" + +cat > /etc/motd < "$CONFIG_DIR/nexus-hub.env" < "$CONFIG_DIR/control-plane.env" < "$CONFIG_DIR/ai-gateway.env" < "$CONFIG_DIR/compliance-proxy.env" </dev/null || true) +if [ -n "$TOKEN" ]; then + IP=$(curl -fsS -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/public-ipv4 -m 3 2>/dev/null || true) + [ -z "$IP" ] && IP=$(curl -fsS -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/local-ipv4 -m 3 2>/dev/null || true) +fi +[ -z "$IP" ] && IP=$(hostname -I 2>/dev/null | awk '{print $1}') +[ -z "$IP" ] && IP=127.0.0.1 +echo "[nexus-first-boot] publicURL host = $IP" + +stamp_public_url() { + local yaml="$1"; local url="$2" + if grep -q '^publicURL:' "$yaml"; then + echo "[nexus-first-boot] $yaml already has publicURL; skipping." 
+ else + sed -i "1i publicURL: \"$url\"" "$yaml" + echo "[nexus-first-boot] $yaml <- publicURL=$url" + fi +} +stamp_public_url /etc/nexus/nexus-hub.config.yaml "http://${IP}:3060" +stamp_public_url /etc/nexus/control-plane.config.yaml "https://${IP}/" +stamp_public_url /etc/nexus/ai-gateway.config.yaml "https://${IP}/v1" +stamp_public_url /etc/nexus/compliance-proxy.config.yaml "http://${IP}:3128" + +# Stamp AUTH_SERVER_ISSUER into control-plane.env (env override fills the +# yaml's empty authServer.issuer placeholder). Must match the publicURL the +# CP advertises so JWT iss-claim validation + JWKS fetch line up. Idempotent: +# appended only if no AUTH_SERVER_ISSUER line exists; an existing line is never rewritten. +if ! grep -q '^AUTH_SERVER_ISSUER=' /etc/nexus/control-plane.env; then + echo "AUTH_SERVER_ISSUER=https://${IP}/" >> /etc/nexus/control-plane.env + echo "[nexus-first-boot] /etc/nexus/control-plane.env <- AUTH_SERVER_ISSUER=https://${IP}/" +fi + +"$SCRIPT_DIR/nexus-first-boot-db" + +# Register this instance's redirect URI on the cp-ui OAuth client. The seed +# ships with localhost / cp.nexus.ai defaults; without this update an admin +# launching the appliance and clicking "Login" gets a 400 invalid_request +# from /oauth/authorize because the per-instance redirect_uri is not in the +# OAuthClient.redirectUris array. Idempotent — array_append fires only if +# missing. Runs as the postgres OS user (peer auth in pg_hba.conf). Hit on +# 2026-05-29 first-user-test of build #10. +echo "[nexus-first-boot] registering cp-ui redirect_uri for this instance..." +sudo -u postgres psql -d nexus_gateway -v ON_ERROR_STOP=1 </dev/null || true +systemctl start --no-block nexus-hub nexus-control-plane nexus-gateway nexus-proxy nginx + +echo "[nexus-first-boot] initialization complete." diff --git a/nexus-ami/scripts/harden.sh b/nexus-ami/scripts/harden.sh new file mode 100755 index 00000000..5d3bd82e --- /dev/null +++ b/nexus-ami/scripts/harden.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# harden.sh — final cleanup before AMI snapshot.
MUST run as the LAST Packer +# provisioner. AWS Marketplace rejects the AMI if any of this is left in. +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md §7 + +set -euo pipefail + +echo "==> [harden] removing SSH authorized_keys (recursive)..." +find / -name 'authorized_keys' -type f -delete 2>/dev/null || true + +echo "==> [harden] removing SSH host keys (regenerated on first boot)..." +find /etc/ssh -name 'ssh_host_*' -type f -delete 2>/dev/null || true + +echo "==> [harden] enforcing strict sshd config..." +sed -i 's/^#*PermitRootLogin.*/PermitRootLogin no/' /etc/ssh/sshd_config +sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config +sed -i 's/^#*PermitEmptyPasswords.*/PermitEmptyPasswords no/' /etc/ssh/sshd_config + +echo "==> [harden] locking the root password..." +passwd -l root || true + +echo "==> [harden] clearing shell history..." +find /root /home -name '.bash_history' -type f -delete 2>/dev/null || true +find /root /home -name '.zsh_history' -type f -delete 2>/dev/null || true +unset HISTFILE || true +history -c 2>/dev/null || true + +echo "==> [harden] truncating logs..." +find /var/log -type f -exec truncate -s 0 {} \; 2>/dev/null || true +journalctl --rotate 2>/dev/null || true +journalctl --vacuum-time=1s 2>/dev/null || true + +echo "==> [harden] resetting machine-id (regenerated on first boot)..." +truncate -s 0 /etc/machine-id +# /var/lib/dbus/machine-id is the legacy compatibility symlink for systems +# that ship dbus (Fedora desktop, RHEL with dbus). AL2023 minimal AMI does +# NOT install dbus by default — /var/lib/dbus/ does not exist (verified +# 2026-05-28 build, `ln -sf` failed). Skip the symlink when dbus isn't +# around; systemd alone reads /etc/machine-id directly and regenerates it +# on first boot. 
+if [ -d /var/lib/dbus ]; then + rm -f /var/lib/dbus/machine-id + ln -sf /etc/machine-id /var/lib/dbus/machine-id +fi + +echo "==> [harden] cleaning cloud-init state..." +cloud-init clean --logs 2>/dev/null || true + +echo "==> [harden] clearing DHCP leases and MAC-bound network rules..." +rm -rf /var/lib/dhclient/* /var/lib/dhcp/* 2>/dev/null || true +rm -f /etc/udev/rules.d/70-persistent-net.rules + +echo "==> [harden] clearing sudo password caches..." +rm -rf /var/db/sudo/* 2>/dev/null || true + +echo "==> [harden] clearing package manager caches..." +dnf clean all +rm -rf /var/cache/dnf/* /var/cache/yum/* 2>/dev/null || true + +echo "==> [harden] clearing /tmp, /var/tmp, and any leftover Nexus staging..." +rm -rf /tmp/nexus 2>/dev/null || true +find /tmp -mindepth 1 -delete 2>/dev/null || true +find /var/tmp -mindepth 1 -delete 2>/dev/null || true + +echo "==> [harden] clearing per-stateful service data accumulated during install..." +# Each of these is regenerated on first-boot or by the service itself; leaving +# install-time content baked into the AMI is a leak / non-determinism source. +rm -rf /var/lib/pgsql/data/* /var/lib/valkey/* /var/lib/nats/* 2>/dev/null || true +rm -f /etc/nexus/.initialized 2>/dev/null || true +rm -f /var/log/nexus/admin-credentials.txt 2>/dev/null || true + +echo "==> [harden] zeroing free space (shrinks EBS snapshot)..." +dd if=/dev/zero of=/zerofile bs=1M 2>/dev/null || true +rm -f /zerofile +sync + +echo "==> [harden] Nexus AMI hardening complete." diff --git a/nexus-ami/scripts/install-nats.sh b/nexus-ami/scripts/install-nats.sh new file mode 100755 index 00000000..5605f424 --- /dev/null +++ b/nexus-ami/scripts/install-nats.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# install-nats.sh — install NATS Server 2.x (JetStream enabled) from the +# official release binary on AL2023. 
+# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md + +set -euo pipefail + +NATS_VERSION=2.10.20 +ARCH=$(uname -m) +case "$ARCH" in + x86_64) NATS_ARCH=amd64 ;; + aarch64) NATS_ARCH=arm64 ;; + *) echo "ERROR: unsupported arch $ARCH" >&2; exit 1 ;; +esac + +TARBALL="nats-server-v$NATS_VERSION-linux-$NATS_ARCH.tar.gz" +URL="https://github.com/nats-io/nats-server/releases/download/v$NATS_VERSION/$TARBALL" + +echo "==> [install-nats] downloading $URL..." +cd /tmp +curl -fsSL "$URL" -o "$TARBALL" +tar xzf "$TARBALL" +install -m 0755 "nats-server-v$NATS_VERSION-linux-$NATS_ARCH/nats-server" /usr/local/bin/nats-server +rm -rf "$TARBALL" "nats-server-v$NATS_VERSION-linux-$NATS_ARCH" + +echo "==> [install-nats] creating nats user + dirs..." +if ! id -u nats >/dev/null 2>&1; then + useradd --system --no-create-home --shell /sbin/nologin --user-group nats +fi +install -d -o nats -g nats -m 0750 /var/lib/nats /var/log/nats +install -d -o root -g root -m 0755 /etc/nats +# NOTE(review): max_file_store (32GB) below is larger than the Packer template's default root_volume_size_gb (30); confirm the JetStream file store cannot fill the root volume. +cat > /etc/nats/nats-server.conf <<'EOF' +# Nexus appliance — NATS Server with JetStream (localhost-only). +listen: "127.0.0.1:4222" +http: "127.0.0.1:8222" + +server_name: "nexus-appliance" + +jetstream { + store_dir: "/var/lib/nats" + max_memory_store: 1GB + max_file_store: 32GB +} + +log_file: "/var/log/nats/nats-server.log" +logtime: true +debug: false +trace: false + +# No external clustering for the appliance form factor; the Hub is the only +# JetStream client and runs on the same host. +EOF +chmod 0644 /etc/nats/nats-server.conf + +echo "==> [install-nats] complete (NATS $NATS_VERSION)."
diff --git a/nexus-ami/scripts/install-node-prisma.sh b/nexus-ami/scripts/install-node-prisma.sh new file mode 100755 index 00000000..1889b4ec --- /dev/null +++ b/nexus-ami/scripts/install-node-prisma.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# install-node-prisma.sh — install a self-contained Node.js 20 runtime under +# /opt/nexus/node and run `npm install` inside /opt/nexus/prisma so the +# first-boot Prisma client / seed / tsx commands work offline. +# +# Why self-contained? +# - AL2023's dnf node is older + slower-moving; pinning a specific Node 20 +# binary keeps the AMI reproducible across Marketplace rebuilds. +# - Only the first-boot path uses Node; nothing else on the appliance needs +# it, so installing into /opt/nexus/node keeps it out of the system PATH. +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md + +set -euo pipefail + +# NODE_VERSION must satisfy Prisma's engines.node constraint. Prisma 7.8.0 +# requires "^20.19 || ^22.12 || >=24.0"; chokidar@5 + readdirp@5 (transitive +# deps) also require ">=20.19.0". Hard-pinned 20.18.1 produced an npm +# EBADENGINE fatal at AMI build time — verified 2026-05-28. Stay within +# 20.x LTS line ("Iron") to keep the runtime delta minimal across rebuilds. +NODE_VERSION=20.19.0 +ARCH=$(uname -m) +case "$ARCH" in + x86_64) NODE_ARCH=x64 ;; + aarch64) NODE_ARCH=arm64 ;; + *) echo "ERROR: unsupported arch $ARCH" >&2; exit 1 ;; +esac + +NODE_DIR=/opt/nexus/node +PRISMA_DIR=/opt/nexus/prisma + +TARBALL="node-v$NODE_VERSION-linux-$NODE_ARCH.tar.xz" +URL="https://nodejs.org/dist/v$NODE_VERSION/$TARBALL" + +echo "==> [install-node-prisma] downloading Node.js $NODE_VERSION..." 
+cd /tmp +curl -fsSL "$URL" -o "$TARBALL" +mkdir -p "$NODE_DIR" +tar xJf "$TARBALL" -C "$NODE_DIR" --strip-components=1 +rm -f "$TARBALL" + +export PATH="$NODE_DIR/bin:$PATH" + +echo "==> [install-node-prisma] node $(node --version) | npm $(npm --version) installed at $NODE_DIR" + +echo "==> [install-node-prisma] running npm install in $PRISMA_DIR..." +cd "$PRISMA_DIR" +"$NODE_DIR/bin/npm" install --omit=dev --no-audit --no-fund + +# Install tsx + typescript globally so first-boot-db.sh can call them +# regardless of devDependencies. +"$NODE_DIR/bin/npm" install -g --no-audit --no-fund tsx typescript + +echo "==> [install-node-prisma] complete." diff --git a/nexus-ami/scripts/install-postgres.sh b/nexus-ami/scripts/install-postgres.sh new file mode 100755 index 00000000..e80da0e1 --- /dev/null +++ b/nexus-ami/scripts/install-postgres.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# install-postgres.sh — install PostgreSQL 16 from AL2023's dnf and enable +# postgresql.service. initdb is NOT run here; it is deferred to first-boot-db.sh. +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md + +set -euo pipefail + +echo "==> [install-postgres] installing postgresql16-server..." +dnf install -y postgresql16-server postgresql16-contrib + +echo "==> [install-postgres] enabling postgresql.service..." +systemctl enable postgresql + +# IMPORTANT: postgres `initdb` is NOT run here. It happens at first-boot +# (see first-boot-db.sh). Reason: harden.sh wipes /var/lib/pgsql/data/* +# before the AMI snapshot — if we initdb'd at build time those files would +# be removed and postgresql.service would refuse to start on the launched +# instance with "data directory not initialized". Deferring initdb to +# first-boot avoids that whole class of bug AND keeps every launched +# instance's cluster identifier unique. + +echo "==> [install-postgres] complete (initdb deferred to first-boot)."
diff --git a/nexus-ami/scripts/install-valkey.sh b/nexus-ami/scripts/install-valkey.sh new file mode 100755 index 00000000..ed5cc7ca --- /dev/null +++ b/nexus-ami/scripts/install-valkey.sh @@ -0,0 +1,215 @@ +#!/bin/bash +# install-valkey.sh — build Valkey 9 + valkey-search from source on AL2023. +# AL2023 dnf has no valkey package (checked 2026-05); the official Valkey +# project ships valkey/valkey-bundle Docker images but no rpm. Source compile +# is the cleanest path for a baked AMI (no docker-in-AMI smell). +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# Wire-compatible with Redis 7 — every go-redis/v9 client in Nexus works +# unchanged against it. + +set -euo pipefail + +# valkey-search 1.2.0 hard-requires Valkey >= 9.0.1 at module-init time. Hit +# on 2026-05-28 first-launch test: a Valkey 8.1.2 server with libsearch.so +# loadmodule logged +# Minimum required server version is 9.0.1, Current version is 8.1.2 +# Module /usr/lib/valkey/libsearch.so initialization failed. +# and aborted boot. Bumping to 9.0.4 (the latest in the 9.0.x line) restores +# compatibility while staying on the 9.0 series (avoiding 9.1.x feature drift). +VALKEY_VERSION=9.0.4 +# valkey-search GitHub tags do NOT use a `v` prefix (e.g. `1.2.0`, not +# `v1.2.0`). Verified via GitHub API 2026-05-28; 1.2.0 is the latest stable +# release. Bumping requires re-checking the API: +# curl -fsSL https://api.github.com/repos/valkey-io/valkey-search/tags +VALKEY_SEARCH_VERSION=1.2.0 +BUILD_DIR=/tmp/valkey-build +INSTALL_PREFIX=/usr/local + +echo "==> [install-valkey] installing build dependencies..." +# valkey-search 1.x switched to ninja + cmake + submodules (gRPC, Protobuf, +# Abseil bundled as submodules); cmake hard-checks the C++ compiler is +# either GCC ≥ 12 or Clang ≥ 16. AL2023's default gcc is 11.5.0 and default +# clang is 15.0.7 — both too old.
AL2023 dnf ships versioned clang packages +# (confirmed 2026-05-28 dnf list available 'clang*'): clang18 / clang19 / +# clang20 are present; clang17 / clang16 are NOT in the repo. We pick the +# newest available. + +dnf install -y \ + gcc \ + gcc-c++ \ + make \ + cmake \ + ninja-build \ + git \ + openssl-devel \ + systemd-devel \ + pkgconf + +echo "==> [install-valkey] selecting a Clang ≥ 16 ..." +# Each clang${ver} package installs /usr/bin/clang-${ver} + /usr/bin/clang++-${ver} +# following the standard LLVM versioned-binary convention. We try newest-first +# and stop at the first one whose binary lands on PATH. 17/16 stay in the list +# in case a future repo refresh adds them back; current AL2023 (2026-05) skips +# straight from 15 → 18. +CLANG_BIN="" +CLANGXX_BIN="" +# Also install lld${ver} alongside clang${ver}. valkey-search's CMake compiles +# with -flto; linking libsearch.so requires LTO bitcode handling. GNU ld +# delegates LTO to LLVMgold.so which AL2023's `clang${ver}` package does NOT +# ship — verified 2026-05-28 build: link of libsearch.so failed with +# "cannot open /usr/lib64/llvm20/lib64/LLVMgold.so". lld is LLVM's native +# linker; it handles LTO bitcode directly, no plugin needed. +for ver in 20 19 18 17 16; do + if dnf install -y "clang${ver}" "lld${ver}" 2>/dev/null && command -v "clang-${ver}" >/dev/null 2>&1; then + CLANG_BIN="clang-${ver}" + CLANGXX_BIN="clang++-${ver}" + # `-fuse-ld=lld` only finds `ld.lld` (unversioned) on PATH. AL2023's + # lld${ver} package installs versioned binaries (ld.lld-${ver}); add a + # symlink if the unversioned name is missing. + if ! command -v ld.lld >/dev/null 2>&1; then + for candidate in "/usr/bin/ld.lld-${ver}" "/usr/bin/lld-${ver}"; do + if [ -x "$candidate" ]; then + ln -sf "$candidate" /usr/bin/ld.lld + break + fi + done + fi + command -v ld.lld >/dev/null 2>&1 || { + echo "ERROR: ld.lld not in PATH after installing lld${ver}; cannot use lld for LTO link." 
>&2 + ls -la /usr/bin/ld.lld* /usr/bin/lld* 2>&1 | head -10 >&2 + exit 1 + } + echo "==> [install-valkey] using clang-${ver} + ld.lld (LTO via lld, not LLVMgold plugin)" + break + fi +done +if [ -z "$CLANG_BIN" ]; then + echo "ERROR: AL2023 dnf has no clang ≥ 16 (tried clang20 / 19 / 18 / 17 / 16); valkey-search 1.x requires ≥ 16." >&2 + echo "Available clang/lld packages:" >&2 + dnf list available 'clang*' 'lld*' 2>&1 | head -30 >&2 + exit 1 +fi + +echo "==> [install-valkey] verifying compiler versions..." +gcc --version | head -1 +"$CLANG_BIN" --version | head -1 + +# ─── Valkey core ──────────────────────────────────────────────────────────── + +echo "==> [install-valkey] downloading Valkey $VALKEY_VERSION source..." +mkdir -p "$BUILD_DIR" && cd "$BUILD_DIR" +curl -fsSL "https://github.com/valkey-io/valkey/archive/refs/tags/$VALKEY_VERSION.tar.gz" \ + -o "valkey-$VALKEY_VERSION.tar.gz" +tar xzf "valkey-$VALKEY_VERSION.tar.gz" +cd "valkey-$VALKEY_VERSION" + +echo "==> [install-valkey] building Valkey ($(nproc) parallel jobs)..." +make -j"$(nproc)" USE_SYSTEMD=yes BUILD_TLS=yes +make install PREFIX="$INSTALL_PREFIX" + +# ─── valkey-search module ─────────────────────────────────────────────────── +# semantic cache module — packages/ai-gateway/internal/cache/semantic/ requires +# the FT.CREATE / FT.SEARCH commands this module provides. +# +# valkey-search 1.x uses ninja + cmake + git submodules (gRPC, Protobuf, +# Abseil all vendored). We use the project's canonical `build.sh` rather +# than calling cmake directly — the build script knows where the submodules +# live and how to wire them. --recurse-submodules at clone time pulls all +# vendored deps in one shot. + +echo "==> [install-valkey] cloning valkey-search $VALKEY_SEARCH_VERSION (with submodules)..." 
+cd "$BUILD_DIR" +git clone --recurse-submodules --depth 1 --shallow-submodules \ + --branch "$VALKEY_SEARCH_VERSION" \ + https://github.com/valkey-io/valkey-search.git +cd valkey-search + +# ─── Patch: Linux x86_64 + clang duplicate-overload in type_conversions.h ─── +# vmsdk/src/type_conversions.h has: +# template <> inline absl::StatusOr To(absl::string_view); +# #if defined(__clang__) && !defined(RunningClangd) +# template <> inline absl::StatusOr To(absl::string_view); +# #endif +# On Linux x86_64 (LP64) `uint64_t === unsigned long`, so the guarded overload +# collides with the unguarded one and clang ≥ 18 emits a hard redefinition +# error in vmsdklib — verified 2026-05-28 against tags 1.2.0 AND main. The +# overload only matters on platforms where the two types differ (macOS arm64/ +# x86_64: uint64_t = unsigned long long ≠ unsigned long). Fix: tighten the +# guard to also require !defined(__linux__). No-op on macOS; eliminates the +# redefinition on Linux. Idempotent: re-running the sed is harmless. +echo "==> [install-valkey] patching type_conversions.h for Linux+clang duplicate template..." +sed -i 's|^#if defined(__clang__) && !defined(RunningClangd)$|#if defined(__clang__) \&\& !defined(RunningClangd) \&\& !defined(__linux__)|' vmsdk/src/type_conversions.h +grep -q '!defined(__linux__)' vmsdk/src/type_conversions.h || { + echo "ERROR: type_conversions.h patch did not apply — upstream source layout changed" >&2 + echo "Expected line to patch: #if defined(__clang__) && !defined(RunningClangd)" >&2 + echo "Current grep for the guard line:" >&2 + grep -n 'defined(__clang__)' vmsdk/src/type_conversions.h >&2 || true + exit 1 +} + +# Force the build to use Clang (≥ 16 on AL2023) instead of the default +# gcc-11.5.0 (which valkey-search 1.x cmake rejects: "Minimum GCC required +# is 12 and later"). +# +# Cap parallelism at 4 regardless of available cores. 
Each gRPC/Protobuf/ICU +# compile worker can hold 1.5–2 GB resident; running all 8 cores parallel +# on t3.2xlarge would push 16+ GB and risk OOM-killer even on 32 GB hosts. +# --jobs=4 is the sweet spot: comfortable 16–24 GB working set, no OOM. +echo "==> [install-valkey] building valkey-search with ${CLANG_BIN} (jobs=4, linker=lld)..." +export CC="$CLANG_BIN" +export CXX="$CLANGXX_BIN" +# Force lld as the linker for every link step. CMake propagates LDFLAGS into +# CMAKE_{EXE,SHARED,MODULE}_LINKER_FLAGS at configure time, so this reaches +# the libsearch.so shared-library link where the LTO bitcode lives. Required +# because AL2023's clang20 ships without LLVMgold.so — see comment above the +# clang/lld install loop. +export LDFLAGS="${LDFLAGS:-} -fuse-ld=lld" +./build.sh --jobs=4 + +echo "==> [install-valkey] locating built libsearch.so..." +SO_PATH=$(find . -name 'libsearch.so' -type f 2>/dev/null | head -1) +if [ -z "$SO_PATH" ]; then + echo "ERROR: libsearch.so not found after build" >&2 + find . -name '*.so' -type f 2>/dev/null | head -20 >&2 + exit 1 +fi +echo "==> [install-valkey] found: $SO_PATH" + +install -d -m 0755 /usr/lib/valkey +# 0755 (not 0644) — Valkey's module loader explicitly checks the execute bit +# before dlopen as a safety guard, and refuses to load with "It does not have +# execute permissions." Hit on 2026-05-28 — the AMI booted with valkey in a +# restart loop. Standard convention for shared libs on Linux is 0755. +install -m 0755 "$SO_PATH" /usr/lib/valkey/libsearch.so + +# ─── User + directories + config ──────────────────────────────────────────── + +echo "==> [install-valkey] creating valkey user + dirs..." +if ! 
id -u valkey >/dev/null 2>&1; then
+    useradd --system --no-create-home --shell /sbin/nologin --user-group valkey
+fi
+install -d -o valkey -g valkey -m 0750 /var/lib/valkey /var/log/valkey /var/run/valkey
+install -d -o root -g root -m 0755 /etc/valkey
+
+cat > /etc/valkey/valkey.conf <<'EOF'
+# Nexus appliance — Valkey config (localhost-only).
+bind 127.0.0.1
+port 6379
+protected-mode yes
+supervised systemd
+loglevel notice
+logfile /var/log/valkey/valkey.log
+dir /var/lib/valkey
+appendonly yes
+appendfsync everysec
+maxmemory-policy allkeys-lru
+loadmodule /usr/lib/valkey/libsearch.so
+EOF
+chmod 0644 /etc/valkey/valkey.conf
+
+# ─── Cleanup ────────────────────────────────────────────────────────────────
+
+rm -rf "$BUILD_DIR"
+
+echo "==> [install-valkey] complete (Valkey $VALKEY_VERSION + valkey-search $VALKEY_SEARCH_VERSION)."
diff --git a/nexus-ami/scripts/install.sh b/nexus-ami/scripts/install.sh
new file mode 100755
index 00000000..a4c785c6
--- /dev/null
+++ b/nexus-ami/scripts/install.sh
@@ -0,0 +1,184 @@
+#!/bin/bash
+# install.sh — orchestrator. Runs ONCE during Packer build (NOT per-instance).
+# Assumes: Amazon Linux 2023 base; the Packer file provisioner has uploaded the
+# artifacts tarball to $TARBALL (step 0 extracts it under /tmp/nexus/).
+#
+# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md
+
+set -euo pipefail  # abort on command failure, unset variable, or failed pipeline stage
+
+NEXUS_USER=nexus
+NEXUS_GROUP=nexus                    # must equal NEXUS_USER: step 2 creates the group via useradd --user-group
+INSTALL_DIR=/opt/nexus
+BIN_DIR=$INSTALL_DIR/bin             # Go service binaries (step 4)
+UI_DIR=$INSTALL_DIR/ui               # UI static assets (step 5)
+PRISMA_DIR=$INSTALL_DIR/prisma       # Prisma bundle + set-admin-password.js (step 6)
+NODE_DIR=$INSTALL_DIR/node           # created empty in step 3; presumably filled by install-node-prisma.sh — TODO confirm
+CONFIG_DIR=/etc/nexus                # per-service *.config.yaml files (step 7)
+LOG_DIR=/var/log/nexus
+DATA_DIR=/var/lib/nexus
+STAGING_DIR=/tmp/nexus               # extraction target for $TARBALL; removed at the end of this script
+SCRIPT_DIR=/usr/local/sbin           # first-boot helpers (step 9)
+TARBALL=/tmp/nexus-artifacts.tar.gz  # uploaded by Packer; deleted right after extraction (step 0)
+
+# ─── 0. Extract artifacts tarball uploaded by Packer file provisioner ──────
+# Packer uploads a single artifacts.tar.gz to /tmp/nexus-artifacts.tar.gz
+# (atomic transfer — avoids the recursive-SCP partial-upload bug we hit
+# when the source was a directory tree on slow links).
We extract it under
+# /tmp/nexus/ so the rest of this script can reference $STAGING_DIR/bin/
+# etc. exactly as if Packer had uploaded the directory directly.
+
+echo "==> [install] extracting $TARBALL -> $STAGING_DIR ..."
+if [ ! -f "$TARBALL" ]; then  # fail fast with a clear message instead of a tar error
+    echo "ERROR: tarball not found at $TARBALL — Packer file provisioner did not deliver it" >&2
+    exit 1
+fi
+mkdir -p "$STAGING_DIR"
+tar -C "$STAGING_DIR" -xzf "$TARBALL"
+rm -f "$TARBALL"  # the tarball is not referenced again after extraction
+echo "==> [install] extracted artifacts ($(du -sh "$STAGING_DIR" | awk '{print $1}'))"  # awk keeps only the size column
+
+# ─── 1. Update base OS + install base packages ──────────────────────────────
+
+echo "==> [install] dnf update -y (required for Marketplace scan-clean)..."
+dnf update -y
+# Only firewalld + nginx need installing — openssl, ca-certificates, jq, tar,
+# gzip, rsync, procps-ng, curl-minimal all ship preinstalled in AL2023. We
+# explicitly do NOT install the full `curl` package because it conflicts with
+# the pre-installed curl-minimal (and curl-minimal already provides the curl
+# CLI features the install/first-boot scripts need: -f / -s / -S / -L /
+# --connect-timeout / etc.).
+dnf install -y \
+    firewalld \
+    nginx
+
+# ─── 2. Create system user ──────────────────────────────────────────────────
+
+echo "==> [install] creating nexus system user..."
+if ! id -u "$NEXUS_USER" >/dev/null 2>&1; then  # skip creation when the user already exists
+    useradd --system --no-create-home --shell /sbin/nologin --user-group "$NEXUS_USER"  # --user-group also creates a same-named group
+fi
+
+# ─── 3. Create directory structure ──────────────────────────────────────────
+
+echo "==> [install] creating directory structure..."
+install -d -o root -g root -m 0755 "$INSTALL_DIR" "$BIN_DIR" "$UI_DIR" "$PRISMA_DIR" "$NODE_DIR"  # root-owned, world-readable program tree
+install -d -o root -g "$NEXUS_GROUP" -m 0750 "$CONFIG_DIR" /etc/compliance-proxy  # config dirs: root-owned, readable by the nexus group, closed to others
+install -d -o "$NEXUS_USER" -g "$NEXUS_GROUP" -m 0750 "$LOG_DIR" "$DATA_DIR" \
+    "$DATA_DIR/agentca" \
+    "$DATA_DIR/audit-spool" \
+    "$DATA_DIR/alerting"  # log + data dirs are owned by the nexus user, mode 0750
+
+# ─── 4.
Install Nexus Go binaries ───────────────────────────────────────────
+
+echo "==> [install] installing Nexus Go binaries..."
+for binary in nexus-hub control-plane ai-gateway compliance-proxy; do
+    if [ ! -f "$STAGING_DIR/bin/$binary" ]; then  # abort the build if an expected binary was not staged
+        echo "ERROR: missing binary $STAGING_DIR/bin/$binary" >&2
+        exit 1
+    fi
+    install -o root -g root -m 0755 "$STAGING_DIR/bin/$binary" "$BIN_DIR/$binary"
+done
+
+# ─── 5. Install UI static assets ────────────────────────────────────────────
+
+echo "==> [install] installing UI static assets..."
+if [ ! -d "$STAGING_DIR/ui-dist" ]; then
+    echo "ERROR: missing UI dist at $STAGING_DIR/ui-dist" >&2
+    exit 1
+fi
+rsync -a --delete "$STAGING_DIR/ui-dist/" "$UI_DIR/"  # trailing slashes copy the dist contents; --delete drops stale files in $UI_DIR
+chown -R root:root "$UI_DIR"  # reset whatever ownership rsync -a carried over from staging
+
+# ─── 6. Install Prisma schema + seed + admin-password helper ────────────────
+
+echo "==> [install] installing Prisma schema + seed..."
+if [ ! -d "$STAGING_DIR/prisma" ]; then
+    echo "ERROR: missing prisma bundle at $STAGING_DIR/prisma" >&2
+    exit 1
+fi
+rsync -a --delete "$STAGING_DIR/prisma/" "$PRISMA_DIR/"
+install -o root -g root -m 0755 "$STAGING_DIR/scripts/set-admin-password.js" "$PRISMA_DIR/set-admin-password.js"  # must follow the rsync: --delete would remove it otherwise
+chown -R root:root "$PRISMA_DIR"
+
+# ─── 7. Install service configs ─────────────────────────────────────────────
+
+echo "==> [install] installing prod-shape config files..."
+for svc in nexus-hub control-plane ai-gateway compliance-proxy; do
+    install -o root -g "$NEXUS_GROUP" -m 0640 \
+        "$STAGING_DIR/configs/$svc.config.yaml" "$CONFIG_DIR/$svc.config.yaml"  # 0640 root:nexus — group-readable, not world-readable
+done
+install -o root -g root -m 0644 "$STAGING_DIR/configs/nginx-nexus.conf" /etc/nginx/conf.d/nexus.conf
+rm -f /etc/nginx/conf.d/default.conf  # -f: no error when the file does not exist
+
+# ─── 8. Install systemd units ───────────────────────────────────────────────
+
+echo "==> [install] installing systemd units..."
+install -o root -g root -m 0644 "$STAGING_DIR/systemd/"*.service /etc/systemd/system/  # glob sits outside the quotes so it expands
+
+# ─── 9.
Install first-boot helpers under /usr/local/sbin ────────────────────
+
+echo "==> [install] installing first-boot scripts..."
+install -o root -g root -m 0755 "$STAGING_DIR/scripts/first-boot.sh" "$SCRIPT_DIR/nexus-first-boot"  # installed names drop .sh and gain a nexus- prefix
+install -o root -g root -m 0755 "$STAGING_DIR/scripts/first-boot-secrets.sh" "$SCRIPT_DIR/nexus-first-boot-secrets"
+install -o root -g root -m 0755 "$STAGING_DIR/scripts/first-boot-ca.sh" "$SCRIPT_DIR/nexus-first-boot-ca"
+install -o root -g root -m 0755 "$STAGING_DIR/scripts/first-boot-db.sh" "$SCRIPT_DIR/nexus-first-boot-db"
+
+# ─── 10. Install runtime dependencies (Postgres / Valkey / NATS / Node) ─────
+
+bash "$STAGING_DIR/scripts/install-postgres.sh"  # sub-installers run from staging, so they must precede the final rm -rf "$STAGING_DIR"
+bash "$STAGING_DIR/scripts/install-valkey.sh"
+bash "$STAGING_DIR/scripts/install-nats.sh"
+bash "$STAGING_DIR/scripts/install-node-prisma.sh"
+
+# ─── 11. Configure firewall ─────────────────────────────────────────────────
+
+echo "==> [install] configuring firewalld..."
+systemctl enable firewalld
+systemctl start firewalld  # firewall-cmd below talks to the running daemon
+firewall-cmd --permanent --add-service=ssh
+firewall-cmd --permanent --add-port=443/tcp   # nginx (UI + /api/*)
+firewall-cmd --permanent --add-port=80/tcp    # nginx (HTTP redirect to 443)
+firewall-cmd --permanent --add-port=3050/tcp  # AI Gateway (SDK direct)
+firewall-cmd --permanent --add-port=3128/tcp  # Compliance Proxy CONNECT
+firewall-cmd --reload  # load the --permanent rules into the runtime configuration
+
+# ─── 12. Enable services to start at boot ───────────────────────────────────
+
+echo "==> [install] enabling services..."
+systemctl daemon-reload  # make systemd re-read unit files (units were copied in step 8)
+systemctl enable nginx  # enable only — this step starts none of the services below
+systemctl enable postgresql
+systemctl enable valkey
+systemctl enable nats
+systemctl enable nexus-first-boot.service
+systemctl enable nexus-hub.service
+systemctl enable nexus-control-plane.service
+systemctl enable nexus-gateway.service
+systemctl enable nexus-proxy.service
+
+# ─── 13. Configure logrotate for Nexus log dir ──────────────────────────────
+
+echo "==> [install] writing logrotate config..."
+cat > /etc/logrotate.d/nexus <<'EOF'
+/var/log/nexus/*.log {
+    daily
+    rotate 14
+    compress
+    delaycompress
+    missingok
+    notifempty
+    create 0640 nexus nexus
+    sharedscripts
+    postrotate
+        systemctl reload-or-restart nexus-hub.service nexus-control-plane.service \
+            nexus-gateway.service nexus-proxy.service \
+            > /dev/null 2>&1 || true
+    endscript
+}
+EOF
+
+echo "==> [install] cleaning staging directory..."
+rm -rf "$STAGING_DIR"  # everything needed has been installed out of staging by this point
+
+echo "==> [install] install.sh complete."
diff --git a/nexus-ami/scripts/set-admin-password.js b/nexus-ami/scripts/set-admin-password.js
new file mode 100644
index 00000000..8cdf75f6
--- /dev/null
+++ b/nexus-ami/scripts/set-admin-password.js
@@ -0,0 +1,30 @@
+// set-admin-password.js — generate a scrypt password hash compatible with the
+// NexusUser.passwordHash column. Reads NEW_PASSWORD from env, prints the hash
+// to stdout in "salt_hex:hash_hex" format (no trailing newline is written).
+//
+// Parameters MUST match tools/db-migrate/seed/lib.ts hashPassword():
+//   N = 16384, r = 8, p = 1
+//   salt = 32 bytes (random, freshly generated on every run)
+//   key  = 64 bytes
+//
+// Used by first-boot-db.sh to replace the seeded admin@nexus.ai password
+// with a per-instance random one. This file is shipped to /opt/nexus/prisma/
+// alongside the schema and seed code. Exits with status 1 on invalid input.
+
+'use strict';
+
+const { scryptSync, randomBytes } = require('crypto');
+
+const SALT_LENGTH = 32; // bytes
+const KEY_LENGTH = 64; // bytes of derived key
+const SCRYPT_OPTIONS = { N: 16384, r: 8, p: 1 }; // cost, block size, parallelization
+
+const password = process.env.NEW_PASSWORD;
+if (!password || password.length < 8) { // rejects unset, empty, and shorter-than-8-character values
+  process.stderr.write('set-admin-password: NEW_PASSWORD env must be set and >= 8 chars\n');
+  process.exit(1); // non-zero status, nothing is written to stdout
+}
+
+const salt = randomBytes(SALT_LENGTH);
+const hash = scryptSync(password, salt, KEY_LENGTH, SCRYPT_OPTIONS);
+process.stdout.write(`${salt.toString('hex')}:${hash.toString('hex')}`); // hex salt, colon, hex key