diff --git a/.gitignore b/.gitignore index 295d8d37..905585fe 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,23 @@ node_modules/ dist/ +# nexus-ami staging artifacts populated by nexus-ami/build.sh — the binaries, +# UI dist, and Prisma bundle are derived from the source tree on every build +# and should never be committed. Architecture: +# docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +nexus-ami/artifacts/bin/ +nexus-ami/artifacts/ui-dist/ +nexus-ami/artifacts/prisma/ +nexus-ami/artifacts/scripts/ +nexus-ami/artifacts.tar.gz +# Build-cycle log + tarball rotations created by ./build.sh during iteration. +nexus-ami/artifacts.tar.gz.* +nexus-ami/build.log* +# Packer leaves a manifest + crash log in the working dir. +nexus-ami/packer_cache/ +nexus-ami/manifest.json +nexus-ami/crash.log + # Wails dashboard build artefacts (regenerated by `wails build` / # `wails dev`; bindings are auto-generated from the Go bridge). # Wails build outputs. Everything is generated EXCEPT: diff --git a/Makefile b/Makefile index 71ae8d7c..bda5b8d1 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,8 @@ compliance-proxy-build compliance-proxy-test \ agent-build agent-test \ agent-build-macos agent-package-macos agent-clean-macos \ - agent-build-windows agent-package-windows agent-clean-windows + agent-build-windows agent-package-windows agent-clean-windows \ + ami-build ami-stage # ── Build output convention ──────────────────────────────────────────── # All Go service binaries land in dist/bin// so they @@ -132,3 +133,14 @@ agent-package-windows: agent-build-windows agent-clean-windows: rm -rf dist/windows + +# ── AMI / appliance build (E-OSS marketplace) ──────────────────────── +# Wraps Go binaries + UI dist + Prisma bundle + Packer build into one +# invocation. 
Architecture: +# docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md + +ami-build: + bash nexus-ami/build.sh + +ami-stage: + bash nexus-ami/build.sh --skip-packer diff --git a/README.md b/README.md index 61f3a9a6..d545f96b 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,17 @@ The lateral dotted arrow is the **attestation handoff**: the Agent always egress --- +## Deployment + +| Form factor | How | Status | +|---|---|---| +| **AWS Marketplace AMI / single-instance appliance** | `cd nexus-ami && ./build.sh` — bakes binaries + UI + Prisma + nginx + Postgres + Valkey + NATS into one AL2023 image via Packer | [`nexus-ami/README.md`](./nexus-ami/README.md) for build steps, [`docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md`](./docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md) for design | +| **Local development** | docker-compose + `./scripts/dev-start.sh` (Postgres + Valkey + NATS) and per-service `go run ./cmd//` | See **Quick start** below | +| **VMware / KVM image / bare-metal appliance** | Reuses the same `install.sh` + `harden.sh` from `nexus-ami/scripts/` under a different Packer builder | Future | +| **Container / Kubernetes** | Out of scope for the appliance form factor — separate product line | Future | + +--- + ## Quick start (local development) ### Prerequisites diff --git a/docs/developers/architecture/README.md b/docs/developers/architecture/README.md index 9cc3866a..baf94391 100644 --- a/docs/developers/architecture/README.md +++ b/docs/developers/architecture/README.md @@ -168,6 +168,12 @@ If you are about to edit code in an area that is genuinely **not** covered by an | i18n keys (`t('namespace:section.key')`), locale files (`packages/*/src/i18n/locales/**`), `packages/ui-shared/src/i18n/**` | `docs/developers/architecture/cross-cutting/ui/ui-i18n-architecture.md` | | `useApi` / `useApiMutation` hooks + queryKey shape, `shellRouteConfig.tsx` / 
`Sidebar.tsx` IA, `packages/ui-shared/**` cross-bundle components | `docs/developers/architecture/cross-cutting/ui/ui-shell-architecture.md` | +## Cross-cutting — deployment + +| Editing area / file glob | Read FIRST | +|---|---| +| `nexus-ami/**` — Packer template, install / first-boot / harden scripts, prod-shape `*.config.yaml`, systemd unit files for the AMI / bare-metal appliance form factor | `docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md` | + ## Adding a new arch doc When you ship a new `docs/developers/architecture/**/*-architecture.md`: diff --git a/docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md b/docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md new file mode 100644 index 00000000..473f2862 --- /dev/null +++ b/docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md @@ -0,0 +1,260 @@ +--- +updated: 2026-05-28 +--- + +# AMI / appliance deployment architecture + +Single-box deployment form factor for Nexus Gateway. Packages **all** runtime +dependencies (PostgreSQL 16, Valkey 8 with `valkey-search`, NATS JetStream, +4 Go services, the React UI, and an nginx reverse proxy) into one disk image +managed by systemd. The same artifacts ship as: + +| Target | Wrapped by | +|---|---| +| **AWS Marketplace AMI** | `nexus-ami/nexus.pkr.hcl` (Packer + Amazon Linux 2023) | +| **VMware / KVM image** | future — same `install.sh`, different Packer builder | +| **Bare-metal appliance** | future — same `install.sh` invoked from a kickstart / preseed | + +This doc is the architecture source of truth for **everything under +`nexus-ami/`**. Any change to a config file, systemd unit, install script, +or first-boot script in that directory MUST update this doc in the same +commit (Code/Doc Lockstep — see `.cursor/rules/code-doc-lockstep.mdc`). + +## 1. 
Why one form factor for AMI + bare-metal + +Two distribution channels share the same install logic: + +- **Cloud appliance** — AWS Marketplace AMI (initial target). Customer hits + "Launch", gets a working single-instance Nexus in ~5 minutes. +- **On-prem appliance** — pre-installed disk image / ISO for hardware + shipped to customer sites (future). Same systemd-managed services, same + first-boot secret generation. + +Containerised / Kubernetes deployment is **out of scope** for this doc. If +the project later ships a Helm chart or container Marketplace listing, that +is a separate architecture (`-container-architecture.md`) with its +own dependency wiring (RDS / ElastiCache / managed MQ). + +## 2. Boot sequence (every fresh instance / fresh hardware) + +``` +1. cloud-init / kickstart → network + ec2-user / nexus shell login +2. firewalld → open 443, 3050, 3128, 22; close everything else +3. nexus-first-boot.service → oneshot, gated by /etc/nexus/.initialized + ├─ first-boot-secrets.sh → generate 5 [MUST MATCH] secrets, write + │ /etc/nexus/{nexus-hub,control-plane, + │ ai-gateway,compliance-proxy}.env + ├─ first-boot-ca.sh → generate compliance-proxy MITM CA at + │ /etc/compliance-proxy/{ca.crt,ca.key} + └─ first-boot-db.sh → start postgresql, wait, prisma db push, + prisma db seed, randomise admin password, + write /var/log/nexus/admin-credentials.txt + and /etc/motd +4. postgresql.service → After=nexus-first-boot +5. valkey.service → After=nexus-first-boot +6. nats.service → After=nexus-first-boot +7. nexus-hub.service → After=postgresql valkey nats +8. nexus-control-plane.service → After=nexus-hub +9. nexus-gateway.service → After=nexus-hub +10. nexus-proxy.service → After=nexus-hub +11. nginx.service → After=nexus-control-plane (reverse proxy) +``` + +`/etc/nexus/.initialized` is the idempotency marker. Removing it triggers a +fresh init on next boot (destructive — generates new secrets, re-seeds DB). +Customers should never touch it. + +## 3. 
Filesystem layout + +| Path | Owner | Mode | Contents | +|---|---|---|---| +| `/opt/nexus/bin/` | root:root | 0755 | 4 Go service binaries (immutable, part of AMI) | +| `/opt/nexus/ui/` | root:root | 0755 | Vite-built UI dist (immutable, part of AMI) | +| `/opt/nexus/prisma/` | root:root | 0755 | Prisma schema + seed (immutable, part of AMI) | +| `/etc/nexus/` | root:nexus | 0750 | 4 prod-shape `*.config.yaml` + 4 `*.env` + nginx-nexus.conf + `.initialized` marker | +| `/etc/compliance-proxy/` | root:nexus | 0750 | MITM CA cert + key (generated first-boot) | +| `/var/lib/nexus/` | nexus:nexus | 0750 | Service runtime state (agent CA dir, NDJSON spool, file-backed alerting state) | +| `/var/lib/postgresql/data/` | postgres:postgres | 0700 | PostgreSQL data directory (AL2023 dnf default) | +| `/var/lib/valkey/` | valkey:valkey | 0750 | Valkey AOF + RDB | +| `/var/lib/nats/` | nats:nats | 0750 | NATS JetStream file store | +| `/var/log/nexus/` | nexus:nexus | 0750 | Service log files (rotated by logrotate); also holds `admin-credentials.txt` (mode 0640, root:nexus) | + +## 4. Secret generation (`first-boot-secrets.sh`) + +Five environment variables MUST be unique-per-instance and identical across +the four services that share them (see `.env.example` `[MUST MATCH]` tags): + +| Env var | Used by | Generation | +|---|---|---| +| `INTERNAL_SERVICE_TOKEN` | all 4 | `openssl rand -hex 32` | +| `ADMIN_KEY_HMAC_SECRET` | control-plane, ai-gateway | `openssl rand -hex 32` | +| `CREDENTIAL_ENCRYPTION_KEY` | control-plane, ai-gateway | `openssl rand -hex 32` (AES-256, 64 hex chars) | +| `COMPLIANCE_PROXY_API_TOKEN` | control-plane, compliance-proxy | `openssl rand -hex 32` | +| `AI_GATEWAY_API_TOKEN` | ai-gateway only | `openssl rand -hex 32` | + +Each is written to the appropriate per-service `.env` file under `/etc/nexus/` +which the systemd unit picks up via `EnvironmentFile=`. File mode `0640`, +owner `root:nexus` (services run as `nexus` and read; only root can rewrite). 
+ +`DATABASE_URL`, `REDIS_ADDRS`, `NATS_URL`, `NEXUS_HUB_URL`, +`AUTH_SERVER_URL`, `AUTH_SERVER_JWKS_URL`, `AUTH_SERVER_ISSUER`, +`AI_GATEWAY_URL`, `COMPLIANCE_PROXY_URL`, `COMPLIANCE_PROXY_RUNTIME_URL` — +all bind to `localhost` with fixed ports (see §6), baked into the per-service +`.env` files at first boot. + +## 5. Database initialisation (`first-boot-db.sh`) + +1. `systemctl start postgresql` (synchronous via `--wait`). +2. `psql` create role `nexus` with a per-instance random password; create + database `nexus_gateway` owned by `nexus`. +3. Write the matching `DATABASE_URL=postgresql://nexus:<password>@localhost:5432/nexus_gateway?sslmode=disable` + into every `*.env` file under `/etc/nexus/`. +4. `cd /opt/nexus/prisma && npx prisma db push --skip-generate` to materialise + the schema (no migration history table — fresh instance, no upgrade path + to preserve). +5. `npx tsx seed/seed.ts` to load baseline rows (organisations, IAM, + roles, default settings — see `tools/db-migrate/seed/seed.ts`). +6. Generate a 24-character random admin password, hash it with the same + scrypt parameters the seed uses (`tools/db-migrate/seed/lib.ts` + `hashPassword()` — N=16384, r=8, p=1, salt=32, key=64), and + `UPDATE "NexusUser" SET "passwordHash" = $1 WHERE email = 'admin@nexus.ai'`. +7. Write the plaintext password + login URL + warning to + `/var/log/nexus/admin-credentials.txt` (mode 0640, root:nexus) and append + a one-screen summary to `/etc/motd` so the operator sees it on first SSH. + +`admin@nexus.ai` is the only seeded user that gets a per-instance random +password. All other seeded users (alice / bob / carol / diana etc., listed in +`packages/control-plane-ui/README.md`) keep their dev-time passwords from +the seed and are documented as "demo accounts — disable for production" +in the operator-facing docs. + +## 6. Port map (all bound to `localhost` except nginx, AI Gateway, compliance-proxy CONNECT, and sshd) + +| Port | Service | Binding | Exposed via firewall? 
| +|---|---|---|---| +| 5432 | PostgreSQL | localhost:5432 | no | +| 6379 | Valkey | localhost:6379 | no | +| 4222 | NATS client | localhost:4222 | no | +| 8222 | NATS HTTP monitoring | localhost:8222 | no | +| 3060 | Nexus Hub | localhost:3060 | no | +| 3001 | Control Plane API | localhost:3001 | no (nginx proxies `/api/*`) | +| 3050 | AI Gateway | 0.0.0.0:3050 | **yes** (SDK clients hit this directly) | +| 3040 | Compliance Proxy runtime API | localhost:3040 | no | +| 3128 | Compliance Proxy CONNECT | 0.0.0.0:3128 | **yes** (network-proxied apps) | +| 9090 | Prometheus metrics | localhost:9090 | no | +| 443 | nginx (UI + `/api/*` reverse proxy) | 0.0.0.0:443 | **yes** | +| 22 | sshd | 0.0.0.0:22 | yes (Marketplace standard) | + +The compliance-proxy CA file path (`/etc/compliance-proxy/ca.crt`, +`/etc/compliance-proxy/ca.key`) is hardcoded into the prod-shape config +because the path is also baked into the systemd unit's `ReadWritePaths` and +into the `first-boot-ca.sh` generator — three places must agree. + +## 7. Hardening (`harden.sh`) + +Runs as the **last** Packer provisioner (after `install.sh`). Standard +AWS Marketplace AMI cleanup; without this the AMI fails the Self-Service +Scan and is rejected on submission. 
+ +| Action | Why | +|---|---| +| `rm -f /root/.ssh/authorized_keys /home/*/.ssh/authorized_keys` | No shared SSH keys (customers BYO) | +| `rm -f /etc/ssh/ssh_host_*` | Regenerated on first boot — no shared host keys across instances | +| `sed -i sshd_config` (PasswordAuthentication=no, PermitRootLogin=no) | Hard requirement for AWS Marketplace | +| `passwd -l root` | Lock root password | +| `find / -name authorized_keys -delete` | Recursive scrub | +| `rm -rf /var/lib/postgresql/data/* /var/lib/valkey/* /var/lib/nats/*` | Clear any pg/valkey/nats state accumulated during install validation | +| `truncate -s 0 /etc/machine-id` | Regenerated on first boot | +| `cloud-init clean --logs` | Fresh cloud-init state | +| `dnf clean all` | Shrink AMI size | +| `find /var/log -type f -exec truncate -s 0 {} \;` | No leaked build-time logs | +| `dd if=/dev/zero of=/zerofile && rm /zerofile && sync` | Free-space zeroing — EBS snapshot dedupes better | + +## 8. AMI build pipeline (`nexus-ami/build.sh` → `nexus.pkr.hcl`) + +``` +make build-all → dist/bin// (4 Go binaries) +make control-plane-ui-build → packages/control-plane-ui/dist/ +build.sh stages → nexus-ami/artifacts/ → flatten + copy + tar +packer init . && packer build → AMI ID in us-east-1 +``` + +Packer steps: + +1. Launch an `m5.4xlarge` builder instance (16 vCPU / 64 GB) from the + latest Amazon Linux 2023 AMI. **Must be `m5.4xlarge` (or larger), not + `t3.2xlarge`** — valkey-search 1.x vendors gRPC + Protobuf + Abseil + + ICU as submodules; template-heavy parallel C++ compile is heap-hungry + per translation unit. Empirically, `t3.2xlarge` (32 GB) is OOM-killed + silently mid-ICU-compile after ~11 minutes (kernel OOM-killer kills sshd + before the script can write stderr — no trace in Packer build logs); + 64 GB clears the failure mode. 2026-05-28 build evidence. +1a. 
**Linker = lld, not GNU ld.** `install-valkey.sh` installs `lld${ver}` + alongside `clang${ver}` and exports `LDFLAGS=-fuse-ld=lld` before + invoking valkey-search's `./build.sh`. Reason: valkey-search compiles + with `-flto`, and linking `libsearch.so` requires LTO bitcode handling. + GNU ld delegates LTO to the LLVMgold.so plugin, but AL2023's `clang20` + package **omits** LLVMgold.so (verified 2026-05-28: link failed with + `cannot open /usr/lib64/llvm20/lib64/LLVMgold.so`). lld is LLVM's + native linker and handles LTO bitcode directly without a plugin. +2. `file` provisioner uploads `nexus-ami/artifacts.tar.gz` (single file, + ~120 MB) to `/tmp/nexus-artifacts.tar.gz`. We deliberately do NOT upload + `artifacts/` as a directory — Packer's file provisioner uses recursive + SCP under the hood, which silently drops individual files on slow links + (a problem we hit on China → us-east-1 at ~250 KB/s). A single-file + transfer is atomic and fails loudly. +3. `shell` provisioner runs `scripts/install.sh`. The script first extracts + the tarball to `/tmp/nexus/`, then (~10 minutes total) installs + Postgres, builds Valkey from source, installs NATS, installs Node + + Prisma, places binaries + configs + systemd units. +4. `shell` provisioner runs `scripts/harden.sh` (~30 seconds). +5. Packer snapshots the EBS root volume → registers the AMI. + +Total build time: 15–20 minutes per region (on good links; ++5–10 minutes for the cross-Pacific tarball upload from China). + +## 9. Instance sizing recommendation (Marketplace listing) + +| Tier | Instance type | When | +|---|---|---| +| Minimum | `t3.large` (2 vCPU / 8 GB) | PoC, ≤ 100 traffic events/hour | +| Recommended | `t3.xlarge` (4 vCPU / 16 GB) | Small production, ≤ 10k events/hour | +| Performance | `m5.2xlarge` (8 vCPU / 32 GB) | Production, ≤ 100k events/hour | + +Root volume: **≥ 30 GiB** (Postgres + Valkey + NATS file store + log +retention). Marketplace listing should state this requirement explicitly. + +## 10. 
Out of scope (intentionally) + +- **HA / multi-instance** — by design single-instance. Customers wanting HA + use the Kubernetes / container deployment form factor (separate listing). +- **Schema migration across versions** — pre-GA policy is "fresh install + on every AMI version bump"; customers re-launch a new AMI and re-load + their data via the admin API. Documented as an evaluation product in + the Marketplace listing. +- **External SSO** — AMI ships with the embedded auth server bound to + `localhost`; OIDC federation requires the customer to edit + `/etc/nexus/control-plane.config.yaml` `authServer:` block and restart + the service. +- **TLS termination on a real domain** — AMI ships nginx with a self-signed + cert generated at first boot; documented as "replace with your domain's + cert in `/etc/nexus/tls.{crt,key}` and restart nginx". +- **Agent fleet enrollment from this AMI** — works, but the agent's + bootstrap URL needs to be reachable from the agent host; this is a + network-topology concern documented in the user-facing deployment guide, + not an AMI-side decision. + +## 11. Memory anchors + +- `[[ami_first_boot_5_secrets]]` — five `[MUST MATCH]` secrets must be + written before any Nexus service starts, or services 401 each other. +- `[[ami_random_admin_password_marketplace_safe]]` — random per-instance + admin password is the cheapest defence against the AWS Marketplace + default-credentials finding category. + +## 12. Related docs + +- `.env.example` — canonical env var contract (the AMI honours every + `[MUST MATCH]` tag). +- `docs/developers/architecture/cross-cutting/foundation/configuration-architecture.md` — 4-layer config model the AMI plugs into at L2 (yaml) + L3 (env). +- `nexus-ami/README.md` — operator-facing build / test / publish runbook. 
diff --git a/docs/operators/README.md b/docs/operators/README.md index 67ed22c7..a092ea59 100644 --- a/docs/operators/README.md +++ b/docs/operators/README.md @@ -6,6 +6,7 @@ Documentation for running Nexus Gateway in production. - [`ops/`](./ops/) — deployment and operations guides: - [`deployment.md`](./ops/deployment.md) — bring-up and topologies. + - [`ami-build.md`](./ops/ami-build.md) — build the single-instance appliance AMI (AWS Marketplace). - [`ec2-single-node.md`](./ops/ec2-single-node.md) — a single-node deployment. - [`install-test-env.md`](./ops/install-test-env.md) — a single-host test or staging install. - [`backup-dr.md`](./ops/backup-dr.md) — backup and disaster recovery. diff --git a/docs/operators/ops/ami-build.md b/docs/operators/ops/ami-build.md new file mode 100644 index 00000000..c845af22 --- /dev/null +++ b/docs/operators/ops/ami-build.md @@ -0,0 +1,81 @@ +--- +updated: 2026-05-29 +--- + +# AMI build (single-instance appliance) + +How to build the AWS Marketplace AMI / single-instance appliance image. The +source-of-truth for everything in this guide is [`nexus-ami/README.md`](../../../nexus-ami/README.md); +the design rationale is captured in +[`docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md`](../../developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md). + +## When to use this + +- Cutting a release for the AWS Marketplace listing. +- Producing an image for an on-prem customer that wants a single-VM install. +- Smoke-testing a Packer / install-script change before publishing. + +## Prerequisites + +- Go 1.25+, Node 20+, Packer 1.10+. +- AWS credentials with `AWS_PROFILE=<profile>` exported, granting EC2 + S3 + IAM + permissions in `us-east-1`. +- An EC2 key pair on the target account if you intend to + launch instances (`t3.medium` or larger) from the AMI after build. +- vCPU headroom: the build runs on an `m5.4xlarge` (16 vCPU). 
If the + Standard-family quota is 16 and another instance is already running, stop + it or request a quota bump first (otherwise Packer fails immediately with + `VcpuLimitExceeded`). + +## Build + +```bash +cd nexus-ami +./build.sh # full pipeline (compile + stage + packer build, ~55 min) +./build.sh --skip-packer # CI dry-run — stage only, skip the EC2 launch +``` + +A successful build prints the new AMI id (e.g. `ami-0xxxxxxxx`) and a +snapshot id. + +## After the build + +1. **Share with the Marketplace scanner** (account `679593333241`): + + ```bash + aws ec2 modify-image-attribute --image-id <ami-id> \ + --launch-permission "Add=[{UserId=679593333241}]" \ + --profile <profile> --region us-east-1 + aws ec2 modify-snapshot-attribute --snapshot-id <snapshot-id> \ + --create-volume-permission "Add=[{UserId=679593333241}]" \ + --profile <profile> --region us-east-1 + ``` + +2. **Trigger the AMI scan** in Partner Central → AMI Management Portal. + +3. **Test the AMI**: + + ```bash + aws ec2 run-instances --image-id <ami-id> --instance-type t3.medium \ + --key-name <key-name> --associate-public-ip-address \ + --profile <profile> --region us-east-1 + # SSH in, then: sudo cat /var/log/nexus/admin-credentials.txt + ``` + + Two instances launched from the same AMI MUST have different admin + passwords — that is the most important first-boot invariant. + +## Common failure modes + +| Symptom | Root cause | Fix | +|---|---|---| +| `VcpuLimitExceeded` immediately at `packer build` | Standard-family quota hit because another instance is running | Stop or terminate it, or request a quota raise | +| `Script disconnected unexpectedly` mid-Valkey compile | Build host OOM-killed sshd | Default is `m5.4xlarge`; do not lower | +| `InvalidParameterValue: Character sets beyond ASCII are not supported` at `Modifying attributes on AMI` | Non-ASCII in `ami_description` (e.g. 
em dash) | Keep `nexus.pkr.hcl` `ami_description` ASCII-only | +| First-boot completes but 4 nexus-* services stay `inactive` | Boot-order race — nexus-* tried to start before postgres was up | Already handled by `first-boot.sh`'s tail `kick` block | + +## Iteration cadence + +Plan a **monthly rebuild** to absorb AL2023 + Postgres + Valkey + NATS CVE +patches. `./build.sh` is the single command; wire it into a CI cron once +the AMI is stabilised. diff --git a/nexus-ami/README.md b/nexus-ami/README.md new file mode 100644 index 00000000..d960dffe --- /dev/null +++ b/nexus-ami/README.md @@ -0,0 +1,143 @@ +# Nexus Gateway — AMI / appliance build + +Single-instance, all-in-one Nexus Gateway packaged as an AWS Marketplace AMI. +Same artifacts are the foundation for the future on-prem appliance form factor +(bare-metal / VMware / KVM disk images). + +> **Source of truth for everything in this directory:** +> [`docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md`](../docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md). +> Read it first before changing scripts, configs, or systemd units in this tree. 
+ +## What's in the AMI + +| Layer | Component | Source | +|---|---|---| +| Runtime deps | PostgreSQL 16 | `dnf install postgresql16-server` (AL2023 default) | +| Runtime deps | Valkey 8 + `valkey-search` module | `scripts/install-valkey.sh` (source compile) | +| Runtime deps | NATS Server 2 (JetStream) | `scripts/install-nats.sh` (official binary) | +| Runtime deps | Node.js 20 + Prisma + tsx | `scripts/install-node-prisma.sh` (first-boot only) | +| Runtime deps | nginx | `dnf install nginx` | +| Nexus | Hub binary (3060) | `make nexus-hub-build` → `dist/bin/nexus-hub/nexus-hub` | +| Nexus | Control Plane binary (3001) | `make control-plane-build` | +| Nexus | AI Gateway binary (3050) | `make ai-gateway-build` | +| Nexus | Compliance Proxy binary (3128) | `make compliance-proxy-build` | +| Nexus | Control Plane UI dist | `make control-plane-ui-build` → `packages/control-plane-ui/dist/` | +| Nexus | DB schema + seed | `tools/db-migrate/{schema.prisma, seed/}` | +| Nexus | 4 prod-shape `*.config.yaml` | `artifacts/configs/` | +| Nexus | 7 systemd units | `artifacts/systemd/` | + +## Quick build + +```bash +# Prerequisites: Go 1.25+, Node 20+, Packer 1.10+, AWS credentials. +cd nexus-ami +./build.sh # full pipeline: compile + stage + packer build +./build.sh --skip-packer # stop after staging (CI dry-run) +``` + +The full pipeline takes 20–30 minutes: + +1. `make build-all` — Go binaries (≈ 2 min) +2. `make control-plane-ui-build` — Vite UI dist (≈ 30 s) +3. Stage `artifacts/{bin,ui-dist,prisma}` (≈ 5 s) +4. `packer build` — launches an `m5.4xlarge`, runs `install.sh` (Valkey + source compile is the long pole) + `harden.sh`, snapshots the AMI + (≈ 15–20 min) + +Output: a registered AMI ID in your AWS account (region per +`nexus.pkr.hcl` `aws_region` variable, default `us-east-1`). + +## Test a fresh AMI manually + +```bash +# 1. Launch a t3.xlarge from the AMI you just built. Wait for it to boot. +# 2. 
SSH in with your EC2 key pair: +ssh -i ~/.ssh/your-key.pem ec2-user@<instance-public-ip> + +# 3. Read the per-instance admin credentials: +sudo cat /var/log/nexus/admin-credentials.txt + +# 4. Verify all 9 Nexus-related units are green: +systemctl status nexus-first-boot postgresql valkey nats \ + nexus-hub nexus-control-plane nexus-gateway nexus-proxy nginx + +# 5. Open https://<instance-public-ip>/ in a browser (accept the self-signed cert), +# log in with the credentials from step 3. + +# 6. Launch a SECOND instance from the same AMI and confirm +# /var/log/nexus/admin-credentials.txt contains a DIFFERENT password. +# Per-instance secret uniqueness is the most important first-boot invariant. +``` + +## Self-Service AMI Scan iteration + +Run AWS's Self-Service Scan from the Partner Central → Marketplace +Management Portal. Expect 2–3 rebuild cycles before the scan returns +zero findings. Common first-build hits the scan catches: + +- A package update landed a new CVE — `dnf update -y` is in `install.sh` + so the rebuild self-fixes; just re-run `packer build`. +- An overlooked `authorized_keys` file — re-run `harden.sh` (already + hardened with recursive `find / -name authorized_keys -delete`). +- SSH config not strict enough — `harden.sh` already enforces + `PasswordAuthentication=no`, `PermitRootLogin=no`, + `PermitEmptyPasswords=no`. If the scanner cites a new sshd directive, + add it to `harden.sh`. 
+ +## Directory layout + +``` +nexus-ami/ +├── README.md ← this file +├── nexus.pkr.hcl ← Packer template +├── build.sh ← orchestrator (compile → stage → packer) +├── artifacts/ ← Packer file-provisioner source +│ ├── bin/ ← populated by build.sh (gitignored) +│ ├── ui-dist/ ← populated by build.sh (gitignored) +│ ├── prisma/ ← populated by build.sh (gitignored) +│ ├── configs/ +│ │ ├── nexus-hub.config.yaml +│ │ ├── control-plane.config.yaml +│ │ ├── ai-gateway.config.yaml +│ │ ├── compliance-proxy.config.yaml +│ │ └── nginx-nexus.conf +│ └── systemd/ +│ ├── nexus-first-boot.service +│ ├── valkey.service +│ ├── nats.service +│ ├── nexus-hub.service +│ ├── nexus-control-plane.service +│ ├── nexus-gateway.service +│ └── nexus-proxy.service +└── scripts/ + ├── install.sh ← orchestrator (runs at Packer time) + ├── install-postgres.sh + ├── install-valkey.sh + ├── install-nats.sh + ├── install-node-prisma.sh + ├── first-boot.sh ← orchestrator (runs once per instance) + ├── first-boot-secrets.sh + ├── first-boot-ca.sh + ├── first-boot-db.sh + ├── set-admin-password.js ← Node helper, deployed to /opt/nexus/prisma/ + └── harden.sh ← Marketplace cleanup (LAST provisioner) +``` + +## What's intentionally NOT here + +- **Multi-instance HA / Kubernetes manifests** — the appliance form factor + is single-instance by design. Container / K8s deployment is a separate + product line with its own architecture doc. +- **Schema migration across Nexus versions** — pre-GA policy. Customers + re-launch a new AMI version and re-create their workloads through the + admin API. Documented in the Marketplace listing as an evaluation + product. +- **Real TLS certificate provisioning** — first-boot generates a self-signed + cert at `/etc/nexus/tls.{crt,key}`. Operators replace with a real cert + and `systemctl reload nginx`. + +## Maintenance cadence + +Plan a **monthly rebuild** to absorb AL2023 + Postgres + Valkey + NATS +CVE patches. 
`build.sh` is the single command; wire it into a CI cron +once the AMI is stabilised. diff --git a/nexus-ami/artifacts/configs/ai-gateway.config.yaml b/nexus-ami/artifacts/configs/ai-gateway.config.yaml new file mode 100644 index 00000000..701c07e9 --- /dev/null +++ b/nexus-ami/artifacts/configs/ai-gateway.config.yaml @@ -0,0 +1,87 @@ +# Nexus AI Gateway — prod-shape config for AMI / appliance form factor. +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# Secrets + DATABASE_URL loaded from /etc/nexus/ai-gateway.env at boot. + +server: + port: 3050 + readTimeout: "30s" + writeTimeout: "360s" + +database: + url: "" # env DATABASE_URL + +redis: + mode: standalone + addrs: ["127.0.0.1:6379"] + username: "" + password: "" # env REDIS_PASSWORD + db: 0 + sentinel: + masterName: "" + username: "" + password: "" + cluster: + maxRedirects: 8 + routeRandomly: false + readOnly: false + tls: + enabled: false + insecureSkipVerify: false + caFile: "" + certFile: "" + keyFile: "" + serverName: "" + poolSize: 200 + minIdleConns: 50 + maxRetries: 3 + dialTimeout: 5s + readTimeout: 3s + writeTimeout: 3s + poolTimeout: 4s + +auth: + hmacSecret: "" # env ADMIN_KEY_HMAC_SECRET + credentialMasterKey: "" # env CREDENTIAL_ENCRYPTION_KEY (64 hex chars) + credentialKeyMap: "" + internalServiceToken: "" # env INTERNAL_SERVICE_TOKEN + +log: + level: "info" + format: "json" + file: "/var/log/nexus/ai-gateway.log" + +registry: + nexusHubUrl: "http://127.0.0.1:3060" + +mq: + driver: "nats" + nats: + url: "nats://127.0.0.1:4222" + +cors: + enabled: false + allowedOrigins: [] + allowedMethods: ["GET", "POST", "OPTIONS"] + allowedHeaders: ["Content-Type", "Authorization", "x-nexus-virtual-key", "x-request-id"] + maxAgeSec: 600 + +cache: + enabled: true + ttl: 5m + prefix: "ai-gw:" + broker: true + +otel: + endpoint: "" + serviceName: "nexus-ai-gateway" + +observability: + latencyDetail: true + +routing: + defaultRetryPolicy: + 
maxAttemptsPerTarget: 1 + retryOn: ["network", "timeout", "429", "5xx"] + backoffInitial: 250ms + backoffMax: 5s + backoffJitter: 0.2 diff --git a/nexus-ami/artifacts/configs/compliance-proxy.config.yaml b/nexus-ami/artifacts/configs/compliance-proxy.config.yaml new file mode 100644 index 00000000..01bdcb27 --- /dev/null +++ b/nexus-ami/artifacts/configs/compliance-proxy.config.yaml @@ -0,0 +1,175 @@ +# Nexus Compliance Proxy — prod-shape config for AMI / appliance form factor. +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# CA cert/key generated by first-boot-ca.sh. +# DATABASE_URL + COMPLIANCE_PROXY_API_TOKEN loaded from /etc/nexus/compliance-proxy.env at boot. + +listener: + address: ":3128" + +ca: + certPath: "/etc/compliance-proxy/ca.crt" + keyPath: "/etc/compliance-proxy/ca.key" + +database: + url: "" # env DATABASE_URL + +redis: + mode: standalone + addrs: ["127.0.0.1:6379"] + username: "" + password: "" # env REDIS_PASSWORD + db: 0 + sentinel: + masterName: "" + username: "" + password: "" + cluster: + maxRedirects: 8 + routeRandomly: false + readOnly: false + tls: + enabled: false + insecureSkipVerify: false + caFile: "" + certFile: "" + keyFile: "" + serverName: "" + poolSize: 200 + minIdleConns: 50 + maxRetries: 3 + dialTimeout: 5s + readTimeout: 3s + writeTimeout: 3s + poolTimeout: 4s + +accessControl: + sourceIpAllowlist: + - "10.0.0.0/8" + - "172.16.0.0/12" + - "192.168.0.0/16" + domainAllowlist: + - "api.openai.com:443" + - "*.openai.com:443" + - "api.anthropic.com:443" + - "*.anthropic.com:443" + - "generativelanguage.googleapis.com:443" + - "aistudio.google.com:443" + - "api.deepseek.com:443" + - "api.x.ai:443" + - "api.moonshot.cn:443" + - "open.bigmodel.cn:443" + - "api.minimax.chat:443" + - "copilot-proxy.githubusercontent.com:443" + internalNetworkExceptions: [] + +connections: + maxConcurrentTunnels: 10000 + maxStreamsPerConnection: 100 + idleTimeout: "300s" + shutdownGracePeriod: "30s" + 
+upstream: + maxConnsPerHost: 100 + idleConnTimeout: "90s" + dialTimeout: "10s" + +limits: + requestBodyLimit: "10MB" + responseBodyLimit: "10MB" + sseBufferLimit: "8MB" + +log: + level: "info" + format: "json" + file: "/var/log/nexus/compliance-proxy.log" + +metrics: + address: ":9090" + +runtimeApi: + listenAddress: "127.0.0.1:3040" + +# Required by the config validator (added 2026-05-28 after the upstream merge +# made these top-level checks). All three blocks point at the co-located Hub / +# NATS / internal-service-token — same pattern as control-plane.config.yaml. +mq: + driver: "nats" + nats: + url: "nats://127.0.0.1:4222" + +registry: + nexusHubUrl: "http://127.0.0.1:3060" + +auth: + internalServiceToken: "" # env INTERNAL_SERVICE_TOKEN (set by first-boot-secrets; [MUST MATCH] across services) + +compliance: + enabled: true + perHookTimeoutMs: 5000 + totalTimeoutMs: 15000 + parallelHooks: false + checkpointChars: 500 + redactionRulesPath: "" + rejectResponse: + defaultLevel: 1 + contactInfo: "Contact administrator" + hooks: + - implementationId: "keyword-filter" + name: "Default Keyword Filter" + priority: 10 + enabled: false + stage: "request" + failBehavior: "fail-open" + timeoutMs: 5000 + applicableIngress: "ALL" + config: + patterns: [] + caseSensitive: false + - implementationId: "pii-detector" + name: "Default PII Detector" + priority: 20 + enabled: false + stage: "request" + failBehavior: "fail-open" + timeoutMs: 5000 + applicableIngress: "ALL" + config: + types: ["email", "phone", "ssn", "credit_card"] + action: "reject_hard" + +alerting: + enabled: true + evalIntervalSec: 30 + webhook: + url: "" + headers: {} + timeoutSec: 10 + cooldown: + fireMinutes: 5 + resolveMinutes: 5 + persistenceDir: "/var/lib/nexus/alerting" + +audit: + enabled: true + batch: + size: 10 + flushIntervalMs: 500 + channelBufferSize: 1000 + adaptiveFlush: false + flushIntervalMinMs: 500 + flushIntervalMaxMs: 10000 + adaptiveBatchSize: false + batchSizeMin: 10 + batchSizeMax: 
500 + ndjson: + enabled: true + dir: "/var/lib/nexus/audit-spool" + maxFileSizeMB: 100 + maxTotalSizeMB: 1000 + pinning: + exemptions: [] + autoExempt: + enabled: true + failureThreshold: 3 + windowSeconds: 3600 + exemptionDurationSeconds: 86400 diff --git a/nexus-ami/artifacts/configs/control-plane.config.yaml b/nexus-ami/artifacts/configs/control-plane.config.yaml new file mode 100644 index 00000000..da000c66 --- /dev/null +++ b/nexus-ami/artifacts/configs/control-plane.config.yaml @@ -0,0 +1,97 @@ +# Nexus Control Plane — prod-shape config for AMI / appliance form factor. +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# Secrets + DATABASE_URL loaded from /etc/nexus/control-plane.env at boot. + +server: + port: 3001 + shutdownTimeout: "10s" + +database: + url: "" # env DATABASE_URL + maxConns: 25 + minConns: 5 + maxConnLifetime: "300s" + +redis: + mode: standalone + addrs: ["127.0.0.1:6379"] + username: "" + password: "" # env REDIS_PASSWORD + db: 0 + sentinel: + masterName: "" + username: "" + password: "" + cluster: + maxRedirects: 8 + routeRandomly: false + readOnly: false + tls: + enabled: false + insecureSkipVerify: false + caFile: "" + certFile: "" + keyFile: "" + serverName: "" + poolSize: 200 + minIdleConns: 50 + maxRetries: 3 + dialTimeout: 5s + readTimeout: 3s + writeTimeout: 3s + poolTimeout: 4s + +log: + level: "info" + format: "json" + file: "/var/log/nexus/control-plane.log" + +bff: + complianceProxyUrl: "http://127.0.0.1:3040" + aiGatewayUrl: "http://127.0.0.1:3050" + complianceProxyRuntimeUrl: "http://127.0.0.1:3040" + complianceProxyApiToken: "" # env COMPLIANCE_PROXY_API_TOKEN + +registry: + nexusHubUrl: "http://127.0.0.1:3060" + +auth: + internalServiceToken: "" # env INTERNAL_SERVICE_TOKEN + +crypto: + encryptionKey: "" # env CREDENTIAL_ENCRYPTION_KEY (64 hex chars) + encryptionPassphrase: "" + encryptionSalt: "" + credentialKeyMap: "" + production: true + +retention: + auditLogDays: 90 + 
adminAuditLogDays: 365 + metricRollupDays: 365 + agentAuditDays: 90 + +agent: + caDir: "/var/lib/nexus/agentca" + +otel: + endpoint: "" + serviceName: "nexus-control-plane" + +scheduler: + enabled: true + +mq: + driver: "nats" + nats: + url: "nats://127.0.0.1:4222" + +# OIDC issuer for tokens minted by this Control Plane. The issuer claim must +# match the URL clients reach the CP on so JWKS fetch + iss-claim validation +# work. Issuer value is per-instance — first-boot stamps AUTH_SERVER_ISSUER +# into /etc/nexus/control-plane.env using the detected public IP. The empty +# string here is a placeholder so the env override hook fires; see +# configuration-architecture.md L3 > L2. +authServer: + issuer: "" # env AUTH_SERVER_ISSUER + keystoreDir: "/var/lib/nexus/authkeys" diff --git a/nexus-ami/artifacts/configs/nexus-hub.config.yaml b/nexus-ami/artifacts/configs/nexus-hub.config.yaml new file mode 100644 index 00000000..a44ecf27 --- /dev/null +++ b/nexus-ami/artifacts/configs/nexus-hub.config.yaml @@ -0,0 +1,97 @@ +# Nexus Hub — prod-shape config for AMI / appliance form factor. +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# +# All secrets and infra URLs are blank here — first-boot generates per-instance +# values into /etc/nexus/nexus-hub.env which systemd loads via EnvironmentFile=. +# Env values override every blank field below (L3 > L2 per configuration-architecture.md). 
+ +server: + port: 3060 + readTimeout: 30s + writeTimeout: 30s + shutdownTimeout: 15s + +database: + url: "" # env DATABASE_URL + maxConns: 20 + minConns: 5 + +redis: + mode: standalone + addrs: ["127.0.0.1:6379"] + username: "" + password: "" # env REDIS_PASSWORD + db: 0 + sentinel: + masterName: "" + username: "" + password: "" + cluster: + maxRedirects: 8 + routeRandomly: false + readOnly: false + tls: + enabled: false + insecureSkipVerify: false + caFile: "" + certFile: "" + keyFile: "" + serverName: "" + poolSize: 200 + minIdleConns: 50 + maxRetries: 3 + dialTimeout: 5s + readTimeout: 3s + writeTimeout: 3s + poolTimeout: 4s + +mq: + driver: "nats" + nats: + url: "nats://127.0.0.1:4222" + +consumers: + enabled: true + batchSize: 100 + flushInterval: 5s + siem: + enabled: false + url: "" + headers: {} + format: "json" + batchSize: 200 + flushInterval: 5s + eventTypes: [] + +scheduler: + enabled: true + driftCheckInterval: 60s + identityEnrichInterval: 5m + enableAgentRollup: false + +auth: + internalServiceToken: "" # env INTERNAL_SERVICE_TOKEN + +authServer: + url: "http://127.0.0.1:3001" + jwksURL: "http://127.0.0.1:3001/.well-known/jwks.json" + issuer: "http://127.0.0.1:3001" + +agentCA: + certFile: "" + keyFile: "" + dir: "/var/lib/nexus/agentca" + +otel: + enabled: false + endpoint: "" + +log: + level: "info" + format: "json" + file: "/var/log/nexus/nexus-hub.log" + +hub: + id: "hub-appliance-1" + advertiseAddr: "http://127.0.0.1:3060" + allowedOrigins: [] diff --git a/nexus-ami/artifacts/configs/nginx-nexus.conf b/nexus-ami/artifacts/configs/nginx-nexus.conf new file mode 100644 index 00000000..30005a73 --- /dev/null +++ b/nexus-ami/artifacts/configs/nginx-nexus.conf @@ -0,0 +1,87 @@ +# Nexus Gateway — nginx reverse proxy for the AMI / appliance form factor. +# Serves the Vite-built React UI on :443 and proxies /api/* to the Control +# Plane on localhost:3001. 
Architecture: +# docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# +# Operator note: the self-signed cert at /etc/nexus/tls.{crt,key} is generated +# by first-boot-ca.sh on first launch. For production, replace those two files +# with a cert signed for your hostname and `systemctl reload nginx`. + +server { + listen 80 default_server; + server_name _; + return 301 https://$host$request_uri; +} + +server { + listen 443 ssl default_server; + server_name _; + + ssl_certificate /etc/nexus/tls.crt; + ssl_certificate_key /etc/nexus/tls.key; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers HIGH:!aNULL:!MD5; + + root /opt/nexus/ui; + index index.html; + + client_max_body_size 32m; + + # Vite SPA fallback — every unmatched path serves index.html so the + # client-side router takes over. + location / { + try_files $uri $uri/ /index.html; + } + + # Admin API + auth-server endpoints (both live in the control-plane + # binary on :3001). + location /api/ { + proxy_pass http://127.0.0.1:3001; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 300s; + proxy_send_timeout 300s; + } + + location /.well-known/ { + proxy_pass http://127.0.0.1:3001; + proxy_set_header Host $host; + } + + # OAuth/OIDC auth-server endpoints (authorization endpoint, token + # endpoint, userinfo, revoke, logout — all live in the control-plane + # binary). Without this location block /oauth/authorize falls through + # to the SPA `try_files` handler, the SPA can't process the PKCE + # parameters, bounces to /login, /login bounces to /oauth/authorize, + # infinite loop. Hit on 2026-05-29 first-user-test of build #10. 
+ location /oauth/ { + proxy_pass http://127.0.0.1:3001; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # Auth-server pre-bearer endpoints (`/authserver/idps`, `/authserver/ + # password`, `/authserver/idp/{id}/start`). The SPA's LoginPage calls + # these to list IDPs and post password credentials. Without the proxy + # they return the SPA index.html (200 with HTML body), which the SPA's + # JSON parser misreads as "loadProvidersFailed" — surfaced to the + # operator as "Unable to load sign-in methods". Hit on 2026-05-29. + location /authserver/ { + proxy_pass http://127.0.0.1:3001; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # Health endpoint exposed for ELB / customer monitoring without + # authentication. 
+ location = /healthz { + proxy_pass http://127.0.0.1:3001/api/healthz; + proxy_set_header Host $host; + } +} diff --git a/nexus-ami/artifacts/systemd/nats.service b/nexus-ami/artifacts/systemd/nats.service new file mode 100644 index 00000000..b8b7bb99 --- /dev/null +++ b/nexus-ami/artifacts/systemd/nats.service @@ -0,0 +1,25 @@ +[Unit] +Description=NATS Server with JetStream (event streaming + Hub coordination MQ) +Documentation=https://docs.nats.io/ +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=nats +Group=nats +ExecStart=/usr/local/bin/nats-server --config /etc/nats/nats-server.conf +ExecReload=/bin/kill -HUP $MAINPID +Restart=on-failure +RestartSec=5 +TimeoutStopSec=30 + +NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=strict +ProtectHome=yes +ReadWritePaths=/var/lib/nats /var/log/nats +LimitNOFILE=65535 + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/artifacts/systemd/nexus-control-plane.service b/nexus-ami/artifacts/systemd/nexus-control-plane.service new file mode 100644 index 00000000..bde4a977 --- /dev/null +++ b/nexus-ami/artifacts/systemd/nexus-control-plane.service @@ -0,0 +1,25 @@ +[Unit] +Description=Nexus Control Plane (admin API / BFF, IAM, SSO, analytics) +Documentation=https://github.com/AlphaBitCore/nexus-gateway +After=network-online.target nexus-hub.service +Requires=nexus-hub.service +Wants=network-online.target + +[Service] +Type=simple +User=nexus +Group=nexus +EnvironmentFile=/etc/nexus/control-plane.env +ExecStart=/opt/nexus/bin/control-plane -config /etc/nexus/control-plane.config.yaml +WorkingDirectory=/var/lib/nexus +Restart=on-failure +RestartSec=5 + +NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=strict +ProtectHome=yes +ReadWritePaths=/var/lib/nexus /var/log/nexus + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/artifacts/systemd/nexus-first-boot.service b/nexus-ami/artifacts/systemd/nexus-first-boot.service new file mode 100644 index 00000000..6470f709 
--- /dev/null +++ b/nexus-ami/artifacts/systemd/nexus-first-boot.service @@ -0,0 +1,26 @@ +[Unit] +Description=Nexus Gateway first-boot initialization (generates per-instance secrets, MITM CA, DB schema + seed, random admin password) +Documentation=https://github.com/AlphaBitCore/nexus-gateway +# NOTE: this unit does NOT Require=postgresql.service AND it does NOT +# Before=postgresql.service. first-boot-db.sh calls `systemctl start +# postgresql` from inside this unit; either dependency would deadlock +# (systemd refuses to activate postgresql until first-boot finishes, +# first-boot blocks waiting for postgresql to come up, TimeoutStartSec +# kills first-boot 300 s later). Both keywords share the same hazard — +# Before= alone deadlocks even without Requires=. Hit on 2026-05-28 +# first-launch test, fixed by dropping postgresql.service from Before=. +After=network-online.target +Wants=network-online.target +Before=nexus-hub.service nexus-control-plane.service nexus-gateway.service nexus-proxy.service +ConditionPathExists=!/etc/nexus/.initialized + +[Service] +Type=oneshot +ExecStart=/usr/local/sbin/nexus-first-boot +RemainAfterExit=yes +TimeoutStartSec=300 +StandardOutput=journal+console +StandardError=journal+console + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/artifacts/systemd/nexus-gateway.service b/nexus-ami/artifacts/systemd/nexus-gateway.service new file mode 100644 index 00000000..87cd576a --- /dev/null +++ b/nexus-ami/artifacts/systemd/nexus-gateway.service @@ -0,0 +1,25 @@ +[Unit] +Description=Nexus AI Gateway (/v1 SDK traffic, provider adapters, routing, quota) +Documentation=https://github.com/AlphaBitCore/nexus-gateway +After=network-online.target nexus-hub.service +Requires=nexus-hub.service +Wants=network-online.target + +[Service] +Type=simple +User=nexus +Group=nexus +EnvironmentFile=/etc/nexus/ai-gateway.env +ExecStart=/opt/nexus/bin/ai-gateway -config /etc/nexus/ai-gateway.config.yaml +WorkingDirectory=/var/lib/nexus 
+Restart=on-failure +RestartSec=5 + +NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=strict +ProtectHome=yes +ReadWritePaths=/var/lib/nexus /var/log/nexus + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/artifacts/systemd/nexus-hub.service b/nexus-ami/artifacts/systemd/nexus-hub.service new file mode 100644 index 00000000..afc7c15d --- /dev/null +++ b/nexus-ami/artifacts/systemd/nexus-hub.service @@ -0,0 +1,25 @@ +[Unit] +Description=Nexus Hub (control-plane kernel: Thing Registry, Device Shadow, config sync, agent CA) +Documentation=https://github.com/AlphaBitCore/nexus-gateway +After=network-online.target nexus-first-boot.service postgresql.service valkey.service nats.service +Requires=nexus-first-boot.service postgresql.service valkey.service nats.service +Wants=network-online.target + +[Service] +Type=simple +User=nexus +Group=nexus +EnvironmentFile=/etc/nexus/nexus-hub.env +ExecStart=/opt/nexus/bin/nexus-hub -config /etc/nexus/nexus-hub.config.yaml +WorkingDirectory=/var/lib/nexus +Restart=on-failure +RestartSec=5 + +NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=strict +ProtectHome=yes +ReadWritePaths=/var/lib/nexus /var/log/nexus + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/artifacts/systemd/nexus-proxy.service b/nexus-ami/artifacts/systemd/nexus-proxy.service new file mode 100644 index 00000000..87cc49ff --- /dev/null +++ b/nexus-ami/artifacts/systemd/nexus-proxy.service @@ -0,0 +1,25 @@ +[Unit] +Description=Nexus Compliance Proxy (transparent TLS proxy, MITM, compliance pipeline) +Documentation=https://github.com/AlphaBitCore/nexus-gateway +After=network-online.target nexus-hub.service +Requires=nexus-hub.service +Wants=network-online.target + +[Service] +Type=simple +User=nexus +Group=nexus +EnvironmentFile=/etc/nexus/compliance-proxy.env +ExecStart=/opt/nexus/bin/compliance-proxy -config /etc/nexus/compliance-proxy.config.yaml +WorkingDirectory=/var/lib/nexus +Restart=on-failure +RestartSec=5 + 
+NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=strict +ProtectHome=yes +ReadWritePaths=/var/lib/nexus /var/log/nexus /etc/compliance-proxy + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/artifacts/systemd/valkey.service b/nexus-ami/artifacts/systemd/valkey.service new file mode 100644 index 00000000..9a000e32 --- /dev/null +++ b/nexus-ami/artifacts/systemd/valkey.service @@ -0,0 +1,32 @@ +[Unit] +Description=Valkey 8 (Redis-wire-compatible cache with valkey-search module) +Documentation=https://valkey.io/ +After=network-online.target +Wants=network-online.target + +[Service] +Type=notify +User=valkey +Group=valkey +ExecStart=/usr/local/bin/valkey-server /etc/valkey/valkey.conf --supervised systemd +ExecStop=/usr/local/bin/valkey-cli -h 127.0.0.1 -p 6379 shutdown nosave +Restart=on-failure +RestartSec=5 +TimeoutStopSec=30 + +NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=strict +ProtectHome=yes +# /run is tmpfs and is wiped on every boot — the /var/run/valkey directory +# install-valkey.sh creates at AMI build time does NOT survive AMI snapshot + +# fresh boot. Without RuntimeDirectory= here, systemd's namespace setup fails +# 226/NAMESPACE because ReadWritePaths can't bind a missing directory. Hit on +# 2026-05-28 first-launch test. +RuntimeDirectory=valkey +RuntimeDirectoryMode=0750 +ReadWritePaths=/var/lib/valkey /var/log/valkey +LimitNOFILE=65535 + +[Install] +WantedBy=multi-user.target diff --git a/nexus-ami/build.sh b/nexus-ami/build.sh new file mode 100755 index 00000000..e1a8996b --- /dev/null +++ b/nexus-ami/build.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +# build.sh — staging wrapper. Compiles all Nexus binaries + UI dist + bundles +# the Prisma schema, then invokes `packer build`. 
+# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# +# Usage: +# cd nexus-ami +# ./build.sh # full pipeline (binaries + UI + packer) +# ./build.sh --skip-packer # stage artifacts only; don't run packer (for CI dry-run) +# ./build.sh --stage-only # alias for --skip-packer +# +# Prerequisites: +# - Go 1.25+ (`make build-all` driver) +# - Node 20+ (`make control-plane-ui-build`) +# - Packer 1.10+ (https://www.packer.io/) unless --skip-packer +# - AWS credentials in environment (AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY +# or AWS_PROFILE) unless --skip-packer + +set -euo pipefail + +SKIP_PACKER=false +for arg in "$@"; do + case "$arg" in + --skip-packer|--stage-only) SKIP_PACKER=true ;; + -h|--help) + sed -n '2,18p' "$0" + exit 0 + ;; + *) echo "ERROR: unknown flag $arg" >&2; exit 1 ;; + esac +done + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +ARTIFACTS_DIR="$SCRIPT_DIR/artifacts" + +echo "==> [build] cleaning previous staging dirs..." +rm -rf "$ARTIFACTS_DIR/bin" "$ARTIFACTS_DIR/ui-dist" "$ARTIFACTS_DIR/prisma" "$ARTIFACTS_DIR/scripts" +rm -f "$SCRIPT_DIR/artifacts.tar.gz" +mkdir -p "$ARTIFACTS_DIR/bin" "$ARTIFACTS_DIR/ui-dist" "$ARTIFACTS_DIR/prisma" + +# ─── 1. Build Go binaries ────────────────────────────────────────────────── + +echo "==> [build] compiling Nexus Go binaries (make build-all)..." +cd "$REPO_ROOT" +GOOS=linux GOARCH=amd64 CGO_ENABLED=0 make \ + nexus-hub-build control-plane-build ai-gateway-build compliance-proxy-build + +for svc in nexus-hub control-plane ai-gateway compliance-proxy; do + src="$REPO_ROOT/dist/bin/$svc/$svc" + [ -x "$src" ] || { echo "ERROR: missing $src" >&2; exit 1; } + cp "$src" "$ARTIFACTS_DIR/bin/$svc" +done + +# ─── 2. Build Control Plane UI Vite dist ─────────────────────────────────── + +echo "==> [build] building Control Plane UI (Vite)..." 
+cd "$REPO_ROOT" +make control-plane-ui-build + +ui_dist="$REPO_ROOT/packages/control-plane-ui/dist" +[ -d "$ui_dist" ] || { echo "ERROR: missing UI dist at $ui_dist" >&2; exit 1; } +cp -r "$ui_dist"/. "$ARTIFACTS_DIR/ui-dist/" + +# ─── 3. Bundle Prisma schema + seed ──────────────────────────────────────── + +echo "==> [build] bundling Prisma schema + seed..." +cd "$REPO_ROOT/tools/db-migrate" +cp schema.prisma "$ARTIFACTS_DIR/prisma/" +cp package.json package-lock.json "$ARTIFACTS_DIR/prisma/" +cp -r seed "$ARTIFACTS_DIR/prisma/seed" +cp prisma.config.ts "$ARTIFACTS_DIR/prisma/" +cp -r migrations "$ARTIFACTS_DIR/prisma/migrations" 2>/dev/null || true + +# ─── 3b. Bundle scripts/ into artifacts/scripts/ ─────────────────────────── +# Packer's file provisioner needs the destination dir to exist before scp can +# upload into it. Bundling scripts/ as a subdir of artifacts/ means one +# `file` provisioner uploads everything in one shot (see nexus.pkr.hcl). + +echo "==> [build] bundling scripts/ into artifacts/scripts/..." +cp -r "$SCRIPT_DIR/scripts" "$ARTIFACTS_DIR/scripts" + +# ─── 4. Show what we staged ──────────────────────────────────────────────── + +echo "==> [build] artifact tree:" +( cd "$ARTIFACTS_DIR" && find . -maxdepth 3 -type d -print ) | sed 's|^| |' + +# ─── 4b. Compress artifacts/ → artifacts.tar.gz ──────────────────────────── +# Packer's file provisioner uses recursive SCP. For our 234 MB payload over +# slow links (e.g., China → us-east-1), SCP silently drops individual files +# on transient connection blips — leading to "missing binary" errors at +# install.sh time with no upload-side error message. Tarballing makes the +# transfer atomic (one file → succeed or fail as a whole) AND faster +# (gzipped Go binaries compress to ~40-50% of their uncompressed size). + +TARBALL="$SCRIPT_DIR/artifacts.tar.gz" +echo "==> [build] compressing artifacts/ → artifacts.tar.gz ..." +rm -f "$TARBALL" +tar -C "$ARTIFACTS_DIR" -czf "$TARBALL" . 
+echo "==> [build] tarball: $(du -h "$TARBALL" | awk '{print $1}') (vs $(du -sh "$ARTIFACTS_DIR" | awk '{print $1}') uncompressed)" + +# ─── 5. packer build ─────────────────────────────────────────────────────── + +if $SKIP_PACKER; then + echo "==> [build] --skip-packer: stopping here. Run 'cd $SCRIPT_DIR && packer init . && packer build nexus.pkr.hcl' yourself." + exit 0 +fi + +if ! command -v packer >/dev/null 2>&1; then + echo "ERROR: packer is not installed (https://www.packer.io/downloads). Pass --skip-packer to stop after staging." >&2 + exit 1 +fi + +cd "$SCRIPT_DIR" +echo "==> [build] packer init ..." +packer init . +echo "==> [build] packer build ..." +packer build nexus.pkr.hcl diff --git a/nexus-ami/nexus.pkr.hcl b/nexus-ami/nexus.pkr.hcl new file mode 100644 index 00000000..0483f90f --- /dev/null +++ b/nexus-ami/nexus.pkr.hcl @@ -0,0 +1,109 @@ +# Nexus Gateway — Packer template for the AMI / appliance form factor. +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# +# Build: cd nexus-ami && packer init . && packer build nexus.pkr.hcl +# Variables: pass via -var "nexus_version=0.1.0" or set the PKR_VAR_nexus_version env var. + +packer { + required_plugins { + amazon = { + version = ">= 1.3.0" + source = "github.com/hashicorp/amazon" + } + } +} + +variable "nexus_version" { + type = string + default = "0.1.0" +} + +variable "aws_region" { + type = string + default = "us-east-1" +} + +variable "instance_type" { + type = string + # m5.4xlarge (16 vCPU / 64 GB) needed because valkey-search 1.x vendors + # gRPC + Protobuf + Abseil + ICU as submodules; template-heavy parallel C++ + # compile is heap-hungry per translation unit. A 2026-05-28 build on + # t3.2xlarge (32 GB) was OOM-killed silently mid-ICU-compile after 11 + # minutes — kernel OOM-killer leaves no trace in build logs (sshd dies + # before the script can write stderr). 64 GB gives the headroom the 32 GB + # tier was supposed to but no longer does. 
+ default = "m5.4xlarge" +} + +variable "root_volume_size_gb" { + type = number + default = 30 # Postgres + Valkey + NATS file store + log headroom. +} + +source "amazon-ebs" "nexus" { + region = var.aws_region + instance_type = var.instance_type + + ami_name = "nexus-gateway-${var.nexus_version}-{{timestamp}}" + # ami_description is ASCII-only: AWS ModifyImageAttribute rejects non-ASCII + # (we hit this on 2026-05-28: em dash U+2014 → InvalidParameterValue; the + # AMI was deregistered and the snapshot deleted at the end of the build). + ami_description = "Nexus Gateway ${var.nexus_version} - single-instance AI traffic gateway appliance (OSS, Apache 2.0)" + + source_ami_filter { + filters = { + name = "al2023-ami-2023.*-x86_64" + virtualization-type = "hvm" + root-device-type = "ebs" + } + owners = ["amazon"] + most_recent = true + } + + ssh_username = "ec2-user" + + launch_block_device_mappings { + device_name = "/dev/xvda" + volume_size = var.root_volume_size_gb + volume_type = "gp3" + delete_on_termination = true + } + + ami_block_device_mappings { + device_name = "/dev/xvda" + volume_size = var.root_volume_size_gb + volume_type = "gp3" + } + + tags = { + Name = "nexus-gateway-${var.nexus_version}" + Product = "Nexus Gateway" + Version = var.nexus_version + BuildToolchain = "packer+al2023" + } +} + +build { + name = "nexus-gateway-ami" + sources = ["source.amazon-ebs.nexus"] + + # Upload artifacts.tar.gz (built by build.sh) as a single file. We avoid + # uploading artifacts/ as a directory because Packer's file provisioner uses + # recursive SCP — over slow links it silently drops individual files when + # the connection blips, causing "missing binary" errors at install.sh time. + # A single-file SCP is atomic: either the whole tarball lands or the + # transfer errors loudly. install.sh extracts the tarball before doing + # anything else. 
+ provisioner "file" { + source = "artifacts.tar.gz" + destination = "/tmp/nexus-artifacts.tar.gz" + } + + provisioner "shell" { + execute_command = "sudo -E bash '{{.Path}}'" + scripts = [ + "scripts/install.sh", + "scripts/harden.sh", + ] + } +} diff --git a/nexus-ami/scripts/first-boot-ca.sh b/nexus-ami/scripts/first-boot-ca.sh new file mode 100755 index 00000000..0dba2184 --- /dev/null +++ b/nexus-ami/scripts/first-boot-ca.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# first-boot-ca.sh — generate per-instance certificates. +# +# Two CAs / keypairs are produced: +# 1. /etc/compliance-proxy/{ca.crt,ca.key} — Compliance Proxy MITM CA +# used to mint leaf certs for upstream provider domains. +# 2. /etc/nexus/{tls.crt,tls.key} — nginx HTTPS cert (self-signed, +# CN=nexus-gateway). The operator is expected to replace this with a +# real cert signed for their hostname in production. +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md + +set -euo pipefail + +PROXY_CA_DIR=/etc/compliance-proxy +NEXUS_DIR=/etc/nexus + +# Idempotent — re-issuing the MITM CA invalidates every agent's trust store +# entry (operators have to redistribute the new ca.crt). Re-issuing the nginx +# cert is harmless but pointless. +if [ -f "$PROXY_CA_DIR/ca.crt" ] && [ -f "$NEXUS_DIR/tls.crt" ]; then + echo "[first-boot-ca] CAs already present; skipping (idempotent)." + exit 0 +fi + +echo "[first-boot-ca] generating Compliance Proxy MITM CA..." + +# ECDSA P-256 — small + fast leaf signing; matches dev CA shape used by +# packages/compliance-proxy/dev-certs/. 
+openssl ecparam -genkey -name prime256v1 -noout -out "$PROXY_CA_DIR/ca.key" +openssl req -x509 -new -nodes -key "$PROXY_CA_DIR/ca.key" -sha256 -days 3650 \ + -subj "/CN=Nexus Compliance Proxy CA/O=Nexus Gateway" \ + -out "$PROXY_CA_DIR/ca.crt" + +chmod 0640 "$PROXY_CA_DIR/ca.crt" "$PROXY_CA_DIR/ca.key" +chown root:nexus "$PROXY_CA_DIR/ca.crt" "$PROXY_CA_DIR/ca.key" + +echo "[first-boot-ca] generating nginx HTTPS self-signed cert..." + +# Detect the instance's reachable IPs so the cert SAN covers everything Go's +# default TLS client will check against. Without IP SANs, Go's HTTPS client +# rejects `https://{ip}/.well-known/jwks.json` with x509: cannot validate +# certificate for {ip} because it doesn't contain any IP SANs — tokens are +# issued correctly at /oauth/token but cannot be verified at /api/admin/me, +# the SPA bounces back to /login on every login attempt. Hit on 2026-05-29. +TOKEN=$(curl -fsS -X PUT "http://169.254.169.254/latest/api/token" \ + -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" -m 3 2>/dev/null || true) +PUBLIC_IP="" +LOCAL_IP="" +if [ -n "$TOKEN" ]; then + PUBLIC_IP=$(curl -fsS -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/public-ipv4 -m 3 2>/dev/null || true) + LOCAL_IP=$(curl -fsS -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/local-ipv4 -m 3 2>/dev/null || true) +fi +SAN="IP:127.0.0.1,DNS:nexus-gateway,DNS:localhost" +[ -n "$PUBLIC_IP" ] && SAN="IP:${PUBLIC_IP},${SAN}" +[ -n "$LOCAL_IP" ] && [ "$LOCAL_IP" != "$PUBLIC_IP" ] && SAN="${SAN},IP:${LOCAL_IP}" +echo "[first-boot-ca] cert SAN: ${SAN}" + +openssl req -x509 -nodes -newkey rsa:2048 -days 365 \ + -subj "/CN=nexus-gateway/O=Nexus Gateway" \ + -addext "subjectAltName=${SAN}" \ + -keyout "$NEXUS_DIR/tls.key" \ + -out "$NEXUS_DIR/tls.crt" 2>/dev/null + +chmod 0640 "$NEXUS_DIR/tls.crt" "$NEXUS_DIR/tls.key" +chown root:nexus "$NEXUS_DIR/tls.crt" "$NEXUS_DIR/tls.key" + + +# Install the nginx self-signed cert into the system 
CA trust store. Without +# this, Go's default HTTP client (used by the JWT verifier's JWKS fetcher in +# the control-plane) rejects the self-signed cert with x509 "unknown +# authority" — tokens are issued correctly at /oauth/token but cannot be +# verified at /api/admin/me, the SPA bounces back to /login on every login +# attempt. Hit on 2026-05-29 first-user-test of build #10. Acceptable: the +# anchor is per-instance and only ever signs this appliance's own hostname. +echo "[first-boot-ca] trusting self-signed nginx cert in the system CA bundle..." +install -o root -g root -m 0644 "$NEXUS_DIR/tls.crt" \ + /etc/pki/ca-trust/source/anchors/nexus-gateway.crt +update-ca-trust + +echo "[first-boot-ca] complete (proxy CA + nginx self-signed cert + system CA anchor)." diff --git a/nexus-ami/scripts/first-boot-db.sh b/nexus-ami/scripts/first-boot-db.sh new file mode 100755 index 00000000..21bd53c7 --- /dev/null +++ b/nexus-ami/scripts/first-boot-db.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# first-boot-db.sh — initialise PostgreSQL, materialise schema, seed baseline +# rows, randomise the admin password, and surface credentials to the operator. +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# Hashing matches tools/db-migrate/seed/lib.ts hashPassword(): scrypt N=16384, +# r=8, p=1, salt=32B, key=64B, format "salt_hex:hash_hex". + +set -euo pipefail + +# Put the bundled Node 20 on PATH so `npx` (whose shebang is +# `#!/usr/bin/env node`) can resolve `node`. Without this the script aborts +# at the first prisma call with `/usr/bin/env: 'node': No such file or +# directory` because systemd starts this unit with the system PATH that does +# not include /opt/nexus/node/bin. Hit on 2026-05-28 first-launch test of +# build #8. 
+export PATH=/opt/nexus/node/bin:$PATH + +PRISMA_DIR=/opt/nexus/prisma +ADMIN_CREDS=/var/log/nexus/admin-credentials.txt + +# Source the per-service env file written by first-boot-secrets.sh — the seed +# requires CREDENTIAL_ENCRYPTION_KEY (re-encrypts seeded credential rows) and +# ADMIN_KEY_HMAC_SECRET (re-hashes seeded VK lookup keys). Both live in +# control-plane.env which has the union of secrets for the two services that +# need them. +# shellcheck disable=SC1091 +. /etc/nexus/control-plane.env +export CREDENTIAL_ENCRYPTION_KEY ADMIN_KEY_HMAC_SECRET INTERNAL_SERVICE_TOKEN COMPLIANCE_PROXY_API_TOKEN + +DB_NAME=nexus_gateway +DB_USER=nexus +DB_PASSWORD=$(openssl rand -hex 24) +ADMIN_PASSWORD=$(openssl rand -base64 18 | tr -d '/+=' | cut -c1-20) +PGDATA=/var/lib/pgsql/data + +# ─── initdb on first launch (install.sh deferred this so harden.sh's wipe +# leaves a clean snapshot; see install-postgres.sh for the why). ──────── +if [ ! -f "$PGDATA/PG_VERSION" ]; then + echo "[first-boot-db] initialising PostgreSQL data directory..." + /usr/bin/postgresql-setup --initdb + + echo "[first-boot-db] enforcing localhost-only + scram-sha-256 auth..." + sed -i "s/^#listen_addresses.*/listen_addresses = '127.0.0.1'/" "$PGDATA/postgresql.conf" + sed -i "s/^listen_addresses.*/listen_addresses = '127.0.0.1'/" "$PGDATA/postgresql.conf" + sed -i "s/^#password_encryption.*/password_encryption = scram-sha-256/" "$PGDATA/postgresql.conf" + sed -i "s/^password_encryption.*/password_encryption = scram-sha-256/" "$PGDATA/postgresql.conf" + + cat > "$PGDATA/pg_hba.conf" <<'PGHBA' +# Nexus appliance — localhost-only, scram-sha-256 for nexus user, peer for postgres OS user. +local all postgres peer +local all all scram-sha-256 +host all all 127.0.0.1/32 scram-sha-256 +host all all ::1/128 scram-sha-256 +PGHBA + chown postgres:postgres "$PGDATA/pg_hba.conf" + chmod 0600 "$PGDATA/pg_hba.conf" +fi + +echo "[first-boot-db] starting PostgreSQL..." 
+systemctl start postgresql + +# Wait until accepting connections (postgresql-setup is async-ish on some AL2023 builds). +for i in 1 2 3 4 5 6 7 8 9 10; do + if sudo -u postgres pg_isready -q -h /var/run/postgresql; then + break + fi + echo "[first-boot-db] waiting for PostgreSQL... ($i/10)" + sleep 1 +done + +# If a previous DATABASE_URL was already stamped into an env file, reuse the +# password it encodes — the role already exists in PG with that password, and +# rotating it here would break that consistency. Otherwise this is the first +# run and we generate a fresh DB_PASSWORD above. +# +# `|| true` is load-bearing: on a fresh boot the env file exists (written by +# first-boot-secrets) but contains NO DATABASE_URL line yet, so grep returns 1. +# Under `set -euo pipefail` that fails the command substitution and `set -e` +# kills the whole script BEFORE we ever reach the role-creation block. Hit on +# 2026-05-28 first-launch test of build #8. +EXISTING_URL=$(grep -h '^DATABASE_URL=' /etc/nexus/control-plane.env 2>/dev/null | tail -1 | sed 's/^DATABASE_URL=//' || true) +if [ -n "$EXISTING_URL" ]; then + echo "[first-boot-db] reusing prior DATABASE_URL from /etc/nexus/control-plane.env (idempotent)." + DATABASE_URL="$EXISTING_URL" + DB_PASSWORD=$(echo "$DATABASE_URL" | sed -E "s|.*://$DB_USER:([^@]+)@.*|\1|") +fi + +echo "[first-boot-db] ensuring role and database exist (idempotent)..." +# SUPERUSER is required because seed/data/seed-baseline.sql is a pg_dump that +# uses `ALTER TABLE ... DISABLE TRIGGER ALL` to load FK-related rows out of +# topological order. 
Postgres only lets SUPERUSER touch the system-generated +# RI_ConstraintTrigger_* triggers — without it the seed aborts with +# permission denied: "RI_ConstraintTrigger_a_NNNN" is a system trigger +# Acceptable for this appliance: Postgres binds 127.0.0.1 only (see the +# listen_addresses tweak above) and pg_hba.conf forces scram-sha-256, so the +# attack surface is local processes only — same boundary as the rest of the +# appliance. Hit on 2026-05-28 first-launch test of build #8. +sudo -u postgres psql -v ON_ERROR_STOP=1 <> "$envfile" +done + +echo "[first-boot-db] materialising schema via prisma db push..." +cd "$PRISMA_DIR" +# --skip-generate was removed in newer Prisma CLI; client generation is now a +# separate explicit call below. Hit on 2026-05-28 first-launch test of build #8: +# "! unknown or unexpected option: --skip-generate". --accept-data-loss alone is +# enough — on a fresh DB there is no data to lose, but Prisma requires the flag +# to push without an interactive y/n prompt. +DATABASE_URL="$DATABASE_URL" /opt/nexus/node/bin/npx prisma db push --accept-data-loss + +echo "[first-boot-db] generating Prisma client (required by seed)..." +DATABASE_URL="$DATABASE_URL" /opt/nexus/node/bin/npx prisma generate + +echo "[first-boot-db] loading baseline seed (organisations, IAM, roles)..." +DATABASE_URL="$DATABASE_URL" /opt/nexus/node/bin/npx tsx seed/seed.ts + +echo "[first-boot-db] randomising admin@nexus.ai password..." +NEW_ADMIN_HASH=$(NEW_PASSWORD="$ADMIN_PASSWORD" /opt/nexus/node/bin/node "$PRISMA_DIR/set-admin-password.js") +DATABASE_URL="$DATABASE_URL" /opt/nexus/node/bin/npx prisma db execute --stdin < "$ADMIN_CREDS" </ +Username: admin@nexus.ai +Password: $ADMIN_PASSWORD + +IMPORTANT +--------- +1. This file is mode 0640, root:nexus — root or members of the 'nexus' group + can read it. Remove this file once you have changed the admin password + from the UI: sudo rm $ADMIN_CREDS +2. The TLS certificate at /etc/nexus/tls.crt is SELF-SIGNED. 
Replace it with + a cert signed for your hostname before exposing the appliance publicly, + then run: sudo systemctl reload nginx +3. The Compliance Proxy MITM CA at /etc/compliance-proxy/ca.crt must be + distributed to every device that egresses through the proxy on port 3128. +4. Demo accounts (alice@/bob@/carol@/diana@nexus.ai) ship with documented + dev passwords — disable them from the UI before opening this instance + to external traffic. + +For full operator documentation see: + https://github.com/AlphaBitCore/nexus-gateway/blob/main/docs/operators/ +================================================================================ +EOF +chmod 0640 "$ADMIN_CREDS" +chown root:nexus "$ADMIN_CREDS" + +cat > /etc/motd < "$CONFIG_DIR/nexus-hub.env" < "$CONFIG_DIR/control-plane.env" < "$CONFIG_DIR/ai-gateway.env" < "$CONFIG_DIR/compliance-proxy.env" </dev/null || true) +if [ -n "$TOKEN" ]; then + IP=$(curl -fsS -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/public-ipv4 -m 3 2>/dev/null || true) + [ -z "$IP" ] && IP=$(curl -fsS -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/local-ipv4 -m 3 2>/dev/null || true) +fi +[ -z "$IP" ] && IP=$(hostname -I 2>/dev/null | awk '{print $1}') +[ -z "$IP" ] && IP=127.0.0.1 +echo "[nexus-first-boot] publicURL host = $IP" + +stamp_public_url() { + local yaml="$1"; local url="$2" + if grep -q '^publicURL:' "$yaml"; then + echo "[nexus-first-boot] $yaml already has publicURL; skipping." 
+ else + sed -i "1i publicURL: \"$url\"" "$yaml" + echo "[nexus-first-boot] $yaml <- publicURL=$url" + fi +} +stamp_public_url /etc/nexus/nexus-hub.config.yaml "http://${IP}:3060" +stamp_public_url /etc/nexus/control-plane.config.yaml "https://${IP}/" +stamp_public_url /etc/nexus/ai-gateway.config.yaml "https://${IP}/v1" +stamp_public_url /etc/nexus/compliance-proxy.config.yaml "http://${IP}:3128" + +# Stamp AUTH_SERVER_ISSUER into control-plane.env (env override fills the +# yaml's empty authServer.issuer placeholder). Must match the publicURL the +# CP advertises so JWT iss-claim validation + JWKS fetch line up. Idempotent: +# appended only when no AUTH_SERVER_ISSUER line exists; never overwritten. +if ! grep -q '^AUTH_SERVER_ISSUER=' /etc/nexus/control-plane.env; then + echo "AUTH_SERVER_ISSUER=https://${IP}/" >> /etc/nexus/control-plane.env + echo "[nexus-first-boot] /etc/nexus/control-plane.env <- AUTH_SERVER_ISSUER=https://${IP}/" +fi + +"$SCRIPT_DIR/nexus-first-boot-db" + +# Register this instance's redirect URI on the cp-ui OAuth client. The seed +# ships with localhost / cp.nexus.ai defaults; without this update an admin +# launching the appliance and clicking "Login" gets a 400 invalid_request +# from /oauth/authorize because the per-instance redirect_uri is not in the +# OAuthClient.redirectUris array. Idempotent — array_append fires only if +# missing. Runs as the postgres OS user (peer auth in pg_hba.conf). Hit on +# 2026-05-29 first-user-test of build #10. +echo "[nexus-first-boot] registering cp-ui redirect_uri for this instance..." +sudo -u postgres psql -d nexus_gateway -v ON_ERROR_STOP=1 </dev/null || true +systemctl start --no-block nexus-hub nexus-control-plane nexus-gateway nexus-proxy nginx + +echo "[nexus-first-boot] initialization complete." diff --git a/nexus-ami/scripts/harden.sh b/nexus-ami/scripts/harden.sh new file mode 100755 index 00000000..5d3bd82e --- /dev/null +++ b/nexus-ami/scripts/harden.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# harden.sh — final cleanup before AMI snapshot.
MUST run as the LAST Packer +# provisioner. AWS Marketplace rejects the AMI if any of this is left in. +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md §7 + +set -euo pipefail + +echo "==> [harden] removing SSH authorized_keys (recursive)..." +find / -name 'authorized_keys' -type f -delete 2>/dev/null || true + +echo "==> [harden] removing SSH host keys (regenerated on first boot)..." +find /etc/ssh -name 'ssh_host_*' -type f -delete 2>/dev/null || true + +echo "==> [harden] enforcing strict sshd config..." +sed -i 's/^#*PermitRootLogin.*/PermitRootLogin no/' /etc/ssh/sshd_config +sed -i 's/^#*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config +sed -i 's/^#*PermitEmptyPasswords.*/PermitEmptyPasswords no/' /etc/ssh/sshd_config + +echo "==> [harden] locking the root password..." +passwd -l root || true + +echo "==> [harden] clearing shell history..." +find /root /home -name '.bash_history' -type f -delete 2>/dev/null || true +find /root /home -name '.zsh_history' -type f -delete 2>/dev/null || true +unset HISTFILE || true +history -c 2>/dev/null || true + +echo "==> [harden] truncating logs..." +find /var/log -type f -exec truncate -s 0 {} \; 2>/dev/null || true +journalctl --rotate 2>/dev/null || true +journalctl --vacuum-time=1s 2>/dev/null || true + +echo "==> [harden] resetting machine-id (regenerated on first boot)..." +truncate -s 0 /etc/machine-id +# /var/lib/dbus/machine-id is the legacy compatibility symlink for systems +# that ship dbus (Fedora desktop, RHEL with dbus). AL2023 minimal AMI does +# NOT install dbus by default — /var/lib/dbus/ does not exist (verified +# 2026-05-28 build, `ln -sf` failed). Skip the symlink when dbus isn't +# around; systemd alone reads /etc/machine-id directly and regenerates it +# on first boot. 
+if [ -d /var/lib/dbus ]; then + rm -f /var/lib/dbus/machine-id + ln -sf /etc/machine-id /var/lib/dbus/machine-id +fi + +echo "==> [harden] cleaning cloud-init state..." +cloud-init clean --logs 2>/dev/null || true + +echo "==> [harden] clearing DHCP leases and MAC-bound network rules..." +rm -rf /var/lib/dhclient/* /var/lib/dhcp/* 2>/dev/null || true +rm -f /etc/udev/rules.d/70-persistent-net.rules + +echo "==> [harden] clearing sudo password caches..." +rm -rf /var/db/sudo/* 2>/dev/null || true + +echo "==> [harden] clearing package manager caches..." +dnf clean all +rm -rf /var/cache/dnf/* /var/cache/yum/* 2>/dev/null || true + +echo "==> [harden] clearing /tmp, /var/tmp, and any leftover Nexus staging..." +rm -rf /tmp/nexus 2>/dev/null || true +find /tmp -mindepth 1 -delete 2>/dev/null || true +find /var/tmp -mindepth 1 -delete 2>/dev/null || true + +echo "==> [harden] clearing per-stateful service data accumulated during install..." +# Each of these is regenerated on first-boot or by the service itself; leaving +# install-time content baked into the AMI is a leak / non-determinism source. +rm -rf /var/lib/pgsql/data/* /var/lib/valkey/* /var/lib/nats/* 2>/dev/null || true +rm -f /etc/nexus/.initialized 2>/dev/null || true +rm -f /var/log/nexus/admin-credentials.txt 2>/dev/null || true + +echo "==> [harden] zeroing free space (shrinks EBS snapshot)..." +dd if=/dev/zero of=/zerofile bs=1M 2>/dev/null || true +rm -f /zerofile +sync + +echo "==> [harden] Nexus AMI hardening complete." diff --git a/nexus-ami/scripts/install-nats.sh b/nexus-ami/scripts/install-nats.sh new file mode 100755 index 00000000..5605f424 --- /dev/null +++ b/nexus-ami/scripts/install-nats.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# install-nats.sh — install NATS Server 2.x (JetStream enabled) from the +# official release binary on AL2023. 
+# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md + +set -euo pipefail + +NATS_VERSION=2.10.20 +ARCH=$(uname -m) +case "$ARCH" in + x86_64) NATS_ARCH=amd64 ;; + aarch64) NATS_ARCH=arm64 ;; + *) echo "ERROR: unsupported arch $ARCH" >&2; exit 1 ;; +esac + +TARBALL="nats-server-v$NATS_VERSION-linux-$NATS_ARCH.tar.gz" +URL="https://github.com/nats-io/nats-server/releases/download/v$NATS_VERSION/$TARBALL" + +echo "==> [install-nats] downloading $URL..." +cd /tmp +curl -fsSL "$URL" -o "$TARBALL" +tar xzf "$TARBALL" +install -m 0755 "nats-server-v$NATS_VERSION-linux-$NATS_ARCH/nats-server" /usr/local/bin/nats-server +rm -rf "$TARBALL" "nats-server-v$NATS_VERSION-linux-$NATS_ARCH" + +echo "==> [install-nats] creating nats user + dirs..." +if ! id -u nats >/dev/null 2>&1; then + useradd --system --no-create-home --shell /sbin/nologin --user-group nats +fi +install -d -o nats -g nats -m 0750 /var/lib/nats /var/log/nats +install -d -o root -g root -m 0755 /etc/nats + +cat > /etc/nats/nats-server.conf <<'EOF' +# Nexus appliance — NATS Server with JetStream (localhost-only). +listen: "127.0.0.1:4222" +http: "127.0.0.1:8222" + +server_name: "nexus-appliance" + +jetstream { + store_dir: "/var/lib/nats" + max_memory_store: 1GB + max_file_store: 32GB +} + +log_file: "/var/log/nats/nats-server.log" +logtime: true +debug: false +trace: false + +# No external clustering for the appliance form factor; the Hub is the only +# JetStream client and runs on the same host. +EOF +chmod 0644 /etc/nats/nats-server.conf + +echo "==> [install-nats] complete (NATS $NATS_VERSION)." 
diff --git a/nexus-ami/scripts/install-node-prisma.sh b/nexus-ami/scripts/install-node-prisma.sh new file mode 100755 index 00000000..1889b4ec --- /dev/null +++ b/nexus-ami/scripts/install-node-prisma.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# install-node-prisma.sh — install a self-contained Node.js 20 runtime under +# /opt/nexus/node and run `npm install` inside /opt/nexus/prisma so the +# first-boot Prisma client / seed / tsx commands work offline. +# +# Why self-contained? +# - AL2023's dnf node is older + slower-moving; pinning a specific Node 20 +# binary keeps the AMI reproducible across Marketplace rebuilds. +# - Only the first-boot path uses Node; nothing else on the appliance needs +# it, so installing into /opt/nexus/node keeps it out of the system PATH. +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md + +set -euo pipefail + +# NODE_VERSION must satisfy Prisma's engines.node constraint. Prisma 7.8.0 +# requires "^20.19 || ^22.12 || >=24.0"; chokidar@5 + readdirp@5 (transitive +# deps) also require ">=20.19.0". Hard-pinned 20.18.1 produced an npm +# EBADENGINE fatal at AMI build time — verified 2026-05-28. Stay within +# 20.x LTS line ("Iron") to keep the runtime delta minimal across rebuilds. +NODE_VERSION=20.19.0 +ARCH=$(uname -m) +case "$ARCH" in + x86_64) NODE_ARCH=x64 ;; + aarch64) NODE_ARCH=arm64 ;; + *) echo "ERROR: unsupported arch $ARCH" >&2; exit 1 ;; +esac + +NODE_DIR=/opt/nexus/node +PRISMA_DIR=/opt/nexus/prisma + +TARBALL="node-v$NODE_VERSION-linux-$NODE_ARCH.tar.xz" +URL="https://nodejs.org/dist/v$NODE_VERSION/$TARBALL" + +echo "==> [install-node-prisma] downloading Node.js $NODE_VERSION..." 
+cd /tmp +curl -fsSL "$URL" -o "$TARBALL" +mkdir -p "$NODE_DIR" +tar xJf "$TARBALL" -C "$NODE_DIR" --strip-components=1 +rm -f "$TARBALL" + +export PATH="$NODE_DIR/bin:$PATH" + +echo "==> [install-node-prisma] node $(node --version) | npm $(npm --version) installed at $NODE_DIR" + +echo "==> [install-node-prisma] running npm install in $PRISMA_DIR..." +cd "$PRISMA_DIR" +"$NODE_DIR/bin/npm" install --omit=dev --no-audit --no-fund + +# Install tsx + typescript globally so first-boot-db.sh can call them +# regardless of devDependencies. +"$NODE_DIR/bin/npm" install -g --no-audit --no-fund tsx typescript + +echo "==> [install-node-prisma] complete." diff --git a/nexus-ami/scripts/install-postgres.sh b/nexus-ami/scripts/install-postgres.sh new file mode 100755 index 00000000..e80da0e1 --- /dev/null +++ b/nexus-ami/scripts/install-postgres.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# install-postgres.sh — install PostgreSQL 16 from AL2023's dnf and enable the +# service. initdb is NOT run here; first-boot-db.sh initialises the cluster. +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md + +set -euo pipefail + +echo "==> [install-postgres] installing postgresql16-server..." +dnf install -y postgresql16-server postgresql16-contrib + +echo "==> [install-postgres] enabling postgresql.service..." +systemctl enable postgresql + +# IMPORTANT: postgres `initdb` is NOT run here. It happens at first-boot +# (see first-boot-db.sh). Reason: harden.sh wipes /var/lib/pgsql/data/* +# before the AMI snapshot — if we initdb'd at build time those files would +# be removed and postgresql.service would refuse to start on the launched +# instance with "data directory not initialized". Deferring initdb to +# first-boot avoids that whole class of bug AND keeps every launched +# instance's cluster identifier unique. + +echo "==> [install-postgres] complete (initdb deferred to first-boot)."
diff --git a/nexus-ami/scripts/install-valkey.sh b/nexus-ami/scripts/install-valkey.sh new file mode 100755 index 00000000..ed5cc7ca --- /dev/null +++ b/nexus-ami/scripts/install-valkey.sh @@ -0,0 +1,215 @@ +#!/bin/bash +# install-valkey.sh — build Valkey 9 + valkey-search from source on AL2023. +# AL2023 dnf has no valkey package (checked 2026-05); the official Valkey +# project ships valkey/valkey-bundle Docker images but no rpm. Source compile +# is the cleanest path for a baked AMI (no docker-in-AMI smell). +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md +# Wire-compatible with Redis 7 — every go-redis/v9 client in Nexus works +# unchanged against it. + +set -euo pipefail + +# valkey-search 1.2.0 hard-requires Valkey >= 9.0.1 at module-init time. Hit +# on 2026-05-28 first-launch test: a Valkey 8.1.2 server with libsearch.so +# loadmodule logged +# Minimum required server version is 9.0.1, Current version is 8.1.2 +# Module /usr/lib/valkey/libsearch.so initialization failed. +# and aborted boot. Bumping to 9.0.4 (the latest in the 9.0.x line) restores +# compatibility while staying on the 9.0 series (avoiding 9.1.x feature drift). +VALKEY_VERSION=9.0.4 +# valkey-search GitHub tags do NOT use a `v` prefix (e.g. `1.2.0`, not +# `v1.2.0`). Verified via GitHub API 2026-05-28; 1.2.0 is the latest stable +# release. Bumping requires re-checking the API: +# curl -fsSL https://api.github.com/repos/valkey-io/valkey-search/tags +VALKEY_SEARCH_VERSION=1.2.0 +BUILD_DIR=/tmp/valkey-build +INSTALL_PREFIX=/usr/local + +echo "==> [install-valkey] installing build dependencies..." +# valkey-search 1.x switched to ninja + cmake + submodules (gRPC, Protobuf, +# Abseil bundled as submodules); cmake hard-checks the C++ compiler is +# either GCC ≥ 12 or Clang ≥ 16. AL2023's default gcc is 11.5.0 and default +# clang is 15.0.7 — both too old.
AL2023 dnf ships versioned clang packages +# (confirmed 2026-05-28 dnf list available 'clang*'): clang18 / clang19 / +# clang20 are present; clang17 / clang16 are NOT in the repo. We pick the +# newest available. + +dnf install -y \ + gcc \ + gcc-c++ \ + make \ + cmake \ + ninja-build \ + git \ + openssl-devel \ + systemd-devel \ + pkgconf + +echo "==> [install-valkey] selecting a Clang ≥ 16 ..." +# Each clang${ver} package installs /usr/bin/clang-${ver} + /usr/bin/clang++-${ver} +# following the standard LLVM versioned-binary convention. We try newest-first +# and stop at the first one whose binary lands on PATH. 17/16 stay in the list +# in case a future repo refresh adds them back; current AL2023 (2026-05) skips +# straight from 15 → 18. +CLANG_BIN="" +CLANGXX_BIN="" +# Also install lld${ver} alongside clang${ver}. valkey-search's CMake compiles +# with -flto; linking libsearch.so requires LTO bitcode handling. GNU ld +# delegates LTO to LLVMgold.so which AL2023's `clang${ver}` package does NOT +# ship — verified 2026-05-28 build: link of libsearch.so failed with +# "cannot open /usr/lib64/llvm20/lib64/LLVMgold.so". lld is LLVM's native +# linker; it handles LTO bitcode directly, no plugin needed. +for ver in 20 19 18 17 16; do + if dnf install -y "clang${ver}" "lld${ver}" 2>/dev/null && command -v "clang-${ver}" >/dev/null 2>&1; then + CLANG_BIN="clang-${ver}" + CLANGXX_BIN="clang++-${ver}" + # `-fuse-ld=lld` only finds `ld.lld` (unversioned) on PATH. AL2023's + # lld${ver} package installs versioned binaries (ld.lld-${ver}); add a + # symlink if the unversioned name is missing. + if ! command -v ld.lld >/dev/null 2>&1; then + for candidate in "/usr/bin/ld.lld-${ver}" "/usr/bin/lld-${ver}"; do + if [ -x "$candidate" ]; then + ln -sf "$candidate" /usr/bin/ld.lld + break + fi + done + fi + command -v ld.lld >/dev/null 2>&1 || { + echo "ERROR: ld.lld not in PATH after installing lld${ver}; cannot use lld for LTO link." 
>&2 + ls -la /usr/bin/ld.lld* /usr/bin/lld* 2>&1 | head -10 >&2 + exit 1 + } + echo "==> [install-valkey] using clang-${ver} + ld.lld (LTO via lld, not LLVMgold plugin)" + break + fi +done +if [ -z "$CLANG_BIN" ]; then + echo "ERROR: AL2023 dnf has no clang ≥ 16 (tried clang20 / 19 / 18 / 17 / 16); valkey-search 1.x requires ≥ 16." >&2 + echo "Available clang/lld packages:" >&2 + dnf list available 'clang*' 'lld*' 2>&1 | head -30 >&2 + exit 1 +fi + +echo "==> [install-valkey] verifying compiler versions..." +gcc --version | head -1 +"$CLANG_BIN" --version | head -1 + +# ─── Valkey core ──────────────────────────────────────────────────────────── + +echo "==> [install-valkey] downloading Valkey $VALKEY_VERSION source..." +mkdir -p "$BUILD_DIR" && cd "$BUILD_DIR" +curl -fsSL "https://github.com/valkey-io/valkey/archive/refs/tags/$VALKEY_VERSION.tar.gz" \ + -o "valkey-$VALKEY_VERSION.tar.gz" +tar xzf "valkey-$VALKEY_VERSION.tar.gz" +cd "valkey-$VALKEY_VERSION" + +echo "==> [install-valkey] building Valkey ($(nproc) parallel jobs)..." +make -j"$(nproc)" USE_SYSTEMD=yes BUILD_TLS=yes +make install PREFIX="$INSTALL_PREFIX" + +# ─── valkey-search module ─────────────────────────────────────────────────── +# semantic cache module — packages/ai-gateway/internal/cache/semantic/ requires +# the FT.CREATE / FT.SEARCH commands this module provides. +# +# valkey-search 1.x uses ninja + cmake + git submodules (gRPC, Protobuf, +# Abseil all vendored). We use the project's canonical `build.sh` rather +# than calling cmake directly — the build script knows where the submodules +# live and how to wire them. --recurse-submodules at clone time pulls all +# vendored deps in one shot. + +echo "==> [install-valkey] cloning valkey-search $VALKEY_SEARCH_VERSION (with submodules)..." 
+cd "$BUILD_DIR" +git clone --recurse-submodules --depth 1 --shallow-submodules \ + --branch "$VALKEY_SEARCH_VERSION" \ + https://github.com/valkey-io/valkey-search.git +cd valkey-search + +# ─── Patch: Linux x86_64 + clang duplicate-overload in type_conversions.h ─── +# vmsdk/src/type_conversions.h has: +# template <> inline absl::StatusOr To(absl::string_view); +# #if defined(__clang__) && !defined(RunningClangd) +# template <> inline absl::StatusOr To(absl::string_view); +# #endif +# On Linux x86_64 (LP64) `uint64_t === unsigned long`, so the guarded overload +# collides with the unguarded one and clang ≥ 18 emits a hard redefinition +# error in vmsdklib — verified 2026-05-28 against tags 1.2.0 AND main. The +# overload only matters on platforms where the two types differ (macOS arm64/ +# x86_64: uint64_t = unsigned long long ≠ unsigned long). Fix: tighten the +# guard to also require !defined(__linux__). No-op on macOS; eliminates the +# redefinition on Linux. Idempotent: re-running the sed is harmless. +echo "==> [install-valkey] patching type_conversions.h for Linux+clang duplicate template..." +sed -i 's|^#if defined(__clang__) && !defined(RunningClangd)$|#if defined(__clang__) \&\& !defined(RunningClangd) \&\& !defined(__linux__)|' vmsdk/src/type_conversions.h +grep -q '!defined(__linux__)' vmsdk/src/type_conversions.h || { + echo "ERROR: type_conversions.h patch did not apply — upstream source layout changed" >&2 + echo "Expected line to patch: #if defined(__clang__) && !defined(RunningClangd)" >&2 + echo "Current grep for the guard line:" >&2 + grep -n 'defined(__clang__)' vmsdk/src/type_conversions.h >&2 || true + exit 1 +} + +# Force the build to use Clang (≥ 16 on AL2023) instead of the default +# gcc-11.5.0 (which valkey-search 1.x cmake rejects: "Minimum GCC required +# is 12 and later"). +# +# Cap parallelism at 4 regardless of available cores. 
Each gRPC/Protobuf/ICU +# compile worker can hold 1.5–2 GB resident; running all 8 cores parallel +# on t3.2xlarge would push 16+ GB and risk OOM-killer even on 32 GB hosts. +# --jobs=4 is the sweet spot: comfortable 16–24 GB working set, no OOM. +echo "==> [install-valkey] building valkey-search with ${CLANG_BIN} (jobs=4, linker=lld)..." +export CC="$CLANG_BIN" +export CXX="$CLANGXX_BIN" +# Force lld as the linker for every link step. CMake propagates LDFLAGS into +# CMAKE_{EXE,SHARED,MODULE}_LINKER_FLAGS at configure time, so this reaches +# the libsearch.so shared-library link where the LTO bitcode lives. Required +# because AL2023's clang20 ships without LLVMgold.so — see comment above the +# clang/lld install loop. +export LDFLAGS="${LDFLAGS:-} -fuse-ld=lld" +./build.sh --jobs=4 + +echo "==> [install-valkey] locating built libsearch.so..." +SO_PATH=$(find . -name 'libsearch.so' -type f 2>/dev/null | head -1) +if [ -z "$SO_PATH" ]; then + echo "ERROR: libsearch.so not found after build" >&2 + find . -name '*.so' -type f 2>/dev/null | head -20 >&2 + exit 1 +fi +echo "==> [install-valkey] found: $SO_PATH" + +install -d -m 0755 /usr/lib/valkey +# 0755 (not 0644) — Valkey's module loader explicitly checks the execute bit +# before dlopen as a safety guard, and refuses to load with "It does not have +# execute permissions." Hit on 2026-05-28 — the AMI booted with valkey in a +# restart loop. Standard convention for shared libs on Linux is 0755. +install -m 0755 "$SO_PATH" /usr/lib/valkey/libsearch.so + +# ─── User + directories + config ──────────────────────────────────────────── + +echo "==> [install-valkey] creating valkey user + dirs..." +if ! 
id -u valkey >/dev/null 2>&1; then + useradd --system --no-create-home --shell /sbin/nologin --user-group valkey +fi +install -d -o valkey -g valkey -m 0750 /var/lib/valkey /var/log/valkey /var/run/valkey +install -d -o root -g root -m 0755 /etc/valkey + +cat > /etc/valkey/valkey.conf <<'EOF' +# Nexus appliance — Valkey config (localhost-only). +bind 127.0.0.1 +port 6379 +protected-mode yes +supervised systemd +loglevel notice +logfile /var/log/valkey/valkey.log +dir /var/lib/valkey +appendonly yes +appendfsync everysec +maxmemory-policy allkeys-lru +loadmodule /usr/lib/valkey/libsearch.so +EOF +chmod 0644 /etc/valkey/valkey.conf + +# ─── Cleanup ──────────────────────────────────────────────────────────────── + +rm -rf "$BUILD_DIR" + +echo "==> [install-valkey] complete (Valkey $VALKEY_VERSION + valkey-search $VALKEY_SEARCH_VERSION)." diff --git a/nexus-ami/scripts/install.sh b/nexus-ami/scripts/install.sh new file mode 100755 index 00000000..a4c785c6 --- /dev/null +++ b/nexus-ami/scripts/install.sh @@ -0,0 +1,184 @@ +#!/bin/bash +# install.sh — orchestrator. Runs ONCE during Packer build (NOT per-instance). +# Assumes: Amazon Linux 2023 base, artifacts/ staged at /tmp/nexus/ by Packer +# file provisioner. +# +# Architecture: docs/developers/architecture/cross-cutting/deployment/ami-appliance-architecture.md + +set -euo pipefail + +NEXUS_USER=nexus +NEXUS_GROUP=nexus +INSTALL_DIR=/opt/nexus +BIN_DIR=$INSTALL_DIR/bin +UI_DIR=$INSTALL_DIR/ui +PRISMA_DIR=$INSTALL_DIR/prisma +NODE_DIR=$INSTALL_DIR/node +CONFIG_DIR=/etc/nexus +LOG_DIR=/var/log/nexus +DATA_DIR=/var/lib/nexus +STAGING_DIR=/tmp/nexus +SCRIPT_DIR=/usr/local/sbin +TARBALL=/tmp/nexus-artifacts.tar.gz + +# ─── 0. Extract artifacts tarball uploaded by Packer file provisioner ────── +# Packer uploads a single artifacts.tar.gz to /tmp/nexus-artifacts.tar.gz +# (atomic transfer — avoids the recursive-SCP partial-upload bug we hit +# when source was directory-shape on slow links). 
We extract it under +# /tmp/nexus/ so the rest of this script can reference $STAGING_DIR/bin/ +# etc. exactly as if Packer had uploaded the directory directly. + +echo "==> [install] extracting $TARBALL -> $STAGING_DIR ..." +if [ ! -f "$TARBALL" ]; then + echo "ERROR: tarball not found at $TARBALL — Packer file provisioner did not deliver it" >&2 + exit 1 +fi +mkdir -p "$STAGING_DIR" +tar -C "$STAGING_DIR" -xzf "$TARBALL" +rm -f "$TARBALL" +echo "==> [install] extracted artifacts ($(du -sh "$STAGING_DIR" | awk '{print $1}'))" + +# ─── 1. Update base OS + install base packages ────────────────────────────── + +echo "==> [install] dnf update -y (required for Marketplace scan-clean)..." +dnf update -y +# Only firewalld + nginx need installing — openssl, ca-certificates, jq, tar, +# gzip, rsync, procps-ng, curl-minimal all ship preinstalled in AL2023. We +# explicitly do NOT install the full `curl` package because it conflicts with +# the pre-installed curl-minimal (and curl-minimal already provides the curl +# CLI features the install/first-boot scripts need: -f / -s / -S / -L / +# --connect-timeout / etc.). +dnf install -y \ + firewalld \ + nginx + +# ─── 2. Create system user ────────────────────────────────────────────────── + +echo "==> [install] creating nexus system user..." +if ! id -u "$NEXUS_USER" >/dev/null 2>&1; then + useradd --system --no-create-home --shell /sbin/nologin --user-group "$NEXUS_USER" +fi + +# ─── 3. Create directory structure ────────────────────────────────────────── + +echo "==> [install] creating directory structure..." +install -d -o root -g root -m 0755 "$INSTALL_DIR" "$BIN_DIR" "$UI_DIR" "$PRISMA_DIR" "$NODE_DIR" +install -d -o root -g "$NEXUS_GROUP" -m 0750 "$CONFIG_DIR" /etc/compliance-proxy +install -d -o "$NEXUS_USER" -g "$NEXUS_GROUP" -m 0750 "$LOG_DIR" "$DATA_DIR" \ + "$DATA_DIR/agentca" \ + "$DATA_DIR/audit-spool" \ + "$DATA_DIR/alerting" + +# ─── 4. 
Install Nexus Go binaries ─────────────────────────────────────────── + +echo "==> [install] installing Nexus Go binaries..." +for binary in nexus-hub control-plane ai-gateway compliance-proxy; do + if [ ! -f "$STAGING_DIR/bin/$binary" ]; then + echo "ERROR: missing binary $STAGING_DIR/bin/$binary" >&2 + exit 1 + fi + install -o root -g root -m 0755 "$STAGING_DIR/bin/$binary" "$BIN_DIR/$binary" +done + +# ─── 5. Install UI static assets ──────────────────────────────────────────── + +echo "==> [install] installing UI static assets..." +if [ ! -d "$STAGING_DIR/ui-dist" ]; then + echo "ERROR: missing UI dist at $STAGING_DIR/ui-dist" >&2 + exit 1 +fi +rsync -a --delete "$STAGING_DIR/ui-dist/" "$UI_DIR/" +chown -R root:root "$UI_DIR" + +# ─── 6. Install Prisma schema + seed + admin-password helper ──────────────── + +echo "==> [install] installing Prisma schema + seed..." +if [ ! -d "$STAGING_DIR/prisma" ]; then + echo "ERROR: missing prisma bundle at $STAGING_DIR/prisma" >&2 + exit 1 +fi +rsync -a --delete "$STAGING_DIR/prisma/" "$PRISMA_DIR/" +install -o root -g root -m 0755 "$STAGING_DIR/scripts/set-admin-password.js" "$PRISMA_DIR/set-admin-password.js" +chown -R root:root "$PRISMA_DIR" + +# ─── 7. Install service configs ───────────────────────────────────────────── + +echo "==> [install] installing prod-shape config files..." +for svc in nexus-hub control-plane ai-gateway compliance-proxy; do + install -o root -g "$NEXUS_GROUP" -m 0640 \ + "$STAGING_DIR/configs/$svc.config.yaml" "$CONFIG_DIR/$svc.config.yaml" +done +install -o root -g root -m 0644 "$STAGING_DIR/configs/nginx-nexus.conf" /etc/nginx/conf.d/nexus.conf +rm -f /etc/nginx/conf.d/default.conf + +# ─── 8. Install systemd units ─────────────────────────────────────────────── + +echo "==> [install] installing systemd units..." +install -o root -g root -m 0644 "$STAGING_DIR/systemd/"*.service /etc/systemd/system/ + +# ─── 9. 
Install first-boot helpers under /usr/local/sbin ──────────────────── + +echo "==> [install] installing first-boot scripts..." +install -o root -g root -m 0755 "$STAGING_DIR/scripts/first-boot.sh" "$SCRIPT_DIR/nexus-first-boot" +install -o root -g root -m 0755 "$STAGING_DIR/scripts/first-boot-secrets.sh" "$SCRIPT_DIR/nexus-first-boot-secrets" +install -o root -g root -m 0755 "$STAGING_DIR/scripts/first-boot-ca.sh" "$SCRIPT_DIR/nexus-first-boot-ca" +install -o root -g root -m 0755 "$STAGING_DIR/scripts/first-boot-db.sh" "$SCRIPT_DIR/nexus-first-boot-db" + +# ─── 10. Install runtime dependencies (Postgres / Valkey / NATS / Node) ───── + +bash "$STAGING_DIR/scripts/install-postgres.sh" +bash "$STAGING_DIR/scripts/install-valkey.sh" +bash "$STAGING_DIR/scripts/install-nats.sh" +bash "$STAGING_DIR/scripts/install-node-prisma.sh" + +# ─── 11. Configure firewall ───────────────────────────────────────────────── + +echo "==> [install] configuring firewalld..." +systemctl enable firewalld +systemctl start firewalld +firewall-cmd --permanent --add-service=ssh +firewall-cmd --permanent --add-port=443/tcp # nginx (UI + /api/*) +firewall-cmd --permanent --add-port=80/tcp # nginx (HTTP redirect to 443) +firewall-cmd --permanent --add-port=3050/tcp # AI Gateway (SDK direct) +firewall-cmd --permanent --add-port=3128/tcp # Compliance Proxy CONNECT +firewall-cmd --reload + +# ─── 12. Enable services to start at boot ─────────────────────────────────── + +echo "==> [install] enabling services..." +systemctl daemon-reload +systemctl enable nginx +systemctl enable postgresql +systemctl enable valkey +systemctl enable nats +systemctl enable nexus-first-boot.service +systemctl enable nexus-hub.service +systemctl enable nexus-control-plane.service +systemctl enable nexus-gateway.service +systemctl enable nexus-proxy.service + +# ─── 13. Configure logrotate for Nexus log dir ────────────────────────────── + +echo "==> [install] writing logrotate config..." 
+cat > /etc/logrotate.d/nexus <<'EOF' +/var/log/nexus/*.log { + daily + rotate 14 + compress + delaycompress + missingok + notifempty + create 0640 nexus nexus + sharedscripts + postrotate + systemctl reload-or-restart nexus-hub.service nexus-control-plane.service \ + nexus-gateway.service nexus-proxy.service \ + > /dev/null 2>&1 || true + endscript +} +EOF + +echo "==> [install] cleaning staging directory..." +rm -rf "$STAGING_DIR" + +echo "==> [install] install.sh complete." diff --git a/nexus-ami/scripts/set-admin-password.js b/nexus-ami/scripts/set-admin-password.js new file mode 100644 index 00000000..8cdf75f6 --- /dev/null +++ b/nexus-ami/scripts/set-admin-password.js @@ -0,0 +1,30 @@ +// set-admin-password.js — generate a scrypt password hash compatible with the +// NexusUser.passwordHash column. Reads NEW_PASSWORD from env, prints the hash +// to stdout in "salt_hex:hash_hex" format. +// +// Parameters MUST match tools/db-migrate/seed/lib.ts hashPassword(): +// N = 16384, r = 8, p = 1 +// salt = 32 bytes (random) +// key = 64 bytes +// +// Used by first-boot-db.sh to replace the seeded admin@nexus.ai password +// with a per-instance random one. This file is shipped to /opt/nexus/prisma/ +// alongside the schema and seed code. + +'use strict'; + +const { scryptSync, randomBytes } = require('crypto'); + +const SALT_LENGTH = 32; +const KEY_LENGTH = 64; +const SCRYPT_OPTIONS = { N: 16384, r: 8, p: 1 }; + +const password = process.env.NEW_PASSWORD; +if (!password || password.length < 8) { + process.stderr.write('set-admin-password: NEW_PASSWORD env must be set and >= 8 chars\n'); + process.exit(1); +} + +const salt = randomBytes(SALT_LENGTH); +const hash = scryptSync(password, salt, KEY_LENGTH, SCRYPT_OPTIONS); +process.stdout.write(`${salt.toString('hex')}:${hash.toString('hex')}`);