From 946d26c2f2f1fb038a622d725304318dc4da0941 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 15 May 2026 11:47:28 +0200 Subject: [PATCH 1/3] feat(backup): add backup stacklet for photos and documents Append-only backup of stacklet data to an attached APFS disk. Photos (Immich originals) and documents (Paperless archived PDFs) sync nightly via cron; every file is locked with the kernel uchg flag once on the vault, so accidents and ransomware cannot reach back through. Stacklet: * host-type stacklet, opt-in via 'stack up backup' * interactive on_configure prompts for disk, encryption, nightly time * on_install plants the canary tripwire, generates an FDA-granted .app wrapper, installs the cron entry * on_stop / on_destroy remove the cron defensively; vault data and the Keychain entry are preserved External-disk engine: * rsync --ignore-existing for append-only writes * chflags uchg on every new file (kernel-enforced immutability) * canary + preflight refuse to sync a wiped source * filesystem probe refuses non-APFS targets at runtime * per-run JSON appended to history.jsonl (append-only audit trail) Orchestrator: * discovers [[backup.archive]] across enabled stacklets * posts summary to #famstack via stacker-bot Reserved for follow-up: [[backup.snapshot]] for DB dumps, 'stack backup restore' + per-stacklet on_restore hooks, encrypted offsite via restic. Tests: 153 unit + 9 E2E against a real APFS sparse image. 
--- docs/stack-reference.md | 53 + docs/user-guide.md | 60 +- stack.example.toml | 16 + stacklets/backup/README.md | 117 ++ stacklets/backup/_config.py | 183 +++ stacklets/backup/_cron.py | 143 +++ stacklets/backup/cli/_orchestrator.py | 404 +++++++ stacklets/backup/cli/status.py | 30 + stacklets/backup/cli/sync.py | 220 ++++ .../backup/engines/external-disk/README.md | 149 +++ .../backup/engines/external-disk/sync.py | 1046 +++++++++++++++++ stacklets/backup/hooks/on_configure.py | 316 +++++ stacklets/backup/hooks/on_destroy.py | 91 ++ stacklets/backup/hooks/on_install.py | 247 ++++ stacklets/backup/hooks/on_start.py | 66 ++ stacklets/backup/hooks/on_stop.py | 70 ++ stacklets/backup/stacklet.toml | 51 + stacklets/docs/stacklet.toml | 16 + stacklets/photos/stacklet.toml | 19 + tests/stacklets/test_backup_config.py | 228 ++++ tests/stacklets/test_backup_configure.py | 253 ++++ tests/stacklets/test_backup_cron.py | 253 ++++ tests/stacklets/test_backup_e2e.py | 497 ++++++++ tests/stacklets/test_backup_engine.py | 444 +++++++ tests/stacklets/test_backup_install.py | 125 ++ tests/stacklets/test_backup_orchestrator.py | 438 +++++++ 26 files changed, 5531 insertions(+), 4 deletions(-) create mode 100644 stacklets/backup/README.md create mode 100644 stacklets/backup/_config.py create mode 100644 stacklets/backup/_cron.py create mode 100644 stacklets/backup/cli/_orchestrator.py create mode 100644 stacklets/backup/cli/status.py create mode 100644 stacklets/backup/cli/sync.py create mode 100644 stacklets/backup/engines/external-disk/README.md create mode 100755 stacklets/backup/engines/external-disk/sync.py create mode 100644 stacklets/backup/hooks/on_configure.py create mode 100644 stacklets/backup/hooks/on_destroy.py create mode 100644 stacklets/backup/hooks/on_install.py create mode 100644 stacklets/backup/hooks/on_start.py create mode 100644 stacklets/backup/hooks/on_stop.py create mode 100644 stacklets/backup/stacklet.toml create mode 100644 
tests/stacklets/test_backup_config.py create mode 100644 tests/stacklets/test_backup_configure.py create mode 100644 tests/stacklets/test_backup_cron.py create mode 100644 tests/stacklets/test_backup_e2e.py create mode 100644 tests/stacklets/test_backup_engine.py create mode 100644 tests/stacklets/test_backup_install.py create mode 100644 tests/stacklets/test_backup_orchestrator.py diff --git a/docs/stack-reference.md b/docs/stack-reference.md index 22efbaa..9f45c84 100644 --- a/docs/stack-reference.md +++ b/docs/stack-reference.md @@ -293,6 +293,44 @@ Subscribing to custom events on the bot side isn't wired yet — that comes with the first consumer bot. The emit contract is stable and tested end-to-end (see `tests/integration/test_archivist_e2e.py`). +### Backup + +A stacklet declares the data it wants backed up. The `backup` stacklet +discovers these declarations across all enabled stacklets and routes them +through configured targets (see `stack.toml` → `[backup.targets.*]`). +The stacklet itself never reads the manifest — discovery is the runtime's +job. + +```toml +# stacklets/photos/stacklet.toml +[[backup.archive]] +name = "library" +path = "{data_dir}/photos/library/library" +min_files = 10 +``` + +| Field | Description | +|---|---| +| `name` | Short slug for this source. Combined with the stacklet id, this becomes the global source id (`photos/library`). Used in `stack backup status` output and (future) `--source=` selection. | +| `path` | Filesystem path to sync. Template variables from the rendered environment are available (`{data_dir}`, etc.). | +| `min_files` | Coarse ransomware smoke test. The engine counts files at `path` before syncing and refuses if the count is below this. The canary file is the precise tripwire; this is the dumb-and-cheap secondary check. Keep low enough that fresh installs don't trip it. | + +**`[[backup.archive]]`** declares an append-only store: files are added, +never modified, never deleted. 
The engine commits to kernel-enforced +immutability where the filesystem supports it. If a stacklet's data is +genuinely append-only (photo originals, archived PDFs), this is the +right section. (Storage-industry vocabulary calls this WORM — Write +Once Read Many.) + +**`[[backup.snapshot]]`** is reserved for time-stamped point-in-time +captures of mutable state (Postgres dumps, Docker volume tarballs). Not +yet implemented — declare an `archive` section today; a `snapshot` +section will be added later when DB-restore semantics ship. + +A stacklet may declare zero, one, or several entries of each kind. Sources +flow to every configured target whose engine supports the declared +section type. + --- ## Lifecycle @@ -434,6 +472,7 @@ stack destroy: | `on_start_ready` | Every up | **Runs after health checks pass.** The service is healthy and accepting API calls. Seed data, sync accounts, anything that needs the service running. Must be idempotent. | | `on_stop` | Every down | Stop native services. Only stops services we manage (.state/ markers). | | `on_destroy` | Once | Remove native services entirely (unload plists, uninstall). | +| `on_restore` | On `stack backup restore` | **Reserved — not yet invoked.** Runs after the backup engine has put a stacklet's files back on disk. Owns stacklet-specific recovery: DB import, search-index rebuild, account re-seed. Photos can ship an empty stub (Immich re-indexes from the library on its own); Docs needs `pg_restore` + Paperless reindex. Hook signature will match the other `run(ctx)` hooks. | **File resolution:** for each hook, the runtime looks for `.py` first, then `.sh`. Only one can exist — not both. Python is preferred. @@ -843,8 +882,22 @@ openai_key = "local" default = "mlx-community/Qwen2.5-14B-Instruct-4bit" whisper_url = "http://localhost:6111/v1" language = "en" + +[backup] +# One block per destination. Engine name selects the implementation. 
+[backup.targets.vault] +engine = "external-disk" +disk = "backup-vault" +schedule = "0 2 * * *" ``` +`[backup.targets.<name>]` defines one destination. `<name>` is the user's +label for that destination (any string). The required `engine` field +picks the implementation under `stacklets/backup/engines/`. Today only +`external-disk` ships. Engine-specific fields (`disk`, `schedule`, +future `repository`, `password`) live alongside `engine` in the same +block. + Stacklets never read `stack.toml` directly. The runtime resolves template variables and passes everything through the rendered `.env`. diff --git a/docs/user-guide.md b/docs/user-guide.md index 35e33f6..795aa91 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -508,15 +508,67 @@ All commands output JSON when piped. Use `--json` to force it, `--pretty` to for ## Backups -This is the part everyone skips and regrets. famstack puts every byte of user data under one directory: +This is the part everyone skips and regrets. famstack ships an opt-in backup stacklet for the irreplaceable file-level data (photo originals, scanned documents). It does not yet cover stacklet databases or your config files; for those, layer it with Time Machine or a periodic tar. + +### The backup stacklet + +Run `stack up backup`. You need an APFS-formatted external drive plugged in. The setup wizard asks for the disk name, encryption (default: plain APFS), and a nightly time (default 02:00). It then installs a small `.app` wrapper, asks you to grant it Full Disk Access, and adds a cron entry. + +**What gets backed up** + +| Source | Path on the vault disk | +|---|---| +| Immich photo originals | `/Volumes/<disk>/data/photos-library/` | +| Paperless archived PDFs | `/Volumes/<disk>/data/docs-media/` | + +Postgres databases for both are not backed up yet. You get your files back but lose albums, tags, custom fields, and saved views. Pg-dump snapshots will ship as `[[backup.snapshot]]` in a later release. 
+ +**How the protection works** + +Every file written to the vault gets the kernel `uchg` flag. macOS refuses to modify or delete uchg files, even with `sudo`. `rsync --ignore-existing` means existing vault files are skipped on every run, so the backup is append-only by design and accidental `rm -rf` on your main system cannot propagate. A canary file is checked before every sync; if ransomware has touched `~/famstack-data`, the canary won't match and the sync aborts before opening the vault. + +**Daily operation** + +```bash +stack backup sync # run a sync now (from any context) +stack backup status # last run, source counts, current mount state +stack down backup # remove the cron entry, keep .app and vault data +stack destroy backup # also remove the .app; vault and Keychain are preserved +``` + +The scheduled nightly run leaves the disk mounted between runs (cron cannot trigger eject under the macOS sandbox; files are kernel-locked regardless). Manual `stack backup sync` from Terminal does eject when it finishes. Results post to the `#famstack` Matrix room via stacker-bot. + +**Recovery (no special tooling needed)** + +Plug the vault disk into any Mac and browse the files in Finder. To copy locked files out: ```bash -ls ~/famstack-data +sudo chflags -R nouchg /Volumes/<disk>/data/photos-library/<folder>/ +cp -R /Volumes/<disk>/data/photos-library/<folder>/ ~/recovered/ ``` -That is what you back up. Time Machine works. So does `restic`, `rsync`, or copying it to an external drive every Sunday. Pick one and do it. +A `stack backup restore` command and `on_restore` hooks for database recovery are planned but not yet shipped. 
+ +**What it protects you from** + +| Threat | Covered | +|---|---| +| Ransomware encrypts your Mac | Yes (uchg + canary) | +| Accidental `rm -rf` on `~/famstack-data` | Yes (rsync never deletes from the vault) | +| You delete a photo on your phone | Yes (Immich propagates the delete to disk; vault keeps the original) | +| Vault drive stolen from your house | Only if you opted in to APFS encryption | +| Vault drive hardware failure | No (single physical copy; offsite engine planned) | +| Fire or flood | No (vault is in the same building; offsite engine planned) | + +**Limitations to know about** + +Only APFS or HFS+ disks attached via USB or Thunderbolt. Network shares (SMB/NFS/Synology) are refused at probe time because the kernel cannot enforce `uchg` over a network filesystem. One target only; encrypted offsite via restic is planned, not shipped. + +### Everything else + +`~/famstack-data` holds every byte of user data. Even with the backup stacklet running you may want Time Machine or `restic` against this directory for full coverage (databases, AI model caches). -What is **not** in `~/famstack-data` and therefore needs separate handling: +What is **not** in `~/famstack-data` and needs separate handling: - `stack.toml`, `users.toml`, `.stack/secrets.toml` in the repo. Small but irreplaceable. Gitignored on purpose, so they will not survive a fresh `git clone`. - oMLX models (in `~/.omlx/models`). Re-downloadable. diff --git a/stack.example.toml b/stack.example.toml index cf55da9..9f6954a 100644 --- a/stack.example.toml +++ b/stack.example.toml @@ -85,6 +85,22 @@ whisper_url = "http://localhost:42062/v1" # "en" = English (alloy voice), "de" = German (onyx voice) language = "en" +[backup] +# Nightly append-only backup of stacklet data. Enable the backup stacklet +# (`stack up backup`) to install the cron entry and FDA-granted .app +# wrapper that runs the sync. 
Each [backup.targets.*] block defines one +# destination; sources are discovered from stacklets that declare +# [[backup.archive]] (append-only) in their own manifest. +# +# In v1 only the external-disk engine ships. Restic-based encrypted +# offsite is planned and will slot in as a second target without +# touching this file's existing entries. + +# [backup.targets.vault] +# engine = "external-disk" # rsync + chflags uchg + diskutil eject +# disk = "backup-vault" # APFS volume name (case-sensitive) +# schedule = "0 2 * * *" # 5-field cron, runs nightly at 02:00 + [services] # Homepage dashboard URL — set automatically when core is enabled. homepage_url = "http://localhost:3000" diff --git a/stacklets/backup/README.md b/stacklets/backup/README.md new file mode 100644 index 0000000..a3ea851 --- /dev/null +++ b/stacklets/backup/README.md @@ -0,0 +1,117 @@ +# backup — append-only backup of stacklet data + +## What it does + +Coordinates nightly backups of stacklet data to attached external disks. +The model is an **append-only archive**: files are added, never modified, +never deleted. Once a photo or document lands on the backup disk, the +kernel itself refuses to let anything change it. The threat model is +*ransomware, accidents, mistakes* — not a sophisticated targeted +attack. + +See `engines/external-disk/README.md` for the full protection layers and +the rationale behind each one. + +## Architecture + +Backup is **a coordinator, not a backup tool**. The actual work is done +by *engines*, each of which implements one well-defined backup strategy +with explicit guarantees: + +| Engine | Status | What it does | +|---|---|---| +| `external-disk` | scaffolded, port pending | rsync + chflags uchg on attached APFS disk | +| `restic` | planned | encrypted, deduplicated, snapshotted offsite (S3/B2) | + +Sources are discovered from other stacklets via a manifest contract. 
+Every stacklet that declares `[[backup.archive]]` (an append-only store) +in its `stacklet.toml` contributes one source path to the next sync: + +```toml +# stacklets/photos/stacklet.toml +[[backup.archive]] +name = "library" +path = "{data_dir}/photos/library/library" +min_files = 10 +``` + +Targets are configured in `stack.toml`. Today the only target is the +attached disk: + +```toml +# stack.toml +[backup] +[backup.targets.vault] +engine = "external-disk" +disk = "backup-vault" +``` + +Routing: every `[[backup.archive]]` source flows to every target whose +engine supports append-only semantics. Adding a second target later +(offsite restic) is purely additive — no manifest change on photos/docs. + +## CLI + +``` +stack backup sync [--dry-run] [--no-eject] [--verbose] +stack backup status # last run, source counts, cron presence +``` + +Per-stacklet aliases (`stack photos backup`, `stack docs backup`) and +restore (`stack backup restore --source=…`) are intentionally not in +v1 — they'll layer on once the engine port lands and the manifest +contract has been exercised on at least one production sync. + +## Destroy semantics + +`stack destroy backup` removes the backup *tooling* — never the +*backups*. Specifically: + +- **Removed:** cron entry, FamstackVaultSync.app bundle, local logs, + canary file under BACKUP_DATA_DIR. +- **Preserved:** every file on the vault disk. The whole point of an + append-only archive is that it outlives the system that wrote it. +- **Preserved:** the macOS Keychain entry for the disk passphrase + (encrypted vaults only). The user may want manual disk access after + uninstall; the command to remove it is surfaced if they want a fully + clean state. + +Defensive measure: `on_configure` refuses to let `BACKUP_DATA_DIR` +point at a path under `/Volumes/`. That way the framework's automatic +data-dir cleanup at destroy time can never accidentally reach external +storage. 
+ +## Recovery without restore tooling + +The v1 engine writes plain files in plain directory structures. No +restore CLI exists yet — but you don't need one to get your photos +back: + +```bash +# 1. Plug the vault disk into any Mac and unlock it (Finder prompts +# for the passphrase if encrypted) + +# 2. Browse to the originals +ls /Volumes/backup-vault/data/photos-library/ + +# 3. Files are immutable. Unlock the ones you want to recover: +sudo chflags -R nouchg /Volumes/backup-vault/data/photos-library/ + +# 4. Copy them wherever you need +cp -R /Volumes/backup-vault/data/photos-library/ ~/recovered-photos/ +``` + +This is the "survivalist" property the append-only design buys: no +special software needed to read the archive. The future restore CLI will +automate this and run stacklet-specific recovery via `on_restore` +hooks (DB import, search-index rebuild). For v1, manual recovery +is the documented path. + +## Status + +This stacklet is currently **scaffold only**. The hooks and CLI files +raise `NotImplementedError`. The next step is porting `vault-sync.sh` +from `family-server/backup/` into `engines/external-disk/`, with two +adaptations: source discovery via the manifest contract, and Matrix +notifications via the local `stacker-bot` instead of the legacy +`kit-control-bot`. diff --git a/stacklets/backup/_config.py b/stacklets/backup/_config.py new file mode 100644 index 0000000..0beecd2 --- /dev/null +++ b/stacklets/backup/_config.py @@ -0,0 +1,183 @@ +"""Target config read/write for the backup stacklet's stack.toml entries. + +The framework's ``ctx.cfg`` reads and writes a stacklet's own section +(``[backup]``, in our case), but backup's target config lives one level +deeper at ``[backup.targets.<name>]``. ``ctx.cfg`` can't address nested +tables, so this module provides narrow, atomic helpers scoped to that +schema only. 
+ +We deliberately do *not* try to be a generalized TOML editor: + +* The reader uses ``tomllib`` (stdlib) so it benefits from a real + parser — no surprises with multi-line strings or array tables. +* The writer is a targeted block replacement: it finds the + ``[backup.targets.]`` header and replaces from there until the + next section. Comments and content outside the target block are + preserved byte-for-byte. + +If multiple stacklets eventually need the same kind of nested-table +write, this is the natural seed for a framework-level helper. Until +then, keeping the surface narrow protects the rest of stack.toml from +us. +""" + +from __future__ import annotations + +import os +import tempfile +from pathlib import Path +from typing import Optional + +try: + import tomllib +except ModuleNotFoundError: # pragma: no cover — py < 3.11 fallback + from stack._vendor import tomli as tomllib # type: ignore + + +def read_target(toml_path: Path, target_name: str) -> Optional[dict]: + """Read ``[backup.targets.]`` and return its config. + + Returns ``None`` when the file is missing, unreadable, or the + target isn't configured. Doesn't distinguish those cases at the + return value — callers that need the distinction can check + ``toml_path.exists()`` themselves. + """ + if not toml_path.exists(): + return None + try: + with toml_path.open("rb") as f: + data = tomllib.load(f) + except (tomllib.TOMLDecodeError, OSError): + return None + return data.get("backup", {}).get("targets", {}).get(target_name) + + +def write_target(toml_path: Path, target_name: str, config: dict) -> None: + """Create or replace ``[backup.targets.]`` atomically. + + All values in ``config`` are written as TOML basic strings (double- + quoted, with backslash + quote escaped). Int and bool aren't + supported because no current target field needs them — adding them + later is a one-line change in :func:`_render_value`. 
+ + Comment handling: comments and blank lines that visually precede the + *next* section header are left attached to that section, not + swallowed into the replaced block. Comments *inside* the replaced + block (between key lines) are lost — replacing means replacing. + + The write goes through a temp file in the same directory followed + by ``os.replace``, so a crash mid-write can't leave a half-written + stack.toml. + """ + content = toml_path.read_text() if toml_path.exists() else "" + new_block = _render_block(target_name, config) + + bounds = _find_block(content, target_name) + if bounds is not None: + start, end = bounds + new_content = content[:start] + new_block + content[end:] + else: + new_content = _append_block(content, new_block) + + _atomic_write(toml_path, new_content) + + +# ── Internals ────────────────────────────────────────────────────────────── + +def _render_block(target_name: str, config: dict) -> str: + """Render a ``[backup.targets.]`` block with given values.""" + lines = [f"[backup.targets.{target_name}]"] + for k, v in config.items(): + lines.append(f"{k} = {_render_value(v)}") + return "\n".join(lines) + "\n" + + +def _render_value(value) -> str: + """Serialize a Python value as a TOML literal. Strings only for now.""" + return f'"{_toml_escape(str(value))}"' + + +def _toml_escape(s: str) -> str: + """Escape backslashes and double quotes for TOML basic strings. + + Other control characters aren't escaped here — none of the target + config fields (engine name, disk name, cron schedule) plausibly + contain newlines or tabs. If that assumption changes, replace + this with a fuller escape table. + """ + return s.replace("\\", "\\\\").replace('"', '\\"') + + +def _find_block(content: str, target_name: str) -> Optional[tuple]: + """Locate ``[backup.targets.]`` in ``content``. + + Returns ``(start_offset, end_offset)`` for slicing — start is the + beginning of the header line; end is just past the last key=value + line of the block. 
Trailing blank lines and comments that visually + attach to the *next* section are NOT consumed; they stay where + they were so writing one target can't accidentally orphan another + target's lead-in comment. + + Returns ``None`` when the header isn't present. + """ + header = f"[backup.targets.{target_name}]" + lines = content.splitlines(keepends=True) + + start_line = None + for i, line in enumerate(lines): + if line.strip() == header: + start_line = i + break + if start_line is None: + return None + + # Walk forward. A "data line" (key = value) extends the block; a + # blank or comment line does not — those belong to whatever + # follows. Stop at the next section header. + last_data_line = start_line + cursor = start_line + 1 + while cursor < len(lines): + stripped = lines[cursor].strip() + if stripped.startswith("["): + break + if stripped and not stripped.startswith("#"): + last_data_line = cursor + cursor += 1 + + end_line = last_data_line + 1 + start_offset = sum(len(l) for l in lines[:start_line]) + end_offset = sum(len(l) for l in lines[:end_line]) + return start_offset, end_offset + + +def _append_block(content: str, new_block: str) -> str: + """Append a block to file content, ensuring one blank line of + separation from whatever came before.""" + if not content: + return new_block + stripped = content.rstrip("\n") + return f"{stripped}\n\n{new_block}" + + +def _atomic_write(path: Path, content: str) -> None: + """Write ``content`` to ``path`` via a same-directory temp file + + rename. ``os.replace`` is atomic on POSIX so readers see either the + old or new file, never a half-written one.""" + fd, tmp_path_str = tempfile.mkstemp( + prefix=path.name + ".", + suffix=".tmp", + dir=str(path.parent), + ) + tmp_path = Path(tmp_path_str) + try: + with os.fdopen(fd, "w") as f: + f.write(content) + os.replace(tmp_path, path) + except Exception: + # Best-effort cleanup; missing_ok is fine because replace may + # have already moved the file. 
+ try: + tmp_path.unlink() + except FileNotFoundError: + pass + raise diff --git a/stacklets/backup/_cron.py b/stacklets/backup/_cron.py new file mode 100644 index 0000000..d1de9ba --- /dev/null +++ b/stacklets/backup/_cron.py @@ -0,0 +1,143 @@ +"""Crontab install/remove helpers for the backup stacklet. + +Backup's nightly run is a cron entry rather than a launchd job because +launchd's sandbox blocks ``diskutil`` operations even with Full Disk +Access — the only combination macOS actually allows is cron invoking +an .app bundle via ``open`` (see +``family-server/backup/docs/MACOS-SANDBOX-BACKUP-SCRIPT.md`` for the +full history of approaches that didn't work). + +Entries are identified by an inline marker comment:: + + 0 2 * * * open /Volumes/... # famstack-backup-vault + +The marker has the target name appended so multiple targets (vault, +offsite) can coexist without one's removal touching the other's entry. +``install_entry`` and ``remove_entry`` are both idempotent — running +either twice is harmless. + +The contract: ``stack down backup`` and ``stack destroy backup`` MUST +both leave the user's crontab free of our entries. If an edit fails +(locked file, permission denied) we raise with the exact line the user +should remove manually rather than failing silently — a stale cron +entry firing nightly against an uninstalled backup is the worst +operational outcome (see ``project_backup_lifecycle.md``). +""" + +from __future__ import annotations + +import subprocess +from typing import List + + +MARKER_PREFIX = "# famstack-backup-" + + +def marker_for(target_name: str) -> str: + """Return the inline marker string for a given target. The string + is what gets appended to the cron line after ``#`` so removal can + find and drop it again.""" + return f"famstack-backup-{target_name}" + + +def install_entry(schedule: str, command: str, target_name: str) -> bool: + """Install a cron entry for ``target_name`` with the given schedule + and command. 
+ + Idempotent: if an entry with this target's marker is already + present, it's replaced rather than duplicated. Returns ``True`` if + the crontab changed, ``False`` if the desired entry was already + present byte-for-byte. + + Raises ``RuntimeError`` if the ``crontab`` command fails — the + caller surfaces the exact line for manual installation. + """ + marker = marker_for(target_name) + desired_line = f"{schedule} {command} # {marker}" + + current = _read_crontab() + without_ours = [l for l in current if marker not in l] + new_lines = without_ours + [desired_line] + + if new_lines == current: + return False + + _write_crontab("\n".join(new_lines) + "\n") + return True + + +def remove_entry(target_name: str) -> bool: + """Remove the cron entry for ``target_name`` if present. + + Idempotent — returns ``False`` when no matching entry was found + (already removed, or never installed). ``True`` when an entry was + actually removed. + """ + marker = marker_for(target_name) + current = _read_crontab() + filtered = [l for l in current if marker not in l] + if filtered == current: + return False + _write_crontab("\n".join(filtered) + "\n" if filtered else "") + return True + + +def is_installed(target_name: str) -> bool: + """True if a cron entry tagged with this target's marker is present + in the current user's crontab.""" + marker = marker_for(target_name) + return any(marker in l for l in _read_crontab()) + + +def remove_all_entries() -> int: + """Remove every cron entry whose marker matches ``famstack-backup-*``. + + Useful in destroy paths when we may have lost track of which target + names exist (stack.toml deleted, partial install). Returns the + number of entries removed. 
+ """ + current = _read_crontab() + filtered = [l for l in current if MARKER_PREFIX not in l] + removed = len(current) - len(filtered) + if removed == 0: + return 0 + _write_crontab("\n".join(filtered) + "\n" if filtered else "") + return removed + + +# ── crontab I/O ──────────────────────────────────────────────────────────── + +def _read_crontab() -> List[str]: + """Current user's crontab as a list of lines. + + Returns ``[]`` for "no crontab installed" — the typical first-run + state on a fresh Mac. Other crontab failures (locked, permission + denied) also return ``[]`` so the caller's subsequent write + attempt is the one that surfaces the real error. + """ + result = subprocess.run( + ["crontab", "-l"], + capture_output=True, text=True, + ) + if result.returncode != 0: + return [] + return result.stdout.splitlines() + + +def _write_crontab(content: str) -> None: + """Replace the current user's crontab with ``content``. + + Raises :class:`RuntimeError` with the crontab command's stderr on + failure. We never want to swallow this — a write failure means our + install or remove didn't actually happen. + """ + result = subprocess.run( + ["crontab", "-"], + input=content, text=True, + capture_output=True, + ) + if result.returncode != 0: + raise RuntimeError( + f"crontab edit failed (exit {result.returncode}): " + f"{result.stderr.strip() or 'no error output'}" + ) diff --git a/stacklets/backup/cli/_orchestrator.py b/stacklets/backup/cli/_orchestrator.py new file mode 100644 index 0000000..82c6ba0 --- /dev/null +++ b/stacklets/backup/cli/_orchestrator.py @@ -0,0 +1,404 @@ +"""Backup orchestrator helpers — discovery, invocation, formatting. + +The orchestrator's job is to coordinate the engine. The engine knows +how to back up; the orchestrator knows *what* to back up and *where to +report* the outcome. + +Pipeline: + + 1. Read ``[backup.targets.*]`` from stack.toml. + 2. Walk enabled stacklets for ``[[backup.archive]]`` entries. + 3. 
Render template variables in the declared paths. + 4. For each target: build the ``$SOURCES`` env string, invoke the + engine, read the latest entry from ``history.jsonl``, format and + post a Matrix summary. + +This module holds the pure functions so they can be unit-tested +without running rsync, diskutil, or Matrix logins. The entry point +``cli/sync.py`` is a thin wrapper that wires them up. +""" + +from __future__ import annotations + +import argparse +import json +import os +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional, Tuple + +try: + import tomllib +except ModuleNotFoundError: # pragma: no cover — py < 3.11 fallback + from stack._vendor import tomli as tomllib # type: ignore + + +# ── Domain types ─────────────────────────────────────────────────────────── + +@dataclass +class SourceRecord: + """One ``[[backup.archive]]`` entry, after template rendering. + + These are the records the engine consumes. The orchestrator builds + them by discovery; the engine never reads stacklet manifests + itself. + """ + + id: str # "{stacklet_id}/{archive.name}", e.g. "photos/library" + display: str # Human-readable, e.g. "Photos" + src_path: Path # Absolute path on internal SSD (post-rendering) + vault_subdir: str # Relative path under /Volumes// + min_files: int # Coarse ransomware guard threshold + + +@dataclass +class Target: + """One ``[backup.targets.]`` block from stack.toml.""" + + name: str # User-chosen label ("vault", "offsite", ...) 
+ engine: str # "external-disk" (today), "restic" (future) + disk: str # Volume name; meaningful for external-disk + schedule: str # Cron expression; informational at this level + + +# ── Source discovery ─────────────────────────────────────────────────────── + +def discover_archive_sources( + repo_root: Path, + instance_dir: Path, + data_dir: Path, +) -> List[SourceRecord]: + """Walk every ``stacklets/*/stacklet.toml`` and gather + ``[[backup.archive]]`` entries. + + A stacklet contributes only if it's enabled — presence of + ``.stack/{id}.setup-done`` under ``instance_dir``. Template vars + (currently just ``{data_dir}``) are rendered into the path field. + + The source ``id`` is ``{stacklet_id}/{archive.name}`` so a single + stacklet can declare multiple archives without collision. The + vault subdirectory is derived as ``data/{stacklet_id}-{name}`` — + short, stable, and namespaced so future stacklets can't accidentally + clobber existing archive directories. + """ + stacklets_dir = repo_root / "stacklets" + if not stacklets_dir.is_dir(): + return [] + + template_vars = {"data_dir": str(data_dir)} + sources: List[SourceRecord] = [] + + for manifest_path in sorted(stacklets_dir.glob("*/stacklet.toml")): + stacklet_id = manifest_path.parent.name + if not _is_setup_done(instance_dir, stacklet_id): + continue + + manifest = _safe_load_toml(manifest_path) + if manifest is None: + continue + + archives = manifest.get("backup", {}).get("archive", []) + if not archives: + continue + + stacklet_display = manifest.get("name", stacklet_id) + + for archive in archives: + name = archive.get("name", "default") + raw_path = archive.get("path", "") + try: + rendered_path = raw_path.format(**template_vars) + except (KeyError, IndexError): + # An unrecognized template variable. Treat the raw + # string as the literal path; the engine's preflight + # will surface the problem with a useful error. 
+ rendered_path = raw_path + + try: + min_files = int(archive.get("min_files", 1)) + except (TypeError, ValueError): + min_files = 1 + + sources.append(SourceRecord( + id=f"{stacklet_id}/{name}", + display=stacklet_display, + src_path=Path(rendered_path), + vault_subdir=f"data/{stacklet_id}-{name}", + min_files=min_files, + )) + + return sources + + +def _is_setup_done(instance_dir: Path, stacklet_id: str) -> bool: + """Replicates Stack._is_set_up without importing Stack — keeps this + module standalone-testable.""" + return (instance_dir / ".stack" / f"{stacklet_id}.setup-done").exists() + + +def _safe_load_toml(path: Path) -> Optional[dict]: + """Load TOML, returning None on read or parse failure. A single + broken manifest shouldn't take down the whole sync.""" + try: + with path.open("rb") as f: + return tomllib.load(f) + except (tomllib.TOMLDecodeError, OSError): + return None + + +# ── Target discovery ─────────────────────────────────────────────────────── + +def get_targets(stack_config: dict) -> List[Target]: + """Parse ``[backup.targets.*]`` from stack.toml. Returns empty list + when the section is missing or contains nothing usable. + + Entries lacking ``engine`` are skipped — they're malformed and + shouldn't silently behave like a default. + """ + targets_cfg = stack_config.get("backup", {}).get("targets", {}) + targets: List[Target] = [] + for name, cfg in targets_cfg.items(): + if not isinstance(cfg, dict): + continue + engine = cfg.get("engine", "") + if not engine: + continue + targets.append(Target( + name=name, + engine=engine, + disk=cfg.get("disk", ""), + schedule=cfg.get("schedule", ""), + )) + return targets + + +# ── Engine invocation ────────────────────────────────────────────────────── + +def serialize_sources_env(sources: List[SourceRecord]) -> str: + """Format SourceRecords for the engine's ``$SOURCES`` env var. 
+ + The engine's :func:`parse_sources` expects newline-separated, + pipe-delimited records: ``id|display|src_path|vault_subdir|min_files``. + """ + return "\n".join( + f"{s.id}|{s.display}|{s.src_path}|{s.vault_subdir}|{s.min_files}" + for s in sources + ) + + +def build_engine_command( + engine_script: Path, + args: argparse.Namespace, +) -> List[str]: + """Compose the engine subprocess command line.""" + cmd = [sys.executable, str(engine_script)] + if getattr(args, "dry_run", False): + cmd.append("--dry-run") + if getattr(args, "no_eject", False): + cmd.append("--no-eject") + if getattr(args, "verbose", False): + cmd.append("--verbose") + if getattr(args, "verify", False): + cmd.append("--verify") + return cmd + + +def invoke_engine( + engine_script: Path, + backup_data_dir: Path, + target: Target, + sources: List[SourceRecord], + args: argparse.Namespace, +) -> int: + """Run the engine for one target. Returns the engine's exit code. + + The engine appends to ``history.jsonl`` under + ``BACKUP_DATA_DIR/logs/``; callers read the last entry via + :func:`read_latest_run` after this returns. + + The canary is planted by ``on_install``, not by the engine on + first run, so the orchestrator doesn't need to pass any extra + state for that — the engine simply verifies the canary it expects + to find. + """ + env = os.environ.copy() + env["BACKUP_DATA_DIR"] = str(backup_data_dir) + env["VAULT_DISK"] = target.disk + env["SOURCES"] = serialize_sources_env(sources) + + cmd = build_engine_command(engine_script, args) + result = subprocess.run(cmd, env=env) + return result.returncode + + +# ── Result reading ───────────────────────────────────────────────────────── + +def read_latest_run(backup_data_dir: Path) -> Optional[dict]: + """Return the most recent run's outcome by tail-scanning + ``history.jsonl``. + + Returns ``None`` when the file is missing or contains no parseable + line. 
The caller treats that case as "the engine crashed before it + could report" — distinct from a written failure result. + + Tolerates a corrupted trailing line (e.g. a partial write from a + crashed engine, though our writes fit under PIPE_BUF and should be + atomic). Walks past unparseable lines and returns the last good + one. + """ + path = backup_data_dir / "logs" / "history.jsonl" + if not path.exists(): + return None + latest: Optional[dict] = None + try: + with path.open() as f: + for line in f: + stripped = line.strip() + if not stripped: + continue + try: + latest = json.loads(stripped) + except json.JSONDecodeError: + continue + except OSError: + return None + return latest + + +# ── Notification formatting ──────────────────────────────────────────────── + +def format_notification(target_name: str, result: dict) -> Tuple[str, str]: + """Build ``(plain_text, html)`` Matrix bodies for one target's result. + + The plain version is the fallback for text-only clients. The HTML + version bolds the numbers that matter (totals, new files) so + Element renders them readably. + """ + headline_plain, headline_html = _headline(result) + state_plain, state_html = _vault_state_line(result) + source_lines_plain, source_lines_html = _source_lines(result) + + duration_str = _duration(result.get("duration_seconds", 0)) + vault_size = result.get("vault_size", "unknown") + run_user = result.get("run_user") or "unknown" + run_context = result.get("run_context", "unknown") + + failure_line_plain = "" + failure_line_html = "" + if not result.get("success") and result.get("failure_reason"): + failure_line_plain = f"Reason: {result['failure_reason']}\n" + failure_line_html = f"Reason: {result['failure_reason']}
" + + plain = ( + f"{headline_plain}\n" + f"{failure_line_plain}" + f"\n" + f"Target: {target_name}\n" + f"Duration: {duration_str} | Backup size: {vault_size}\n" + f"Run by: {run_user} via {run_context}\n" + f"\n" + + ("\n".join(source_lines_plain) + "\n" if source_lines_plain else "") + + f"\n{state_plain}" + ) + html = ( + f"{headline_html}
" + f"{failure_line_html}" + f"
" + f"Target: {target_name}
" + f"Duration: {duration_str}  |  " + f"Backup size: {vault_size}
" + f"Run by: {run_user} via {run_context}
" + f"
" + + ("
".join(source_lines_html) + "
" if source_lines_html else "") + + f"
{state_html}" + ) + return plain, html + + +def _headline(result: dict) -> Tuple[str, str]: + if result.get("dry_run"): + return "🧪 Backup Sync (dry run)", "🧪 Backup Sync (dry run)" + if result.get("success"): + return "✅ Backup Sync Completed", "✅ Backup Sync Completed" + return "❌ Backup Sync FAILED", "❌ Backup Sync FAILED" + + +def _vault_state_line(result: dict) -> Tuple[str, str]: + state = result.get("vault_state", "unknown") + run_context = result.get("run_context", "unknown") + if state == "not_connected": + msg = "⚠️ Backup disk not connected." + elif state == "mounted": + # Document the operational truth: scheduled syncs can't eject, + # so a mounted disk after a cron run isn't a problem — it's the + # expected steady state. + msg = f"ℹ️ Backup disk mounted (eject not available from {run_context})." + elif state == "ejected": + msg = "⏏️ Backup disk ejected." + else: + msg = f"Backup disk state: {state}" + return msg, msg + + +def _source_lines(result: dict) -> Tuple[List[str], List[str]]: + plain: List[str] = [] + html: List[str] = [] + for src in result.get("sources", []): + emoji = _source_emoji(src.get("id", "")) + display = src.get("display", "Source") + status = src.get("status", "skipped") + total = src.get("total_files", 0) + new = src.get("new_files", 0) + + if status == "ok": + plain.append( + f"{emoji} {display} — {_format_number(total)} files " + f"({_format_number(new)} new)" + ) + html.append( + f"{emoji} {display} — {_format_number(total)} files " + f"({_format_number(new)} new)" + ) + elif status == "FAILED": + plain.append(f"{emoji} {display} — FAILED") + html.append(f"{emoji} {display} — FAILED") + else: + plain.append(f"{emoji} {display} — skipped") + html.append(f"{emoji} {display} — skipped") + + return plain, html + + +def _source_emoji(source_id: str) -> str: + """Decorative emoji for a source family. 
Pure cosmetics — no + semantics depend on these.""" + if source_id.startswith("photos/"): + return "📷" + if source_id.startswith("docs/"): + return "📄" + if source_id.startswith("code/"): + return "💾" + return "📦" + + +def _duration(seconds: int) -> str: + """Format duration as ``Xm Ys``.""" + mins, secs = divmod(int(seconds), 60) + return f"{mins}m {secs}s" + + +def _format_number(n: int) -> str: + """Dot-separated thousands. Mirrors the engine's helper so notification + numbers match the engine's terminal output; comma triggers phone-number + linkification in Element.""" + s = str(n) + if len(s) <= 3: + return s + parts: List[str] = [] + while len(s) > 3: + parts.insert(0, s[-3:]) + s = s[:-3] + parts.insert(0, s) + return ".".join(parts) diff --git a/stacklets/backup/cli/status.py b/stacklets/backup/cli/status.py new file mode 100644 index 0000000..24a9e46 --- /dev/null +++ b/stacklets/backup/cli/status.py @@ -0,0 +1,30 @@ +"""stack backup status — show the last run and current target health. + +Reports per target: + - When it was last synced and whether it succeeded + - Per-source file counts (from the source paths on the SSD) + - Whether the canary file is intact + - Whether the cron entry for the next scheduled run is wired + - Whether the disk is currently mounted + +A mounted disk is the steady-state expectation for scheduled mode — +eject is sandbox-blocked from cron, so the disk stays mounted between +runs and files are kernel-immutable. "Not mounted" only deserves a +warning when: + - The disk is encrypted and hasn't been unlocked since reboot + - The disk is physically disconnected + - The previous scheduled run failed to find /Volumes/ + +When the disk is mounted, the command also reports vault size, per- +source file counts on the vault, and free space. When it isn't, those +fields show the last known values from the previous successful run. + +Outputs JSON when piped, human-readable otherwise. 
Matches the +convention of `stack status`, `stack errors`, `stack host`. +""" + +HELP = "Show last-run status and target health" + +raise NotImplementedError( + "backup stacklet scaffold — see stacklets/backup/README.md" +) diff --git a/stacklets/backup/cli/sync.py b/stacklets/backup/cli/sync.py new file mode 100644 index 0000000..c3cfe40 --- /dev/null +++ b/stacklets/backup/cli/sync.py @@ -0,0 +1,220 @@ +"""stack backup sync — run a backup now. + +Discovers ``[[backup.archive]]`` sources from every enabled stacklet +(append-only stores) and routes them through every configured +``[backup.targets.*]`` engine. Each engine writes a structured result +to ``$BACKUP_DATA_DIR/logs/history.jsonl``; the orchestrator reads +that file, formats a per-target summary, and posts it to the +``#famstack`` room as ``stacker-bot``. + +Exit code is 0 only if every target succeeded. A missing notification +(Matrix unavailable, room not yet created) is soft-failed — the sync +itself is recorded regardless. + +Usage: + stack backup sync full sync of all sources to all targets + stack backup sync --dry-run preview only — no writes, no mounts + stack backup sync --no-eject keep the disk mounted after sync + stack backup sync --verbose rsync file-level output + stack backup sync --verify compare file counts source vs vault +""" + +HELP = "Run a backup now (all sources to all configured targets)" + +import argparse +import sys +from pathlib import Path +from typing import Optional + +_here = Path(__file__).parent +sys.path.insert(0, str(_here)) +from _orchestrator import ( + SourceRecord, + Target, + discover_archive_sources, + format_notification, + get_targets, + invoke_engine, + read_latest_run, +) + +# MatrixClient lives in the messages stacklet — the canonical Matrix +# interface for any CLI plugin that needs to post. Cross-stacklet import +# here is fine because messaging is a hard prerequisite for sending a +# notification anyway: if messages isn't around, neither is MatrixClient. 
+_messages_cli = _here.parent.parent / "messages" / "cli" +sys.path.insert(0, str(_messages_cli)) + + +def _parse_args(argv: list) -> argparse.Namespace: + parser = argparse.ArgumentParser(prog="stack backup sync", description=HELP) + parser.add_argument("--dry-run", action="store_true", + help="Preview what would be synced (no changes).") + parser.add_argument("--no-eject", action="store_true", + help="Keep vault mounted after sync.") + parser.add_argument("--verbose", action="store_true", + help="Show rsync file-level details.") + parser.add_argument("--verify", action="store_true", + help="Compare file counts source vs vault after sync.") + return parser.parse_args(argv) + + +def _resolve_backup_data_dir(config: dict) -> Path: + """Render the backup stacklet's own ``BACKUP_DATA_DIR`` from its + manifest. The default is ``{data_dir}/backup`` — we don't import the + full template renderer here; substituting one variable is enough.""" + manifest = config.get("manifest", {}) + template = manifest.get("env", {}).get("defaults", {}).get( + "BACKUP_DATA_DIR", "{data_dir}/backup" + ) + data_dir = config.get("data_dir", "") + try: + return Path(template.format(data_dir=data_dir)) + except (KeyError, IndexError): + # Fall back to the well-known default. + return Path(data_dir) / "backup" + + +def _post_notification(plain: str, html: str, config: dict) -> Optional[str]: + """Post a notification to ``#famstack`` as stacker-bot. + + Returns an error string on failure, ``None`` on success. The + orchestrator soft-fails on notification problems — failing to + deliver the message must not mask the actual sync outcome. 
+ """ + try: + from _matrix import MatrixClient # type: ignore + except ImportError: + return "MatrixClient not importable (messages stacklet missing?)" + + stack_cfg = config.get("stack", {}) + secrets = config.get("secrets", {}) + server_name = stack_cfg.get("messages", {}).get("server_name", "home") + + bot_pass = (secrets.get("core__STACKER_BOT_PASSWORD") + or secrets.get("messages__STACKER_BOT_PASSWORD", "")) + if not bot_pass: + return "stacker-bot password not in secrets — is core set up?" + + instance_dir = config.get("instance_dir", config.get("repo_root", ".")) + # Synapse host port: the messages stacklet binds 42031. We read it + # from the messages manifest if available; otherwise use the + # well-known default. The send.py plugin uses the same constant. + synapse_port = _read_synapse_port(Path(config.get("repo_root", "."))) + base_url = f"http://localhost:{synapse_port}" + + client = MatrixClient(base_url, server_name, instance_dir) + if not client.login("stacker-bot", bot_pass): + return "stacker-bot login failed — run 'stack messages setup' to re-create the account" + + ok, detail = client.send("famstack", plain, html=html) + if not ok: + return f"Matrix send failed: {detail}" + return None + + +def _read_synapse_port(repo_root: Path) -> int: + """Pull synapse port from the messages stacklet manifest, falling + back to the standard 42031.""" + try: + import tomllib + except ModuleNotFoundError: # pragma: no cover + from stack._vendor import tomli as tomllib # type: ignore + manifest_path = repo_root / "stacklets" / "messages" / "stacklet.toml" + try: + with manifest_path.open("rb") as f: + manifest = tomllib.load(f) + return int(manifest.get("ports", {}).get("synapse", 42031)) + except (OSError, tomllib.TOMLDecodeError, ValueError): + return 42031 + + +def _engine_script(repo_root: Path, engine_name: str) -> Path: + """Resolve the engine script path. 
Today only ``external-disk`` + exists; future engines slot in beside it under ``engines/``.""" + return repo_root / "stacklets" / "backup" / "engines" / engine_name / "sync.py" + + +def _print_summary(target_name: str, result: Optional[dict]) -> None: + """Echo a one-line summary to stdout for the user running interactively.""" + if result is None: + print(f" [{target_name}] engine crashed before writing a result") + return + outcome = "ok" if result.get("success") else "FAILED" + new_total = sum(s.get("new_files", 0) for s in result.get("sources", [])) + duration = int(result.get("duration_seconds", 0)) + print( + f" [{target_name}] {outcome} — {new_total} new files, " + f"{duration}s, vault_state={result.get('vault_state', '?')}" + ) + + +def run(args, stacklet, config): + """Entry point invoked by the framework via ``stack backup sync``. + + ``args`` is the list of unparsed arguments after the subcommand. + ``stacklet`` is the backup stacklet's discovered record. + ``config`` is the framework-supplied context dict (see Stack.run_cli_command). + """ + parsed = _parse_args(args or []) + + repo_root = Path(config.get("repo_root", ".")) + instance_dir = Path(config.get("instance_dir", repo_root)) + data_dir = Path(config.get("data_dir", ".")) + backup_data_dir = _resolve_backup_data_dir(config) + + targets = get_targets(config.get("stack", {})) + if not targets: + return { + "error": "No backup targets configured. Add a [backup.targets.] " + "block to stack.toml (see stack.example.toml)." + } + + sources = discover_archive_sources(repo_root, instance_dir, data_dir) + if not sources: + return { + "error": "No backup sources discovered. No enabled stacklet declares " + "[[backup.archive]] in its manifest." + } + + # Run each target sequentially. Each engine call overwrites + # history.jsonl, so we read the latest entry before invoking the next. 
+ target_results: list = [] + any_failed = False + + for target in targets: + engine_script = _engine_script(repo_root, target.engine) + if not engine_script.exists(): + print(f" [{target.name}] engine '{target.engine}' not found at {engine_script}", + file=sys.stderr) + target_results.append((target, None)) + any_failed = True + continue + + invoke_engine( + engine_script, backup_data_dir, + target, sources, parsed, + ) + result = read_latest_run(backup_data_dir) + target_results.append((target, result)) + + _print_summary(target.name, result) + if result is None or not result.get("success"): + any_failed = True + + # Post notification per target. Soft-fail: messaging problems + # are reported but don't mask the sync outcome. + if result is not None: + plain, html = format_notification(target.name, result) + notify_error = _post_notification(plain, html, config) + if notify_error: + print(f" [{target.name}] notification skipped: {notify_error}", + file=sys.stderr) + + if any_failed: + return {"error": "One or more targets failed; see history.jsonl for details"} + return { + "ok": True, + "targets": [t.name for t, _ in target_results], + "sources": len(sources), + } diff --git a/stacklets/backup/engines/external-disk/README.md b/stacklets/backup/engines/external-disk/README.md new file mode 100644 index 0000000..1a64c4e --- /dev/null +++ b/stacklets/backup/engines/external-disk/README.md @@ -0,0 +1,149 @@ +# external-disk engine + +Backs up stacklet data to an APFS-formatted external disk attached via +USB or Thunderbolt. The disk is mounted by macOS itself (auto-mount for +plain APFS, login-time Keychain unlock for encrypted) and the engine +writes to it as it finds it. + +The engine implements **append-only archive** semantics — in the +storage-industry vocabulary this is called WORM (Write Once Read Many). +The user-facing word is "archive"; "WORM" stays as the technical term +for engine internals and threat-model discussion. 
+ +## Guarantees + +This engine commits to three guarantees. If any of them cannot be +satisfied, the engine **refuses to run** rather than degrading silently. + +1. **Kernel-enforced immutability.** Every file on the destination has + the BSD `uchg` flag set. The kernel refuses modify, delete, rename, + and unlink, even to the owner. +2. **Append-only.** rsync runs with `--ignore-existing`. Existing files + on the destination are never touched. No `--delete`. +3. **Zero unlock window.** Files are never unlocked during a sync — + `--ignore-existing` skips them entirely, so the immutability flag + stays on. Only newly-written files get the flag applied afterward. + +Eject after sync is a best-effort bonus, not a guarantee. From an +interactive Terminal session, `diskutil eject` works and the disk goes +offline; from the scheduled cron context, eject is sandbox-blocked and +the disk stays mounted. The three guarantees above carry the append- +only contract on their own — eject just adds an extra "invisible to OS" +layer on top when it's available. + +## Why not network shares + +SMB and NFS mounts technically appear under `/Volumes/`, but they break +every one of the guarantees above: + +- `chflags uchg` is a BSD filesystem flag — not transmitted over SMB or + NFS. The kernel can't enforce immutability on a remote filesystem. +- The share stays reachable as long as the network is up; there's no + "physical disconnect" protection layer available. +- APFS encryption is irrelevant; the bytes live on the NAS, not here. + +If `external-disk` detects a non-APFS / non-HFS filesystem on the +destination, it aborts with a message pointing the user at the future +`restic` engine. NAS-based backup will use restic's own append-only +mode, which is enforced by the restic repo format and works over SFTP/ +SMB/S3 alike. 
+ +## Sandbox notes + +`diskutil` operations (mount, eject, unlock) are restricted by macOS TCC +when called from `cron`, `launchd`, or any binary that hasn't been +granted Full Disk Access. The fix is the `.app` wrapper: a minimal app +bundle whose only purpose is to receive the FDA grant. Cron invokes it +via `open /path/to/FamstackVaultSync.app`, which routes through the +proper macOS app lifecycle and inherits the FDA permission. + +`diskutil eject` from cron is sandbox-blocked even with FDA. The disk +stays mounted after scheduled runs (uchg flags still protect the data); +manual runs from Terminal eject normally. + +See `family-server/backup/docs/MACOS-SANDBOX-BACKUP-SCRIPT.md` for the +full history of approaches that were tried and failed. + +## Files + +| File | Status | Role | +|---|---|---| +| `sync.py` | shipping | the append-only sync (ported from family-server) | +| `restore.py` | pending | copy files back to a target path, remove `uchg` flags | + +The filesystem capability check lives inside `sync.py` as +`probe_filesystem()` rather than a separate file — it's one function +call, no separate process needed. + +## Input contract + +`sync.py` reads three environment variables. The orchestrator +(`cli/sync.py`) is responsible for populating them. + +| Variable | Purpose | +|---|---| +| `BACKUP_DATA_DIR` | Host-side state directory (canary, logs, result JSON). Refused if under `/Volumes/`. | +| `VAULT_DISK` | APFS volume name. Mount point is `/Volumes/<VAULT_DISK>`. | +| `SOURCES` | Newline-separated, pipe-delimited records: `<id>\|<display>\|<src_path>\|<vault_subdir>\|<min_files>` | + +Arguments are POSIX-style: `--dry-run`, `--no-eject`, `--verbose`, +`--verify`. + +## Output contract + +| Output | Always written? 
| Schema / purpose | +|---|---|---| +| stdout | yes | Human-readable progress (TTY-aware coloring) | +| stderr | on failure | Warnings + errors | +| `$BACKUP_DATA_DIR/logs/sync.log` | yes (best-effort) | Human-readable audit log | +| `$BACKUP_DATA_DIR/logs/history.jsonl` | yes — even on crash | One JSON object per run, append-only | +| Exit code | yes | `0` ok, `1` hard failure | + +### `history.jsonl` schema + +Each line is a self-contained JSON object representing one run. The +caller reads the latest run by tail-scanning the file (the last good +JSON line wins). New runs append; old runs are never modified. + +```json +{ + "success": true, + "dry_run": false, + "failure_reason": null, + "duration_seconds": 125, + "started_at": "2026-05-14T02:00:00Z", + "ended_at": "2026-05-14T02:02:05Z", + "run_context": "cron", + "run_user": "arthur", + "vault_disk": "backup-vault", + "vault_state": "mounted", + "vault_size": "8.2G", + "sources": [ + { + "id": "photos/library", + "display": "Photos", + "status": "ok", + "total_files": 48293, + "new_files": 12 + } + ] +} +``` + +`vault_state` is one of `mounted`, `ejected`, `not_connected`. The +orchestrator treats an empty or missing `history.jsonl` as "the engine +crashed before it could report" — distinct from a written failure run. + +Writes are atomic: each line is one `write()` syscall under the POSIX +PIPE_BUF limit (4KB), so concurrent appenders and crashed engines +can't interleave or truncate a record mid-line. + +## Testing + +`tests/stacklets/test_backup_engine.py` covers the pure-Python parts: +source parsing, canary creation and tampering, preflight thresholds, +filesystem capability classification, result-file shape. The +rsync/diskutil/eject flows need a real disk and aren't part of the +unit suite. 
+ +Run with: `uv run --extra test pytest tests/stacklets/test_backup_engine.py` diff --git a/stacklets/backup/engines/external-disk/sync.py b/stacklets/backup/engines/external-disk/sync.py new file mode 100755 index 0000000..434f8f3 --- /dev/null +++ b/stacklets/backup/engines/external-disk/sync.py @@ -0,0 +1,1046 @@ +#!/usr/bin/env python3 +"""external-disk engine — append-only backup of stacklet data. + +Ported from `family-server/backup/vault-sync.sh`. The control flow, exit +codes, and append-only contract are preserved; the implementation is +Python so it slots cleanly into a framework where every other layer +(hooks, CLI, lib) is also Python. + +Three adaptations from the bash original: + +1. Sources come from the ``$SOURCES`` env var (pipe-delimited records) + instead of a hardcoded list. The orchestrator (``cli/sync.py``) walks + enabled stacklets, gathers their ``[[backup.archive]]`` declarations, + and passes them in. + +2. The filesystem capability check is a function call + (:func:`probe_filesystem`) rather than a separate ``probe.sh``. + Refuses non-APFS/non-HFS+ targets so the kernel-immutability guarantee + can't silently degrade on SMB/NFS/exFAT. + +3. No Matrix notification. The engine appends one JSON object per run + to ``$BACKUP_DATA_DIR/logs/history.jsonl``; the caller reads the + latest entry and posts via ``stacker-bot``. Messaging is a separate + concern. + +Input (environment): + +================== ========================================================== +``BACKUP_DATA_DIR`` Required. The backup stacklet's own state directory + (canary file, audit log, run history, FDA .app + bundle). NOT the source data being backed up, NOT + the target vault disk. Follows the framework + convention {STACKLET}_DATA_DIR (cf. PAPERLESS_DATA_DIR). + Must NOT be under ``/Volumes/`` — the script refuses, + because that's where the framework's destroy cleanup + would not reach. +``VAULT_DISK`` Required. APFS volume name (e.g. ``backup-vault``). 
The + mount point is ``/Volumes/``. +``SOURCES`` Required. Newline-separated records, pipe-delimited:: + + |||| +``TZ`` Optional. Affects log timestamps. +================== ========================================================== + +Output: + +* stdout — human-readable progress. +* stderr — warnings and errors. +* ``$BACKUP_DATA_DIR/logs/history.jsonl`` — one JSON object per run, + append-only. Read the last good line for the latest run's outcome. +* ``$BACKUP_DATA_DIR/logs/sync.log`` — human-readable audit log. +* Exit code: 0 on success, 1 on hard failure. A run is always + appended to history, so the caller can distinguish "engine reported + a failure" from "engine crashed before it could report." +""" + +from __future__ import annotations + +import argparse +import json +import os +import subprocess +import sys +import time +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import List, Optional + + +# ── Constants ────────────────────────────────────────────────────────────── + +CANARY_STRING = "famstack-backup-canary-do-not-delete" + +# Filesystem types that honor BSD ``uchg`` flags. Anything else is refused. +APPLE_FILESYSTEMS = frozenset({"apfs", "hfs"}) + +# Filesystem types we recognize and refuse with a tailored message. +NETWORK_FILESYSTEMS = frozenset({"smbfs", "nfs", "afpfs"}) +REMOVABLE_FILESYSTEMS = frozenset({"msdos", "exfat", "ntfs"}) + +# rsync exit codes that aren't real failures for incremental backups: +# 0 = success, 23 = partial transfer (vanished files during sync), +# 24 = source files vanished. All acceptable in our context. +RSYNC_OK_CODES = frozenset({0, 23, 24}) + + +# ── Domain types ─────────────────────────────────────────────────────────── + +@dataclass +class Source: + """One ``[[backup.archive]]`` declaration, as seen by the engine.""" + + id: str # global identity, e.g. "photos/library" + display: str # human-readable label, e.g. 
"Photos" + src_path: Path # absolute path on the internal SSD + vault_subdir: str # relative path under /Volumes// + min_files: int # coarse ransomware guard threshold + + +@dataclass +class SourceResult: + """Per-source outcome after the sync.""" + + id: str + display: str + status: str # "ok" | "FAILED" | "skipped" + total_files: int # files on vault after this run + new_files: int # files added this run + + +@dataclass +class SyncResult: + """The structured per-run result. One of these is serialized to JSON + and appended as a single line to ``history.jsonl``.""" + + success: bool = True + dry_run: bool = False + failure_reason: Optional[str] = None + duration_seconds: int = 0 + started_at: str = "" + ended_at: str = "" + run_context: str = "unknown" # "Terminal" | "cron" | "launchd" + run_user: str = "" + vault_disk: str = "" + vault_state: str = "unknown" # "mounted" | "ejected" | "not_connected" + vault_size: str = "unknown" + sources: List[SourceResult] = field(default_factory=list) + + +class SyncAborted(Exception): + """A pipeline step refused to continue (canary, preflight, mount, probe). + The caller catches and records as ``failure_reason``.""" + + +class DriveNotConnected(SyncAborted): + """Specifically: diskutil cannot see the volume at all. Surfaced as + a distinct ``vault_state`` so the caller can render a useful + message ("plug your backup disk in") instead of a generic failure. + """ + + +# ── Output helpers ───────────────────────────────────────────────────────── + +# ANSI colors only when stdout is a TTY. Cron output is a file; escape +# sequences would clutter it. 
+_USE_COLOR = sys.stdout.isatty() + +GREEN = "\033[0;32m" if _USE_COLOR else "" +RED = "\033[0;31m" if _USE_COLOR else "" +YELLOW = "\033[0;33m" if _USE_COLOR else "" +BOLD = "\033[1m" if _USE_COLOR else "" +NC = "\033[0m" if _USE_COLOR else "" + + +def info(msg: str) -> None: + print(f" {GREEN}→{NC} {msg}") + + +def warn(msg: str) -> None: + print(f" {YELLOW}⚠{NC} {msg}") + + +def error(msg: str) -> None: + print(f" {RED}✗{NC} {msg}", file=sys.stderr) + + +def header(msg: str) -> None: + print() + print(f"{BOLD}{msg}{NC}") + + +def append_log(log_path: Path, message: str) -> None: + """Append a timestamped line to the audit log. Best-effort: if the + log directory can't be created, the message is dropped. We never + want a logging failure to take down the sync.""" + try: + log_path.parent.mkdir(parents=True, exist_ok=True) + ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + with log_path.open("a") as f: + f.write(f"[{ts}] {message}\n") + except OSError: + pass + + +# ── Source parsing ───────────────────────────────────────────────────────── + +def parse_sources(sources_env: str) -> List[Source]: + """Parse the ``$SOURCES`` env var into structured records. + + Records are newline-separated, fields pipe-delimited:: + + |||| + + Pipe over colon because paths can (rarely) contain colons on macOS + but never pipes. Empty input or malformed records raise + :class:`SyncAborted` — we'd rather fail loudly than silently skip + a misconfigured source. 
+ """ + records: List[Source] = [] + for raw in sources_env.splitlines(): + line = raw.strip() + if not line: + continue + parts = line.split("|") + if len(parts) != 5: + raise SyncAborted( + f"Malformed source record: {line!r} " + f"(expected 5 pipe-delimited fields, got {len(parts)})" + ) + id_, display, src_path, vault_subdir, min_files = parts + try: + min_files_int = int(min_files) + except ValueError: + raise SyncAborted(f"min_files must be an integer in {line!r}") + records.append(Source( + id=id_, + display=display, + src_path=Path(src_path), + vault_subdir=vault_subdir, + min_files=min_files_int, + )) + if not records: + raise SyncAborted("No sources provided — $SOURCES is empty.") + return records + + +# ── Number formatting ────────────────────────────────────────────────────── + +def format_number(n: int) -> str: + """Format an integer with dot thousands separators (e.g. ``48.293``). + + Why dot, not comma: a comma triggers phone-number linkification in + some chat clients (Element among them); a dot doesn't. Reads + European but is unambiguous everywhere. + """ + s = str(n) + if len(s) <= 3: + return s + parts: List[str] = [] + while len(s) > 3: + parts.insert(0, s[-3:]) + s = s[:-3] + parts.insert(0, s) + return ".".join(parts) + + +def count_files(path: Path) -> int: + """Count regular files under a directory, recursively. Returns 0 if + the path doesn't exist or isn't a directory — the caller decides + whether that constitutes a failure.""" + if not path.is_dir(): + return 0 + return sum(1 for p in path.rglob("*") if p.is_file()) + + +# ── Canary (ransomware tripwire) ─────────────────────────────────────────── + +def verify_canary(canary_file: Path) -> None: + """Verify the canary's content. The engine never creates it. + + The canary is a small file on the internal SSD with known content. + If ransomware has encrypted any part of the data hierarchy, the + content won't match and we abort before touching the vault. 
def preflight_check_sources(sources: List[Source]) -> None:
    """Refuse to sync unless every source looks populated.

    A source passes when its directory exists and holds at least
    ``min_files`` regular files (as counted by :func:`count_files`).
    Every source is inspected and reported, so one run shows all
    problems at once; only afterwards is the sync aborted.

    This complements the canary: the canary notices files encrypted in
    place, this notices a directory that has been emptied or removed.

    Raises:
        SyncAborted: if at least one source is missing or too small.
    """
    header("Preflight checks")

    rejected: List[str] = []
    for source in sources:
        label = source.display
        root = source.src_path

        if root.is_dir():
            found = count_files(root)
            if found >= source.min_files:
                info(
                    f"{label}: {format_number(found)} files "
                    f"(minimum: {source.min_files}) — ok"
                )
                continue
            error(
                f"{label}: only {found} files "
                f"(minimum: {source.min_files}) — refusing to sync"
            )
        else:
            error(f"{label}: source directory not found ({root})")

        rejected.append(label)

    if not rejected:
        return
    raise SyncAborted(
        "Preflight failed — source directories missing or too few files"
    )
def _apfs_volume_name(line: str) -> str:
    """Return the volume name if ``line`` is a ``Name:`` row, else ``""``.

    Expects rows shaped like the sample quoted in
    :func:`_is_filevault_encrypted`: optional tree-drawing characters,
    the ``Name:`` label, the name, and an optional case-sensitivity
    suffix in parentheses.
    """
    prefix, found, value = line.partition("Name:")
    # Only tree-drawing characters may precede the label; this rejects
    # rows that merely contain "Name:" somewhere further along.
    if not found or prefix.strip(" \t|+->"):
        return ""
    name = value.strip()
    for suffix in (" (Case-insensitive)", " (Case-sensitive)"):
        if name.endswith(suffix):
            return name[: -len(suffix)]
    return name


def _is_filevault_encrypted(vault_disk: str) -> bool:
    """Inspect ``diskutil apfs list`` to detect FileVault encryption.

    Snippet we parse::

        |   Name:          backup-vault (Case-insensitive)
        |   Mount Point:   ...
        |   ...
        |   FileVault:     Yes

    The volume name and its FileVault flag appear within a few lines of
    each other, so we look at the six lines starting at the name row.

    The name row is matched exactly first. A plain substring test would
    let an unrelated volume whose name merely contains ``vault_disk``
    (say ``backup-vault-old``), listed earlier, decide the answer. If no
    exact ``Name:`` row is found we fall back to the first line that
    contains ``vault_disk`` anywhere, which is the previous behaviour.

    Returns:
        ``True`` if the selected window has a ``FileVault:`` row that
        contains ``Yes``; ``False`` otherwise, including when
        ``diskutil`` fails or the volume is not listed.
    """
    result = subprocess.run(
        ["diskutil", "apfs", "list"],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        return False

    lines = result.stdout.splitlines()
    start = next(
        (i for i, line in enumerate(lines)
         if _apfs_volume_name(line) == vault_disk),
        None,
    )
    if start is None:
        # Backward-compatible fallback: first line mentioning the value.
        start = next(
            (i for i, line in enumerate(lines) if vault_disk in line),
            None,
        )
    if start is None:
        return False

    return any(
        "FileVault:" in follow and "Yes" in follow
        for follow in lines[start:start + 6]
    )
def probe_filesystem(mount_point: Path) -> None:
    """Refuse if the mounted filesystem doesn't honor BSD ``uchg`` flags.

    Without this guard the engine would happily write to SMB, NFS, or
    exFAT and silently downgrade its append-only contract to "files we
    hope nobody touches." The live filesystem type comes from
    :func:`_stat_fs_type`, which reads it from ``mount`` output; we
    trust the answer.

    Args:
        mount_point: Where the vault is expected to be mounted.

    Raises:
        SyncAborted: if the type cannot be determined (nothing mounted
            at ``mount_point``), or the filesystem is a network share, a
            removable-media format, or anything else not known to honor
            ``chflags uchg``.
    """
    header("Filesystem capability")

    fs_type = _stat_fs_type(mount_point)

    if not fs_type:
        # An empty type means "not mounted / unknown". Say so instead of
        # telling the user that filesystem '' is unsupported.
        error(f"Could not determine the filesystem type of {mount_point}.")
        error(" Is the backup disk mounted? Check 'mount' for the volume.")
        raise SyncAborted(
            f"Could not determine filesystem type for {mount_point}"
        )

    if fs_type in APPLE_FILESYSTEMS:
        info("Vault filesystem honors BSD uchg — append-only contract enforceable")
        return

    if fs_type in NETWORK_FILESYSTEMS:
        error(
            f"Vault filesystem is {fs_type!r} — network shares do not honor "
            "BSD immutability flags."
        )
        error(" The external-disk engine enforces kernel-level append-only via chflags uchg,")
        error(" which only works on APFS or HFS+ on attached storage.")
        error(" For network/NAS backup, wait for the 'restic' engine (planned).")
        raise SyncAborted(
            f"Vault filesystem {fs_type!r} does not support BSD immutability flags"
        )

    if fs_type in REMOVABLE_FILESYSTEMS:
        error(f"Vault filesystem is {fs_type!r} — does not support BSD immutability flags.")
        error(" Reformat the disk as APFS (Disk Utility → Erase) to use this engine.")
        raise SyncAborted(
            f"Vault filesystem {fs_type!r} does not support BSD immutability flags"
        )

    error(f"Vault filesystem {fs_type!r} is not on the supported list (apfs, hfs).")
    error(" If you believe this filesystem honors chflags uchg, open an issue.")
    raise SyncAborted(f"Vault filesystem {fs_type!r} not supported")
def sync_data(
    sources: List[Source],
    mount_point: Path,
    log_path: Path,
    dry_run: bool,
    verbose: bool,
) -> List[SourceResult]:
    """rsync each source into its vault subdirectory, then lock new files.

    The append-only contract holds because of two cooperating flags:

    * ``--ignore-existing``: rsync skips files already on the vault.
      They keep their ``uchg`` flag from a prior run — never unlocked,
      never overwritten.
    * ``chflags uchg``: applied only to files that lack the flag after
      rsync. Zero unlock window for existing data.

    The lock pass runs after every successful real sync, not only when
    new files were counted. ``find ! -flags +uchg`` selects nothing when
    everything is already locked, and running it unconditionally lets a
    later run repair files that an earlier, failed lock left unprotected.

    A failed lock is a failed source: the files were copied but are not
    immutable, so the source is reported as ``FAILED`` (with its real
    counts) and the ``find``/``chflags`` stderr goes to the audit log.

    rsync exit codes 23 (partial transfer / vanished files) and 24
    (vanished source files) are acceptable for incremental backups —
    they happen when source files move during a sync. Any other
    non-zero exit is a real failure.

    rsync stderr goes to the audit log so the terminal stays tidy but
    the failure trail survives. ``--stats`` output still goes to stdout
    for the user to read.

    Args:
        sources: Sources to copy, in order.
        mount_point: Root of the mounted vault.
        log_path: Audit log that receives rsync's stderr.
        dry_run: Pass ``--dry-run`` to rsync; create and lock nothing.
        verbose: Pass ``-v`` to rsync.

    Returns:
        One :class:`SourceResult` per source, in input order.
    """
    header("Syncing data")

    results: List[SourceResult] = []
    for src in sources:
        dest = mount_point / src.vault_subdir
        before_count = count_files(dest) if (dest.is_dir() and not dry_run) else 0

        if not dry_run:
            dest.mkdir(parents=True, exist_ok=True)

        info(f"{src.display}: {src.src_path}/ → {dest}/")

        rsync_flags = [
            "-a",
            "--ignore-existing",
            "--stats",
            "--exclude=.DS_Store",
            "--exclude=.Spotlight-V100",
            "--exclude=.fseventsd",
            "--exclude=.Trashes",
            "--exclude=._*",
        ]
        if dry_run:
            rsync_flags.append("--dry-run")
        if verbose:
            rsync_flags.append("-v")

        try:
            log_path.parent.mkdir(parents=True, exist_ok=True)
            with log_path.open("a") as log_file:
                rsync = subprocess.run(
                    ["/usr/bin/rsync", *rsync_flags,
                     f"{src.src_path}/", f"{dest}/"],
                    stderr=log_file,
                )
        except FileNotFoundError:
            error(f"{src.display}: /usr/bin/rsync not found")
            results.append(SourceResult(src.id, src.display, "FAILED", 0, 0))
            continue

        if rsync.returncode not in RSYNC_OK_CODES:
            error(f"{src.display}: rsync failed (exit {rsync.returncode})")
            results.append(SourceResult(src.id, src.display, "FAILED", 0, 0))
            continue

        after_count = before_count if dry_run else count_files(dest)
        new_count = after_count - before_count

        # Lock every file that lacks the flag. Existing locked files
        # weren't touched (--ignore-existing), so their uchg flag
        # survives. BSD find: `! -flags +uchg` matches files lacking it.
        status = "ok"
        if not dry_run:
            lock = subprocess.run(
                ["find", str(dest), "-type", "f",
                 "!", "-flags", "+uchg",
                 "-exec", "chflags", "uchg", "{}", "+"],
                capture_output=True,
            )
            if lock.returncode != 0:
                # find exits non-zero when any chflags invocation fails;
                # never claim files are locked when they are not.
                status = "FAILED"
                detail = lock.stderr.decode(errors="replace").strip()
                error(
                    f"{src.display}: could not lock files with chflags uchg "
                    f"(exit {lock.returncode})"
                )
                append_log(
                    log_path,
                    f"{src.display}: chflags uchg failed: {detail}",
                )
            elif new_count > 0:
                info(f"{src.display}: locked {format_number(new_count)} new files")

        results.append(SourceResult(
            id=src.id,
            display=src.display,
            status=status,
            total_files=after_count,
            new_files=new_count,
        ))
        if status == "ok":
            info(
                f"{src.display}: done "
                f"({format_number(after_count)} total, "
                f"{format_number(new_count)} new)"
            )

    return results
def _get_parent_disk(vault_disk: str) -> str:
    """Look up the whole disk a volume belongs to (``disk5s1`` → ``disk5``).

    Reads the ``Part of Whole`` row of ``diskutil info`` instead of
    trimming the identifier by hand.

    Returns:
        The text after the first colon of the first matching row,
        stripped; ``""`` when ``diskutil`` fails or prints no such row.
    """
    completed = subprocess.run(
        ["diskutil", "info", vault_disk],
        capture_output=True,
        text=True,
    )
    if completed.returncode:
        return ""

    parents = (
        row.split(":", 1)[1].strip()
        for row in completed.stdout.splitlines()
        if "Part of Whole" in row
    )
    return next(parents, "")
def read_latest_run(history_path: Path) -> Optional[dict]:
    """Return the most recent run from ``history.jsonl``, or ``None``.

    Scans the whole file and returns the last line that parses to a
    JSON *object*. This tolerates damage anywhere in the file:

    * unparseable lines (e.g. a crash mid-write) are skipped;
    * parseable lines that are not objects (``null``, a number, a
      list) are skipped too, so they cannot replace the last good run
      or break the ``Optional[dict]`` contract;
    * undecodable bytes are replaced rather than raised, so one corrupt
      line cannot hide every run before it.

    At realistic sizes (~500 bytes per run, ~200 KB per year of
    nightly runs) reading the whole file is instant. Reverse-seek is
    overengineering until the file outgrows that, which it won't soon.

    Args:
        history_path: Path to the JSONL history file.

    Returns:
        The last run record as a dict, or ``None`` if the file is
        missing, unreadable, or holds no valid record.
    """
    latest: Optional[dict] = None
    try:
        # A missing file raises FileNotFoundError, an OSError, so no
        # separate exists() check is needed.
        with history_path.open(encoding="utf-8", errors="replace") as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue
                try:
                    record = json.loads(stripped)
                except json.JSONDecodeError:
                    # Corrupted line — skip and keep scanning. The last
                    # good line wins.
                    continue
                if isinstance(record, dict):
                    latest = record
    except OSError:
        return None
    return latest
def run_sync(
    backup_data_dir: Path,
    vault_disk: str,
    sources_env: str,
    args: argparse.Namespace,
) -> int:
    """Top-level pipeline. Returns the process exit code (0 ok, 1 fail).

    Always appends to ``history.jsonl``, even when an exception
    interrupts the pipeline halfway through. The caller (orchestrator)
    treats an empty/missing history file as "the engine crashed before
    it could report" — distinct from "the engine reported a failure."

    An interruption (``KeyboardInterrupt``, ``SystemExit``) is recorded
    as a failed run and then re-raised. Without that, the ``finally``
    block would write the record with ``success`` still at its initial
    value, which this function otherwise only ever lowers to ``False``.

    When individual sources fail, ``failure_reason`` names them, so a
    failed run never carries an empty reason.

    The canary at ``BACKUP_DATA_DIR/canary`` is expected to exist (planted
    by ``on_install``). Missing canary = abort.

    Args:
        backup_data_dir: Directory holding ``logs/`` and the canary.
        vault_disk: Volume name; the vault is expected at
            ``/Volumes/<vault_disk>``.
        sources_env: Raw ``$SOURCES`` value, parsed by
            :func:`parse_sources`.
        args: Parsed CLI flags (``dry_run``, ``no_eject``, ``verbose``,
            ``verify``).
    """
    mount_point = Path("/Volumes") / vault_disk
    log_path = backup_data_dir / "logs" / "sync.log"
    history_path = backup_data_dir / "logs" / "history.jsonl"
    canary_file = backup_data_dir / "canary"

    result = SyncResult(
        dry_run=args.dry_run,
        run_context=detect_run_context(),
        run_user=os.environ.get("USER", ""),
        vault_disk=vault_disk,
        started_at=datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    )
    started_at_seconds = time.time()
    drive_not_connected_flag = False

    append_log(
        log_path,
        f"Starting sync (dry_run={args.dry_run}, no_eject={args.no_eject})",
    )

    try:
        sources = parse_sources(sources_env)

        print()
        print(f"{BOLD}═══ external-disk sync — vault: {vault_disk} ═══{NC}")
        if args.dry_run:
            print(f" {YELLOW}DRY RUN — no changes will be made{NC}")

        verify_canary(canary_file)
        preflight_check_sources(sources)
        mount_vault(vault_disk, mount_point, args.dry_run)
        if not args.dry_run:
            probe_filesystem(mount_point)
        check_vault_space(mount_point)

        result.sources = sync_data(
            sources, mount_point, log_path, args.dry_run, args.verbose
        )
        failed = [r.display for r in result.sources if r.status == "FAILED"]
        if failed:
            result.success = False
            result.failure_reason = "Sync failed for: " + ", ".join(failed)

        # Measure vault size BEFORE eject — once the disk is ejected the
        # mount point is gone and du has nothing to look at.
        result.vault_size = measure_vault_size(mount_point)

        if args.verify:
            verify_sync(sources, mount_point)

        eject_vault(vault_disk, args.dry_run, args.no_eject)

    except DriveNotConnected as e:
        drive_not_connected_flag = True
        result.success = False
        result.failure_reason = str(e)
        append_log(log_path, f"ABORTED: {e}")
    except SyncAborted as e:
        result.success = False
        result.failure_reason = str(e)
        append_log(log_path, f"ABORTED: {e}")
    except Exception as e:
        result.success = False
        result.failure_reason = f"Unexpected error: {e}"
        append_log(log_path, f"ERROR: {e}")
    except BaseException as e:
        # Ctrl-C / SystemExit: not an Exception, so it skips the handler
        # above. Record the run as failed, then let it propagate.
        result.success = False
        result.failure_reason = f"Interrupted: {type(e).__name__}"
        append_log(log_path, f"INTERRUPTED: {type(e).__name__}")
        raise
    finally:
        result.ended_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        result.duration_seconds = int(time.time() - started_at_seconds)
        result.vault_state = detect_vault_state(mount_point, drive_not_connected_flag)
        if result.vault_size == "unknown":
            result.vault_size = measure_vault_size(mount_point)
        append_to_history(result, history_path)
        append_log(log_path, f"Sync finished (success={result.success})")

    print()
    if result.success:
        print(f"{GREEN}Sync completed successfully.{NC}")
        return 0
    print(f"{RED}Sync completed with errors.{NC}")
    return 1
+ backup_data_dir = Path(os.environ["BACKUP_DATA_DIR"]) + vault_disk = os.environ["VAULT_DISK"] + except KeyError as e: + missing = e.args[0] + print(f"sync: required environment variable {missing} is not set", + file=sys.stderr) + return 1 + + sources_env = os.environ.get("SOURCES", "") + + if str(backup_data_dir).startswith("/Volumes/"): + print( + f"sync: BACKUP_DATA_DIR ({backup_data_dir}) must not be under " + "/Volumes/. Logs and result state belong on the internal SSD.", + file=sys.stderr, + ) + return 1 + + return run_sync(backup_data_dir, vault_disk, sources_env, args) + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/stacklets/backup/hooks/on_configure.py b/stacklets/backup/hooks/on_configure.py new file mode 100644 index 0000000..f717ae0 --- /dev/null +++ b/stacklets/backup/hooks/on_configure.py @@ -0,0 +1,316 @@ +"""on_configure — interactive setup for the backup stacklet. + +Runs once on first ``stack up backup``. Idempotent — if the target +config already exists in stack.toml, we treat that as "already +configured" and skip the wizard. + +Walks the user through, in order: + +1. Picking the vault disk (default name ``backup-vault``). The disk + must already be attached and mounted; we don't try to format it. +2. Detecting whether the disk is APFS-encrypted. If so, walk through + the macOS Keychain setup so future runs can unlock unattended. The + user is warned that scheduled syncs after a reboot may need a manual + unlock until they next log in. +3. Picking the nightly schedule (``HH:MM`` form, converted to a 5-field + cron expression). +4. Refusing to set ``BACKUP_DATA_DIR`` under ``/Volumes/`` (a defensive + measure against the framework's destroy cleanup reaching external + storage). +5. Writing ``[backup.targets.vault]`` to stack.toml via the narrow + ``_config.write_target`` helper. The framework picks it up on the + next read. + +The .app/FDA/cron install happens in on_install, which runs after this. 
def run(ctx):
    """Hook entry: walk the user through first-time backup configuration.

    Does nothing when ``stack.toml`` already has the target, and refuses
    to start without an interactive terminal. Otherwise it checks
    ``BACKUP_DATA_DIR``, then asks for the disk, verifies its
    filesystem, handles encryption, asks for the schedule, and finally
    writes the target to ``stack.toml``.

    ``BACKUP_DATA_DIR`` is validated before the first prompt. The
    encryption step can store a passphrase in the Keychain, so a
    configuration that is going to be refused must be refused before
    any of that happens, not after the user has answered every question.

    Args:
        ctx: Hook context; ``ctx.stack.instance_dir``, ``ctx.stack.config``
            and ``ctx.step`` are used.

    Raises:
        RuntimeError: without a TTY, or when any step refuses.
    """
    instance_dir = Path(ctx.stack.instance_dir)
    stack_toml = instance_dir / "stack.toml"

    if read_target(stack_toml, TARGET_NAME) is not None:
        ctx.step(f"Target '{TARGET_NAME}' already configured in stack.toml")
        return

    if not sys.stdin.isatty():
        raise RuntimeError(
            "Backup configuration requires interactive input. "
            "Add [backup.targets.vault] to stack.toml or run "
            "'stack up backup' from a terminal."
        )

    # Fail fast: needs nothing from the user and has no side effects.
    _validate_backup_data_dir(ctx)

    _explain()
    disk_name = _ask_disk_name()
    _verify_mounted_and_apfs(disk_name)
    _handle_encryption(disk_name)
    schedule = _ask_schedule()

    write_target(stack_toml, TARGET_NAME, {
        "engine": "external-disk",
        "disk": disk_name,
        "schedule": schedule,
    })

    nl()
    done(f"Target '{TARGET_NAME}' written to stack.toml")
    dim(" Next: on_install will install the FDA-granted .app wrapper and the cron entry.")
    nl()
def _verify_mounted_and_apfs(disk_name: str) -> None:
    """Reject a disk whose filesystem can't enforce immutability.

    Done at configure time on purpose: the engine probes the filesystem
    again before syncing, but stopping here means a target that could
    only fail on its first sync never gets written to the config.

    Raises:
        RuntimeError: if the filesystem at ``/Volumes/<disk_name>`` is
            neither ``apfs`` nor ``hfs`` (an undetermined type is
            reported as ``'unknown'``).
    """
    bold("Step 2 — Filesystem check")

    detected = _stat_fs_type(Path("/Volumes") / disk_name)
    if detected not in ("apfs", "hfs"):
        raise RuntimeError(
            f"Filesystem {detected or 'unknown'!r} on /Volumes/{disk_name} doesn't honor "
            "BSD immutability flags. Reformat as APFS (Disk Utility → Erase) "
            "or use a different disk."
        )

    done(f"Filesystem is {detected} — kernel immutability available")
+ """ + result = subprocess.run( + ["mount"], + capture_output=True, text=True, + ) + if result.returncode != 0: + return "" + target = str(mount_point) + for line in result.stdout.splitlines(): + head_split = line.split(" on ", 1) + if len(head_split) != 2: + continue + rest = head_split[1] + if not rest.startswith(f"{target} ("): + continue + paren_start = rest.find("(") + 1 + paren_end = rest.rfind(")") + if paren_end <= paren_start: + continue + attrs = rest[paren_start:paren_end] + return attrs.split(",")[0].strip().lower() + return "" + + +def _handle_encryption(disk_name: str) -> None: + bold("Step 3 — Encryption") + if not _is_encrypted(disk_name): + out("Disk is plain APFS (not encrypted).") + dim(" Encryption protects against physical drive theft only.") + dim(" The uchg + offline-eject layers handle the ransomware threat") + dim(" model on their own. Plain APFS is the recommended default.") + nl() + return + + warn(f"Disk '{disk_name}' is APFS-encrypted.") + dim(" Encryption is supported but adds two operational costs:") + dim(" 1. The passphrase must live in your macOS Keychain.") + dim(" 2. 
def _apfs_volume_name(line: str) -> str:
    """Return the volume name if ``line`` is a ``Name:`` row, else ``""``.

    Expects ``diskutil apfs list`` rows made of optional tree-drawing
    characters, the ``Name:`` label, the name, and an optional
    case-sensitivity suffix in parentheses.
    """
    prefix, found, value = line.partition("Name:")
    # Only tree-drawing characters may precede the label; this rejects
    # rows that merely contain "Name:" somewhere further along.
    if not found or prefix.strip(" \t|+->"):
        return ""
    name = value.strip()
    for suffix in (" (Case-insensitive)", " (Case-sensitive)"):
        if name.endswith(suffix):
            return name[: -len(suffix)]
    return name


def _is_encrypted(disk_name: str) -> bool:
    """Report whether ``diskutil apfs list`` marks the volume as FileVault.

    The volume's ``Name:`` row is matched exactly first, so a different
    volume whose name merely contains ``disk_name`` cannot decide the
    answer. If no exact row exists, the first line containing
    ``disk_name`` anywhere is used, which is the previous behaviour.
    The six lines starting at the selected row are then searched for a
    ``FileVault:`` row containing ``Yes``.

    Returns:
        ``True`` when such a row is found; ``False`` otherwise,
        including when ``diskutil`` fails or the volume is not listed.
    """
    result = subprocess.run(
        ["diskutil", "apfs", "list"],
        capture_output=True, text=True,
    )
    if result.returncode != 0:
        return False

    lines = result.stdout.splitlines()
    start = next(
        (i for i, line in enumerate(lines)
         if _apfs_volume_name(line) == disk_name),
        None,
    )
    if start is None:
        # Backward-compatible fallback: first line mentioning the value.
        start = next(
            (i for i, line in enumerate(lines) if disk_name in line),
            None,
        )
    if start is None:
        return False

    return any(
        "FileVault:" in follow and "Yes" in follow
        for follow in lines[start:start + 6]
    )
def _parse_time_to_cron(time_str: str) -> str:
    """Convert ``HH:MM`` (24-hour clock) to a 5-field daily cron expression.

    Surrounding whitespace is ignored and a single-digit hour is
    accepted (``3:30``). Only ASCII digits count as digits, so the
    accepted inputs are exactly the documented ``HH:MM`` form.

    Returns the cron expression ``"<minute> <hour> * * *"``, or an empty
    string when the input is not a valid time.
    """
    # fullmatch + re.ASCII: the whole string must be the time, and \d / \s
    # are limited to ASCII so look-alike Unicode digits are rejected.
    match = re.fullmatch(r"\s*(\d{1,2}):(\d{2})\s*", time_str, flags=re.ASCII)
    if match is None:
        return ""
    hour, minute = (int(part) for part in match.groups())
    if hour > 23 or minute > 59:
        return ""
    return f"{minute} {hour} * * *"
The default ``{data_dir}/backup`` is safe; warn loudly + if someone overrode it to a /Volumes/ path.""" + env_defaults = ctx.stack.config.get("env", {}).get("defaults", {}) + backup_dir_template = env_defaults.get("BACKUP_DATA_DIR", "") + if backup_dir_template.startswith("/Volumes/"): + raise RuntimeError( + "BACKUP_DATA_DIR points at /Volumes/ — refused for safety. " + "Logs, canary, and result state must live on the internal SSD " + "so the framework's destroy cleanup can't reach external storage." + ) diff --git a/stacklets/backup/hooks/on_destroy.py b/stacklets/backup/hooks/on_destroy.py new file mode 100644 index 0000000..e7add61 --- /dev/null +++ b/stacklets/backup/hooks/on_destroy.py @@ -0,0 +1,91 @@ +"""on_destroy — tear down host-side state. NEVER touches existing +backup data. + +The framework calls on_stop FIRST during destroy, then on_destroy. The +cron entry is removed in on_stop; this hook removes it AGAIN as a +defensive measure. Both removals are idempotent (no-op if the entry is +already gone) so the double-call has no cost and protects against +on_stop having been skipped, failed, or never run. + +What this hook removes (regenerable host-side state): + - The cron entry installed by on_install (defensive re-removal) + - The FamstackVaultSync.app bundle — the framework's data-dir + cleanup will sweep it after this hook, but removing it explicitly + here makes the destroy-time summary accurate + +What is explicitly preserved: + - The vault disk and every file on it. The whole point of an + append-only archive is that it outlives the system that wrote it. + A user who runs ``stack destroy backup`` is uninstalling the + backup tooling, not asking us to wipe their photo history. + - The macOS Keychain entry for the disk passphrase (encrypted vaults + only). The user may want manual disk access after uninstall. + +The vault is on ``/Volumes/``, not under BACKUP_DATA_DIR — the +framework's data-dir cleanup never reaches it. 
def run(ctx):
    """Tear down regenerable host-side state; never touch vault data.

    Removes every backup cron entry (a defensive repeat of what on_stop
    does) and the FDA wrapper ``.app`` under ``BACKUP_DATA_DIR``, then
    prints what was deliberately preserved and how to remove it by hand.

    Raises:
        RuntimeError: if the crontab could not be edited. The message
            tells the user which lines to delete manually; the original
            error is chained as the cause.
    """
    backup_data_dir = Path(ctx.env["BACKUP_DATA_DIR"])

    # Defensive cron sweep — should already be empty after on_stop. Use
    # the wildcard sweep rather than a per-target loop because by destroy
    # time there may be no target config left to iterate.
    try:
        removed = cron.remove_all_entries()
    except RuntimeError as e:
        # Loud failure: a stale cron entry pointing at a deleted .app
        # must not be left behind silently.
        raise RuntimeError(
            f"Could not remove backup cron entries: {e}\n"
            f"Run 'crontab -e' and delete any line containing 'famstack-backup-'."
        ) from e
    if removed:
        done(f"Removed {removed} stale backup cron entr{'y' if removed == 1 else 'ies'}")

    # Remove the .app bundle explicitly so the summary below reflects
    # what actually happened. rmtree is best-effort (ignore_errors), so
    # check the result before claiming success.
    app_path = backup_data_dir / APP_BUNDLE_NAME
    if app_path.is_dir():
        shutil.rmtree(app_path, ignore_errors=True)
        if app_path.exists():
            out(f"Could not fully remove app bundle: {app_path}")
        else:
            done(f"Removed app bundle: {app_path}")

    nl()
    out("Preserved:")
    out(" • Vault disk contents — append-only archive outlives the tooling.")
    out(" To wipe (only if you're sure): plug in the disk, then")
    dim(" sudo chflags -R nouchg /Volumes/<disk>/data && rm -rf /Volumes/<disk>/data")
    out(" • Keychain passphrase entry (encrypted vaults). Remove with:")
    dim(" security delete-generic-password -a '<volume-uuid>'")
    out(" • Full Disk Access entry in System Settings (orphaned — remove")
    out(" manually via Privacy & Security → Full Disk Access → -)")
    nl()
def run(ctx):
    """Install the FDA wrapper app, plant the canary, and add the cron entry.

    Steps, in order: create ``BACKUP_DATA_DIR`` and its ``logs``
    directory, plant the canary, generate the ``.app`` bundle, walk the
    user through the Full Disk Access grant (or print manual
    instructions when stdin is not a TTY), and install the cron entry
    that launches the bundle via ``open``.

    Raises:
        RuntimeError: if the target is missing from stack.toml, or if
            the cron entry could not be installed (the message contains
            the exact crontab line to add by hand; the original error is
            chained as the cause).
    """
    import shlex

    instance_dir = Path(ctx.stack.instance_dir)
    repo_root = Path(ctx.stack.root)
    backup_data_dir = Path(ctx.env["BACKUP_DATA_DIR"])

    target = read_target(instance_dir / "stack.toml", TARGET_NAME)
    if target is None:
        raise RuntimeError(
            f"Target '{TARGET_NAME}' is not in stack.toml. "
            "Did on_configure run successfully?"
        )

    section("Backup install", f"FDA wrapper + cron entry for target '{TARGET_NAME}'")
    nl()

    # 1. Make the directories the engine and the .app both need.
    backup_data_dir.mkdir(parents=True, exist_ok=True)
    (backup_data_dir / "logs").mkdir(parents=True, exist_ok=True)

    # 2. Plant the canary. plant_canary leaves an existing file alone,
    #    so re-running install does not clobber it.
    plant_canary(backup_data_dir)

    # 3. Generate the .app bundle.
    app_path = generate_app_bundle(
        target_dir=backup_data_dir,
        stack_executable=repo_root / "stack",
        log_path=backup_data_dir / "logs" / "cron.log",
    )
    done(f"App bundle: {app_path}")
    nl()

    # 4. Walk the user through the FDA grant.
    if sys.stdin.isatty():
        _fda_walkthrough(app_path)
    else:
        warn("Non-interactive install — Full Disk Access must be granted manually:")
        out(f" System Settings → Privacy & Security → Full Disk Access → + → {app_path}")
        nl()

    # 5. Install the cron entry. The path is shell-quoted because cron
    #    hands the command to a shell; an unquoted path containing a
    #    space would be split into several arguments to `open`.
    #    shlex.quote leaves ordinary paths unchanged.
    schedule = target.get("schedule", "0 2 * * *")
    cron_command = f"open {shlex.quote(str(app_path))}"
    try:
        changed = cron.install_entry(schedule, cron_command, TARGET_NAME)
    except RuntimeError as e:
        raise RuntimeError(
            f"Cron install failed: {e}\n"
            f"Add this line to your crontab manually (crontab -e):\n"
            f" {schedule} {cron_command} # {cron.marker_for(TARGET_NAME)}"
        ) from e

    if changed:
        done(f"Cron entry installed: {schedule} → {app_path}")
    else:
        done("Cron entry already up to date")
    nl()

    bold("Setup complete.")
    out("Run 'stack backup sync' to test now (manual run also tries to eject).")
    out("The scheduled run fires nightly per the cron entry. Disk stays")
    out("mounted between scheduled runs (sandbox blocks eject from cron);")
    out("files are protected by chflags uchg.")
    nl()
def _wrapper_script(stack_executable: Path, log_path: Path) -> str:
    """Return the bash source of the .app's executable.

    The script appends the output of ``<stack_executable> backup sync``
    to *log_path*, so a scheduled run that went wrong leaves a trail the
    user can inspect without trawling Console.app.

    Both paths are embedded as bash double-quoted strings with the
    characters that stay special inside double quotes escaped, so a path
    containing ``$``, a backtick, ``"`` or a backslash is taken
    literally. For ordinary paths the output is unchanged.
    """
    return (
        "#!/bin/bash\n"
        "# Auto-generated by stacklets/backup/hooks/on_install.py — do not edit.\n"
        "# Invoked by the cron entry: `open <app>`.\n"
        f"LOG={_bash_double_quoted(log_path)}\n"
        'mkdir -p "$(dirname "$LOG")"\n'
        f'exec {_bash_double_quoted(stack_executable)} backup sync >> "$LOG" 2>&1\n'
    )


def _bash_double_quoted(value) -> str:
    """Render *value* as a bash double-quoted word with specials escaped."""
    # Inside double quotes bash still interprets \ " $ and `; prefix each
    # with a backslash so the text is taken literally.
    escapes = str.maketrans({ch: "\\" + ch for ch in '\\"$`'})
    return '"' + str(value).translate(escapes) + '"'
def run(ctx):
    """Ensure the nightly cron entry exists and matches stack.toml.

    Reads the target from stack.toml and (re)installs the cron entry
    that launches the FDA wrapper ``.app`` via ``open``. Reports through
    ``ctx.step`` and returns early when the target is not configured or
    the ``.app`` bundle is missing.

    Raises:
        RuntimeError: if the cron entry could not be installed. The
            message contains the exact crontab line to add by hand; the
            original error is chained as the cause.
    """
    import shlex

    instance_dir = Path(ctx.stack.instance_dir)
    backup_data_dir = Path(ctx.env["BACKUP_DATA_DIR"])

    target = read_target(instance_dir / "stack.toml", TARGET_NAME)
    if target is None:
        # No target configured. Say so instead of skipping silently,
        # otherwise the user has no scheduled run and no warning.
        ctx.step(f"No [backup.targets.{TARGET_NAME}] in stack.toml — skipping cron install")
        return

    app_path = backup_data_dir / APP_BUNDLE_NAME
    if not app_path.is_dir():
        # The .app is created by on_install. If it is gone, surface the
        # problem rather than installing a cron entry that launches nothing.
        ctx.step(
            f"App bundle missing at {app_path}. "
            f"Run 'stack destroy backup && stack up backup' to reinstall."
        )
        return

    schedule = target.get("schedule", "0 2 * * *")
    # Shell-quote the path: cron hands the command to a shell, and an
    # unquoted path containing a space would be split into several
    # arguments to `open`. shlex.quote leaves ordinary paths unchanged.
    cron_command = f"open {shlex.quote(str(app_path))}"
    try:
        changed = cron.install_entry(schedule, cron_command, TARGET_NAME)
    except RuntimeError as e:
        raise RuntimeError(
            f"Cron install failed: {e}\n"
            f"Add this line to your crontab manually (crontab -e):\n"
            f" {schedule} {cron_command} # {cron.marker_for(TARGET_NAME)}"
        ) from e
    if changed:
        ctx.step(f"Cron entry updated for target '{TARGET_NAME}' ({schedule})")
    else:
        ctx.step(f"Cron entry already current for target '{TARGET_NAME}'")
+""" + +from __future__ import annotations + +import sys +from pathlib import Path +from typing import List + + +_BACKUP_DIR = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(_BACKUP_DIR)) + +import _cron as cron # noqa: E402 + +try: + import tomllib +except ModuleNotFoundError: # pragma: no cover + from stack._vendor import tomli as tomllib # type: ignore + + +def run(ctx): + instance_dir = Path(ctx.stack.instance_dir) + stack_toml = instance_dir / "stack.toml" + + target_names = _read_target_names(stack_toml) + if target_names: + removed_any = False + for name in target_names: + if cron.remove_entry(name): + ctx.step(f"Removed cron entry for target '{name}'") + removed_any = True + if not removed_any: + ctx.step("No backup cron entries to remove") + return + + # Fallback: stack.toml unreadable or no targets listed. Sweep + # anything tagged with our marker prefix. + removed = cron.remove_all_entries() + if removed: + ctx.step(f"Removed {removed} backup cron entr{'y' if removed == 1 else 'ies'}") + else: + ctx.step("No backup cron entries to remove") + + +def _read_target_names(stack_toml: Path) -> List[str]: + if not stack_toml.exists(): + return [] + try: + with stack_toml.open("rb") as f: + data = tomllib.load(f) + except (tomllib.TOMLDecodeError, OSError): + return [] + return list(data.get("backup", {}).get("targets", {}).keys()) diff --git a/stacklets/backup/stacklet.toml b/stacklets/backup/stacklet.toml new file mode 100644 index 0000000..a538f5b --- /dev/null +++ b/stacklets/backup/stacklet.toml @@ -0,0 +1,51 @@ +# stacklet.toml — backup stacklet (append-only archives) +# +# Coordinates backups of stacklet data to attached external disks. The +# protection model is layered: +# +# 1. Files locked with chflags uchg (kernel refuses modify or delete). +# 2. rsync --ignore-existing (append-only: never overwrites existing). +# 3. Canary + min-file-count preflight (ransomware tripwire). +# 4. APFS encryption optional (defends physical theft only). 
+# 5. Eject after sync — works from Terminal, sandbox-blocked from cron. +# Treat as a bonus, not a guarantee: the disk usually stays mounted +# between scheduled runs, with files kernel-immutable. +# +# Sources are discovered, not hardcoded: every enabled stacklet that +# declares [[backup.archive]] in its own stacklet.toml contributes to +# the next sync. Targets are configured in stack.toml under +# [backup.targets]. +# +# Engines: +# external-disk — rsync + chflags uchg on attached APFS disk (refuses +# to run on filesystems that don't honor uchg) +# restic — encrypted offsite (planned, not implemented) + +id = "backup" +name = "Backup" +description = "Append-only backup of stacklet data to attached disks" +version = "0.1.0" +category = "infrastructure" +type = "host" + +# Backup runs once per night via a host-installed cron entry. The cron +# entry launches an .app bundle via `open` so that macOS Full Disk Access +# applies — required to read external volumes from a background process. +# See engines/external-disk/README.md for the sandboxing rationale. + +hints = [ + "Run 'stack backup sync' to test a backup now", + "Run 'stack backup status' to see the last run and vault size", +] + +[env.defaults] +# BACKUP_DATA_DIR is this stacklet's own state directory (canary file, +# audit log, run history, the FDA-granted .app bundle). It is NOT the +# source data being backed up, and NOT the target vault disk — both of +# those have separate names elsewhere. The form here follows the +# framework convention {STACKLET}_DATA_DIR = "{data_dir}/{stacklet_id}" +# that other stacklets use (cf. PAPERLESS_DATA_DIR in docs/stacklet.toml). +# The name is confusing because "backup" is both a stacklet name and a +# verb; the contents are state, not backups. 
+BACKUP_DATA_DIR = "{data_dir}/backup" +TZ = "{timezone}" diff --git a/stacklets/docs/stacklet.toml b/stacklets/docs/stacklet.toml index 3c8a937..a88d5a8 100644 --- a/stacklets/docs/stacklet.toml +++ b/stacklets/docs/stacklet.toml @@ -50,3 +50,19 @@ LANGUAGE = "{language}" [health] url = "http://localhost:42020/api/" expect = "200" + +# Backup sources declared by this stacklet. See stacklets/backup/README.md +# for how sources flow to configured targets. +# +# We back up paperless/media — the archived PDFs themselves. These are +# the irreplaceable part: scanned letters, contracts, bills. +# +# Postgres is NOT yet backed up here. On host failure you'd recover the +# PDFs but lose tag assignments, correspondents, custom field values, +# manually edited titles, ASNs, saved views, and permissions. Re-running +# OCR rebuilds the search index but not the organization. Pg-dump +# snapshots land via [[backup.snapshot]] later. +[[backup.archive]] +name = "media" +path = "{data_dir}/docs/paperless/media" +min_files = 10 diff --git a/stacklets/photos/stacklet.toml b/stacklets/photos/stacklet.toml index 5b0c495..5f62772 100644 --- a/stacklets/photos/stacklet.toml +++ b/stacklets/photos/stacklet.toml @@ -53,3 +53,22 @@ IMMICH_VERSION = "release" url = "http://localhost:42010/api/server/ping" path = "$.res" expect = "pong" + +# Backup sources declared by this stacklet. The backup stacklet discovers +# every [[backup.archive]] entry from every enabled stacklet and routes +# them through configured targets. An archive is an append-only store: +# files are added, never modified, never deleted. min_files is a coarse +# ransomware smoke test; the canary file is the precise tripwire. +# +# We back up the originals only — UPLOAD_LOCATION contains library/ +# (originals), encoded-video/, thumbs/, backups/, upload/, profile/. +# Everything except library/ is regenerable from the originals (Immich +# re-encodes and re-thumbnails on import). +# +# Postgres is NOT yet backed up here. 
On host failure you'd recover the +# photo files but lose albums, shares, user accounts, faces, and memory +# definitions. Pg-dump snapshots land via [[backup.snapshot]] later. +[[backup.archive]] +name = "library" +path = "{data_dir}/photos/library/library" +min_files = 10 diff --git a/tests/stacklets/test_backup_config.py b/tests/stacklets/test_backup_config.py new file mode 100644 index 0000000..0fd4d90 --- /dev/null +++ b/tests/stacklets/test_backup_config.py @@ -0,0 +1,228 @@ +"""Unit tests for the backup stacklet's stack.toml target-config helper. + +The helper does a narrow, comment-preserving block replacement in +stack.toml. These tests cover the create / replace / coexistence +cases plus a round-trip with the reader. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(REPO_ROOT / "stacklets" / "backup")) + +from _config import read_target, write_target + + +# ── read_target ───────────────────────────────────────────────────────────── + +class TestReadTarget: + def test_returns_none_when_file_missing(self, tmp_path): + assert read_target(tmp_path / "absent.toml", "vault") is None + + def test_returns_none_when_no_backup_section(self, tmp_path): + p = tmp_path / "stack.toml" + p.write_text('[core]\ndata_dir = "/x"\n') + assert read_target(p, "vault") is None + + def test_returns_none_when_no_such_target(self, tmp_path): + p = tmp_path / "stack.toml" + p.write_text( + '[backup.targets.offsite]\n' + 'engine = "restic"\n' + ) + assert read_target(p, "vault") is None + + def test_returns_config_dict_when_target_present(self, tmp_path): + p = tmp_path / "stack.toml" + p.write_text( + '[backup.targets.vault]\n' + 'engine = "external-disk"\n' + 'disk = "backup-vault"\n' + 'schedule = "0 2 * * *"\n' + ) + cfg = read_target(p, "vault") + assert cfg == { + "engine": "external-disk", + "disk": "backup-vault", + "schedule": "0 2 * * *", + } + + 
def test_returns_none_on_malformed_toml(self, tmp_path): + # A broken stack.toml shouldn't blow up the orchestrator — + # treat it like "no config" and let the caller decide. + p = tmp_path / "stack.toml" + p.write_text("this is not [valid] toml = ===") + assert read_target(p, "vault") is None + + +# ── write_target — create cases ───────────────────────────────────────────── + +class TestWriteTargetCreate: + def test_creates_file_when_missing(self, tmp_path): + p = tmp_path / "stack.toml" + write_target(p, "vault", {"engine": "external-disk", "disk": "backup-vault"}) + assert p.exists() + assert read_target(p, "vault") == { + "engine": "external-disk", + "disk": "backup-vault", + } + + def test_appends_block_to_existing_file(self, tmp_path): + p = tmp_path / "stack.toml" + p.write_text( + '[core]\n' + 'data_dir = "/x"\n' + 'timezone = "UTC"\n' + ) + write_target(p, "vault", {"engine": "external-disk", "disk": "backup-vault"}) + + text = p.read_text() + # The [core] block is untouched. + assert '[core]' in text + assert 'data_dir = "/x"' in text + # The new block is present. + assert '[backup.targets.vault]' in text + assert read_target(p, "vault")["disk"] == "backup-vault" + + def test_preserves_comments_in_unrelated_sections(self, tmp_path): + # The header comment, the inline comment, and the standalone + # comment between sections must all survive a write that + # doesn't touch those sections. 
+ p = tmp_path / "stack.toml" + original = ( + '# famstack stack.toml — household config\n' + '\n' + '[core]\n' + 'data_dir = "/x" # absolute path\n' + '\n' + '# AI section below\n' + '[ai]\n' + 'default = "model"\n' + ) + p.write_text(original) + write_target(p, "vault", {"engine": "external-disk"}) + + text = p.read_text() + assert '# famstack stack.toml — household config' in text + assert 'data_dir = "/x" # absolute path' in text + assert '# AI section below' in text + + +# ── write_target — replace cases ──────────────────────────────────────────── + +class TestWriteTargetReplace: + def test_replaces_existing_block(self, tmp_path): + p = tmp_path / "stack.toml" + p.write_text( + '[backup.targets.vault]\n' + 'engine = "external-disk"\n' + 'disk = "old-name"\n' + 'schedule = "0 0 * * *"\n' + ) + write_target(p, "vault", { + "engine": "external-disk", + "disk": "new-name", + "schedule": "0 2 * * *", + }) + assert read_target(p, "vault") == { + "engine": "external-disk", + "disk": "new-name", + "schedule": "0 2 * * *", + } + + def test_drops_keys_not_in_new_config(self, tmp_path): + # Replacing means *replacing* — not merging. If the new config + # has fewer keys, the old extras are gone. + p = tmp_path / "stack.toml" + p.write_text( + '[backup.targets.vault]\n' + 'engine = "external-disk"\n' + 'disk = "d"\n' + 'legacy_field = "should disappear"\n' + ) + write_target(p, "vault", {"engine": "external-disk", "disk": "d"}) + cfg = read_target(p, "vault") + assert "legacy_field" not in cfg + assert cfg["disk"] == "d" + + def test_does_not_touch_sibling_target(self, tmp_path): + # Two targets configured — writing to one must leave the + # other intact, comments and all. 
+ p = tmp_path / "stack.toml" + p.write_text( + '[backup.targets.vault]\n' + 'engine = "external-disk"\n' + 'disk = "v"\n' + '\n' + '# Offsite is the future restic target\n' + '[backup.targets.offsite]\n' + 'engine = "restic"\n' + 'repository = "s3:..."\n' + ) + write_target(p, "vault", {"engine": "external-disk", "disk": "new-vault"}) + + text = p.read_text() + assert "# Offsite is the future restic target" in text + assert read_target(p, "offsite") == { + "engine": "restic", + "repository": "s3:...", + } + assert read_target(p, "vault")["disk"] == "new-vault" + + def test_does_not_touch_unrelated_section_after(self, tmp_path): + # The block-matching regex must stop at the next [section] + # header — otherwise we'd consume part of [updates]. + p = tmp_path / "stack.toml" + p.write_text( + '[backup.targets.vault]\n' + 'engine = "external-disk"\n' + '\n' + '[updates]\n' + 'schedule = "0 0 3 * * *"\n' + ) + write_target(p, "vault", {"engine": "external-disk", "disk": "d"}) + text = p.read_text() + assert "[updates]" in text + assert 'schedule = "0 0 3 * * *"' in text + + +# ── write_target — value escaping ─────────────────────────────────────────── + +class TestWriteTargetEscaping: + def test_round_trip_preserves_values(self, tmp_path): + p = tmp_path / "stack.toml" + cfg = { + "engine": "external-disk", + "disk": "backup-vault", + "schedule": "0 2 * * *", + } + write_target(p, "vault", cfg) + assert read_target(p, "vault") == cfg + + def test_double_quote_in_value_escapes_correctly(self, tmp_path): + # No current field plausibly contains a quote, but the helper + # is the wrong place to silently mangle one if it ever does. 
+ p = tmp_path / "stack.toml" + write_target(p, "vault", {"engine": "ext", "disk": 'has"quote'}) + assert read_target(p, "vault")["disk"] == 'has"quote' + + def test_backslash_in_value_escapes_correctly(self, tmp_path): + p = tmp_path / "stack.toml" + write_target(p, "vault", {"engine": "ext", "disk": r"has\back"}) + assert read_target(p, "vault")["disk"] == r"has\back" + + +# ── Atomicity ────────────────────────────────────────────────────────────── + +class TestWriteTargetAtomicity: + def test_no_temp_file_left_behind_after_success(self, tmp_path): + p = tmp_path / "stack.toml" + write_target(p, "vault", {"engine": "external-disk"}) + # Only stack.toml should exist — no .tmp leftovers. + entries = sorted(x.name for x in tmp_path.iterdir()) + assert entries == ["stack.toml"] diff --git a/tests/stacklets/test_backup_configure.py b/tests/stacklets/test_backup_configure.py new file mode 100644 index 0000000..d6bb029 --- /dev/null +++ b/tests/stacklets/test_backup_configure.py @@ -0,0 +1,253 @@ +"""Unit tests for the backup stacklet's on_configure helpers. + +Covers the pure-function parts: schedule parsing, filesystem type +classification (via mocked stat), encryption detection (via mocked +diskutil). The interactive ``run(ctx)`` flow needs a full ctx mock and +stdin/stdout, which is integration territory. +""" + +from __future__ import annotations + +import sys +from pathlib import Path +from unittest.mock import patch + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(REPO_ROOT / "stacklets" / "backup" / "hooks")) + +# Need stack.prompt importable for the module to load — add lib/ to path. 
+sys.path.insert(0, str(REPO_ROOT / "lib")) + +import on_configure as configure # noqa: E402 + + +# ── _parse_time_to_cron ──────────────────────────────────────────────────── + +class TestParseTimeToCron: + def test_two_oclock_morning(self): + assert configure._parse_time_to_cron("02:00") == "0 2 * * *" + + def test_single_digit_hour(self): + assert configure._parse_time_to_cron("3:30") == "30 3 * * *" + + def test_late_evening(self): + assert configure._parse_time_to_cron("22:15") == "15 22 * * *" + + def test_midnight(self): + assert configure._parse_time_to_cron("00:00") == "0 0 * * *" + + def test_whitespace_stripped(self): + assert configure._parse_time_to_cron(" 04:00 ") == "0 4 * * *" + + def test_returns_empty_on_invalid_hour(self): + assert configure._parse_time_to_cron("25:00") == "" + + def test_returns_empty_on_invalid_minute(self): + assert configure._parse_time_to_cron("02:60") == "" + + def test_returns_empty_on_garbage(self): + assert configure._parse_time_to_cron("not a time") == "" + + def test_returns_empty_on_seconds_format(self): + # We only accept HH:MM, not HH:MM:SS — cron doesn't do seconds. + assert configure._parse_time_to_cron("02:00:00") == "" + + +# ── _stat_fs_type (mocked mount output) ──────────────────────────────────── + +class TestStatFsType: + """Parses ``mount`` output. NOT ``stat -f %T`` — that returns the + ls-F file type suffix, not the filesystem type. This was a real + bug found by E2E; tests now exercise the actual parsing path. 
+ """ + + def _mount_lines(self, *lines): + return "\n".join(lines) + "\n" + + def test_apfs_extracted_from_mount(self): + out = self._mount_lines( + "/dev/disk3s1 on / (apfs, sealed, local, read-only)", + "/dev/disk5s1 on /Volumes/foo (apfs, local, nodev, nosuid)", + ) + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + run.return_value.stdout = out + assert configure._stat_fs_type(Path("/Volumes/foo")) == "apfs" + + def test_smbfs_extracted(self): + out = self._mount_lines( + "//user@server/share on /Volumes/share (smbfs, nodev, nosuid)" + ) + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + run.return_value.stdout = out + assert configure._stat_fs_type(Path("/Volumes/share")) == "smbfs" + + def test_msdos_fat_extracted(self): + out = self._mount_lines( + "/dev/disk6s1 on /Volumes/fat (msdos, local, nodev)" + ) + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + run.return_value.stdout = out + assert configure._stat_fs_type(Path("/Volumes/fat")) == "msdos" + + def test_empty_string_when_mount_point_not_found(self): + out = self._mount_lines( + "/dev/disk3s1 on / (apfs, sealed)", + ) + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + run.return_value.stdout = out + assert configure._stat_fs_type(Path("/Volumes/absent")) == "" + + def test_partial_path_prefix_does_not_match(self): + # /Volumes/foo and /Volumes/foobar are distinct mounts — the + # parser must not match the wrong one. 
+ out = self._mount_lines( + "/dev/disk5s1 on /Volumes/foobar (apfs, local)" + ) + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + run.return_value.stdout = out + assert configure._stat_fs_type(Path("/Volumes/foo")) == "" + + def test_empty_on_subprocess_failure(self): + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 1 + run.return_value.stdout = "" + assert configure._stat_fs_type(Path("/Volumes/x")) == "" + + +# ── _is_encrypted (mocked diskutil apfs list) ────────────────────────────── + +class TestIsEncrypted: + def test_true_when_filevault_yes_near_disk_name(self): + # diskutil apfs list emits the disk's name followed by a few + # lines of attributes; FileVault: Yes appears within ~6 lines. + stub = ( + "APFS Container Disk5 ...\n" + " APFS Volume Disk Identifier: disk5s1\n" + " Name: backup-vault (Case-insensitive)\n" + " Mount Point: /Volumes/backup-vault\n" + " Capacity Consumed: ...\n" + " FileVault: Yes\n" + " Encrypted: Yes\n" + ) + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + run.return_value.stdout = stub + assert configure._is_encrypted("backup-vault") is True + + def test_false_when_filevault_no(self): + stub = ( + "APFS Container ...\n" + " Name: backup-vault (Case-insensitive)\n" + " FileVault: No\n" + ) + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + run.return_value.stdout = stub + assert configure._is_encrypted("backup-vault") is False + + def test_false_when_disk_not_in_output(self): + stub = "APFS Container ...\n Name: some-other-disk\n FileVault: Yes\n" + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + run.return_value.stdout = stub + # Even though "FileVault: Yes" appears in the output, it's + # for a different disk — must not match ours. 
+ assert configure._is_encrypted("backup-vault") is False + + def test_false_on_subprocess_failure(self): + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 1 + run.return_value.stdout = "" + assert configure._is_encrypted("backup-vault") is False + + +# ── _get_volume_uuid (mocked diskutil info) ──────────────────────────────── + +class TestGetVolumeUuid: + def test_extracts_uuid_from_diskutil_info(self): + stub = ( + " Device / Media Name: Backup Vault\n" + " Volume Name: backup-vault\n" + " Mount Point: /Volumes/backup-vault\n" + " File System Personality: APFS\n" + " Volume UUID: ABCD1234-5678-90AB-CDEF-1234567890AB\n" + " Disk / Partition UUID: F00D0000-0000-0000-0000-000000000000\n" + ) + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + run.return_value.stdout = stub + assert ( + configure._get_volume_uuid("backup-vault") + == "ABCD1234-5678-90AB-CDEF-1234567890AB" + ) + + def test_empty_string_when_uuid_missing(self): + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + run.return_value.stdout = "Volume Name: x\n" # no UUID line + assert configure._get_volume_uuid("backup-vault") == "" + + def test_empty_string_on_subprocess_failure(self): + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 1 + run.return_value.stdout = "" + assert configure._get_volume_uuid("backup-vault") == "" + + +# ── _keychain_has_entry (mocked security) ────────────────────────────────── + +class TestKeychainHasEntry: + def test_true_when_security_finds_password(self): + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + run.return_value.stdout = "the-passphrase\n" + assert configure._keychain_has_entry("UUID-1234") is True + + def test_false_when_security_fails(self): + # security exits non-zero when no matching item exists. 
+ with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 44 # "specified item not found" + run.return_value.stdout = "" + assert configure._keychain_has_entry("UUID-1234") is False + + def test_false_when_password_blank(self): + # Defensive: if security somehow returns success but empty + # password, treat as no entry. + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + run.return_value.stdout = "\n" + assert configure._keychain_has_entry("UUID-1234") is False + + +# ── _store_keychain_passphrase (mocked security) ─────────────────────────── + +class TestStoreKeychainPassphrase: + def test_returns_true_on_success(self): + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + assert configure._store_keychain_passphrase( + "UUID-1234", "backup-vault", "secret" + ) is True + + def test_returns_false_on_failure(self): + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 1 + assert configure._store_keychain_passphrase( + "UUID-1234", "backup-vault", "secret" + ) is False + + def test_command_uses_update_flag(self): + # Using -U means re-running on_configure for an existing disk + # doesn't fail with "item already exists". + with patch.object(configure.subprocess, "run") as run: + run.return_value.returncode = 0 + configure._store_keychain_passphrase("UUID", "disk", "pass") + args = run.call_args[0][0] + assert "-U" in args diff --git a/tests/stacklets/test_backup_cron.py b/tests/stacklets/test_backup_cron.py new file mode 100644 index 0000000..658b244 --- /dev/null +++ b/tests/stacklets/test_backup_cron.py @@ -0,0 +1,253 @@ +"""Unit tests for the backup stacklet's cron helper. + +The crontab command is mocked everywhere — these tests assert the +behavior the helper promises (idempotent install, target-scoped +removal, loud failure on edit errors) without touching the real +crontab. 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(REPO_ROOT / "stacklets" / "backup")) + +import _cron as cron # noqa: E402 + + +# ── Helpers ──────────────────────────────────────────────────────────────── + +class _FakeCron: + """Test double for the crontab subprocess. Holds in-memory state + and serves both ``crontab -l`` and ``crontab -`` calls.""" + + def __init__(self, initial: str = ""): + self.content = initial + self.write_calls = 0 + self.next_write_should_fail = False + + def run(self, argv, **kw): + result = MagicMock() + if argv == ["crontab", "-l"]: + if self.content: + result.returncode = 0 + result.stdout = self.content + else: + # Match real crontab behavior: exit non-zero when no + # crontab is installed. + result.returncode = 1 + result.stdout = "" + return result + if argv == ["crontab", "-"]: + if self.next_write_should_fail: + result.returncode = 1 + result.stderr = "crontab: locked" + return result + self.content = kw["input"] + self.write_calls += 1 + result.returncode = 0 + result.stderr = "" + return result + raise RuntimeError(f"unexpected subprocess call: {argv}") + + +def _patched_subprocess(fake: _FakeCron): + """Patch _cron.subprocess.run to dispatch through the fake.""" + return patch.object(cron.subprocess, "run", side_effect=fake.run) + + +# ── install_entry ────────────────────────────────────────────────────────── + +class TestInstallEntry: + def test_installs_into_empty_crontab(self): + fake = _FakeCron() + with _patched_subprocess(fake): + changed = cron.install_entry("0 2 * * *", "open /a/b.app", "vault") + assert changed is True + assert "0 2 * * * open /a/b.app" in fake.content + assert "famstack-backup-vault" in fake.content + + def test_appends_to_existing_unrelated_entries(self): + fake = _FakeCron("0 5 * * * /some/other/job\n") + with 
_patched_subprocess(fake): + cron.install_entry("0 2 * * *", "open /a.app", "vault") + # The original unrelated entry survives. + assert "/some/other/job" in fake.content + # Ours got added. + assert "famstack-backup-vault" in fake.content + + def test_replaces_existing_entry_for_same_target(self): + fake = _FakeCron( + "0 2 * * * open /old.app # famstack-backup-vault\n" + ) + with _patched_subprocess(fake): + cron.install_entry("0 3 * * *", "open /new.app", "vault") + + # Old line gone, new line present. + assert "/old.app" not in fake.content + assert "/new.app" in fake.content + # Only one famstack-backup-vault line. + assert fake.content.count("famstack-backup-vault") == 1 + + def test_idempotent_reinstall_returns_false(self): + # Installing the exact same entry twice should be a no-op. + fake = _FakeCron() + with _patched_subprocess(fake): + first = cron.install_entry("0 2 * * *", "open /a.app", "vault") + second = cron.install_entry("0 2 * * *", "open /a.app", "vault") + assert first is True + assert second is False + # The second call didn't issue a write. + assert fake.write_calls == 1 + + def test_two_targets_coexist(self): + # Vault and offsite are different targets — both entries must + # survive each other's install. 
+ fake = _FakeCron() + with _patched_subprocess(fake): + cron.install_entry("0 2 * * *", "open /vault.app", "vault") + cron.install_entry("0 4 * * 0", "open /offsite.app", "offsite") + assert "famstack-backup-vault" in fake.content + assert "famstack-backup-offsite" in fake.content + assert "vault.app" in fake.content + assert "offsite.app" in fake.content + + def test_raises_when_crontab_write_fails(self): + fake = _FakeCron() + fake.next_write_should_fail = True + with _patched_subprocess(fake): + with pytest.raises(RuntimeError, match="crontab edit failed"): + cron.install_entry("0 2 * * *", "open /a.app", "vault") + + +# ── remove_entry ─────────────────────────────────────────────────────────── + +class TestRemoveEntry: + def test_removes_matching_entry(self): + fake = _FakeCron( + "0 2 * * * open /a.app # famstack-backup-vault\n" + ) + with _patched_subprocess(fake): + removed = cron.remove_entry("vault") + assert removed is True + assert "famstack-backup-vault" not in fake.content + + def test_returns_false_when_no_entry_present(self): + # Empty crontab, nothing to remove — must not fail. + fake = _FakeCron() + with _patched_subprocess(fake): + removed = cron.remove_entry("vault") + assert removed is False + assert fake.write_calls == 0 # didn't bother to write + + def test_leaves_unrelated_entries_intact(self): + fake = _FakeCron( + "0 5 * * * /some/other/job\n" + "0 2 * * * open /a.app # famstack-backup-vault\n" + ) + with _patched_subprocess(fake): + cron.remove_entry("vault") + assert "/some/other/job" in fake.content + assert "famstack-backup-vault" not in fake.content + + def test_removing_vault_does_not_touch_offsite(self): + # Target-scoped marker means destroying one target leaves the + # other's entry untouched. 
+ fake = _FakeCron( + "0 2 * * * open /vault.app # famstack-backup-vault\n" + "0 4 * * 0 open /offsite.app # famstack-backup-offsite\n" + ) + with _patched_subprocess(fake): + cron.remove_entry("vault") + assert "famstack-backup-vault" not in fake.content + assert "famstack-backup-offsite" in fake.content + assert "/offsite.app" in fake.content + + def test_idempotent_double_removal(self): + # Belt-and-suspenders: destroy lifecycle calls on_stop AND + # on_destroy; both call remove_entry. Second call must be a + # no-op, not a failure. + fake = _FakeCron( + "0 2 * * * open /a.app # famstack-backup-vault\n" + ) + with _patched_subprocess(fake): + first = cron.remove_entry("vault") + second = cron.remove_entry("vault") + assert first is True + assert second is False + + def test_empty_crontab_after_removal_is_written_empty(self): + # If our entry was the only one, the crontab should end up + # empty, not deleted. (Real-world: this avoids surprise if the + # user's crontab had only our entry.) + fake = _FakeCron( + "0 2 * * * open /a.app # famstack-backup-vault\n" + ) + with _patched_subprocess(fake): + cron.remove_entry("vault") + assert fake.content == "" + + +# ── is_installed ────────────────────────────────────────────────────────── + +class TestIsInstalled: + def test_true_when_marker_present(self): + fake = _FakeCron( + "0 2 * * * open /a.app # famstack-backup-vault\n" + ) + with _patched_subprocess(fake): + assert cron.is_installed("vault") is True + + def test_false_when_marker_absent(self): + fake = _FakeCron("0 5 * * * /some/other/job\n") + with _patched_subprocess(fake): + assert cron.is_installed("vault") is False + + def test_false_on_empty_crontab(self): + fake = _FakeCron() + with _patched_subprocess(fake): + assert cron.is_installed("vault") is False + + def test_scoped_to_target_name(self): + # vault marker present, but caller asks about offsite — answer + # must be False. 
+ fake = _FakeCron( + "0 2 * * * open /a.app # famstack-backup-vault\n" + ) + with _patched_subprocess(fake): + assert cron.is_installed("offsite") is False + + +# ── remove_all_entries ──────────────────────────────────────────────────── + +class TestRemoveAllEntries: + def test_removes_every_famstack_backup_entry(self): + fake = _FakeCron( + "0 2 * * * open /v.app # famstack-backup-vault\n" + "0 4 * * 0 open /o.app # famstack-backup-offsite\n" + "0 5 * * * /unrelated/job\n" + ) + with _patched_subprocess(fake): + removed = cron.remove_all_entries() + assert removed == 2 + assert "famstack-backup" not in fake.content + assert "/unrelated/job" in fake.content + + def test_returns_zero_when_nothing_to_remove(self): + fake = _FakeCron("0 5 * * * /unrelated/job\n") + with _patched_subprocess(fake): + removed = cron.remove_all_entries() + assert removed == 0 + # No write — there was nothing to do. + assert fake.write_calls == 0 + + def test_handles_empty_crontab(self): + fake = _FakeCron() + with _patched_subprocess(fake): + removed = cron.remove_all_entries() + assert removed == 0 diff --git a/tests/stacklets/test_backup_e2e.py b/tests/stacklets/test_backup_e2e.py new file mode 100644 index 0000000..43474b8 --- /dev/null +++ b/tests/stacklets/test_backup_e2e.py @@ -0,0 +1,497 @@ +"""End-to-end tests for the backup pipeline against a real APFS volume. + +Uses ``hdiutil`` to create a sparse APFS disk image and attach it. +From the kernel's perspective it's indistinguishable from a USB drive: +``chflags uchg`` enforces, ``diskutil`` recognizes it, ``rsync`` writes +real bytes. No external storage needed. 
+ +What these tests exercise that the unit suite mocks: + +* Filesystem-type detection against a real filesystem +* ``hdiutil attach`` / ``diskutil`` against a real volume +* ``/usr/bin/rsync`` writing real files +* Kernel-enforced ``chflags uchg`` immutability +* The engine subprocess as a black box (stdin/argv/env in, JSON out) +* The orchestrator → engine → vault flow via ``./stack backup sync`` + +What they don't: + +* Cron sandbox — would touch the user's actual crontab +* TCC / Full Disk Access — opaque to the bundle being granted + +macOS-only: ``hdiutil`` and BSD ``chflags`` are Mac-specific. +""" + +from __future__ import annotations + +import json +import os +import subprocess +import sys +import time +from pathlib import Path + +import pytest + +pytestmark = pytest.mark.skipif( + sys.platform != "darwin", + reason="backup E2E tests require macOS (hdiutil + chflags + APFS)", +) + +REPO_ROOT = Path(__file__).resolve().parents[2] +ENGINE_SCRIPT = REPO_ROOT / "stacklets" / "backup" / "engines" / "external-disk" / "sync.py" +STACK_BIN = REPO_ROOT / "stack" + + +# ── APFS disk image fixture ─────────────────────────────────────────────── + +@pytest.fixture +def vault_image(tmp_path): + """Create and attach a 100MB APFS sparse image. Yields ``(name, mount)``. + + The volume name is unique per test invocation (pid + ms timestamp) + so concurrent runs and crash-leftovers don't collide. Detach is + best-effort with retry, falling back to ``-force`` if uchg-locked + files keep the volume busy.
+ """ + dmg = tmp_path / "vault.dmg" + name = f"famstack-test-vault-{os.getpid()}-{int(time.time() * 1000)}" + + subprocess.run( + ["hdiutil", "create", "-size", "100m", "-fs", "APFS", + "-volname", name, str(dmg)], + check=True, capture_output=True, + ) + subprocess.run( + ["hdiutil", "attach", str(dmg)], + check=True, capture_output=True, + ) + mount = Path(f"/Volumes/{name}") + assert mount.is_dir(), f"hdiutil attach didn't mount {mount}" + + try: + yield name, mount + finally: + _detach_with_retry(mount) + + +def _detach_with_retry(mount: Path) -> None: + """Detach a mounted dmg, retrying past transient busy errors. + + Falls back to ``-force`` after three attempts. Worst case: a stale + ``/Volumes/`` until reboot — annoying but not destructive. + """ + for _ in range(3): + result = subprocess.run( + ["hdiutil", "detach", str(mount)], + capture_output=True, text=True, + ) + if result.returncode == 0: + return + time.sleep(0.3) + subprocess.run( + ["hdiutil", "detach", str(mount), "-force"], + capture_output=True, + ) + + +# ── Other fixtures ───────────────────────────────────────────────────────── + +@pytest.fixture +def backup_data_dir(tmp_path): + """Post-install state for the engine: directory exists, canary + planted with the expected content. The engine no longer creates + the canary — that's on_install's job — so tests have to mirror it.""" + d = tmp_path / "backup-data" + d.mkdir() + (d / "canary").write_text("famstack-backup-canary-do-not-delete\n") + return d + + +@pytest.fixture +def fake_sources(tmp_path): + """Two source directories with real files. Matches the layout the + real photos/docs manifests expect when ``{data_dir}`` is rendered. + + The photos manifest declares ``path = "{data_dir}/photos/library/library"``, + so for the orchestrator's template rendering to land here, + ``data_dir`` must be the parent of the photos+docs tree — i.e. + ``tmp_path / "data"``, not ``tmp_path``. 
+ """ + data_dir = tmp_path / "data" + photos = data_dir / "photos" / "library" / "library" + docs = data_dir / "docs" / "paperless" / "media" + photos.mkdir(parents=True) + docs.mkdir(parents=True) + for i in range(15): + (photos / f"photo-{i:03d}.jpg").write_bytes(b"x" * 256) + for i in range(12): + (docs / f"doc-{i:03d}.pdf").write_bytes(b"y" * 256) + return {"photos": photos, "docs": docs, "data_dir": data_dir} + + +# ── Helpers ──────────────────────────────────────────────────────────────── + +def _sources_env(fake_sources: dict, *, photos_min: int = 10, docs_min: int = 5) -> str: + """Build the ``$SOURCES`` env string the engine expects.""" + return "\n".join([ + f"photos/library|Photos|{fake_sources['photos']}|data/photos-library|{photos_min}", + f"docs/media|Documents|{fake_sources['docs']}|data/docs-media|{docs_min}", + ]) + + +def _run_engine(backup_data_dir: Path, vault_name: str, sources_env: str, + *, args=None): + env = os.environ.copy() + env["BACKUP_DATA_DIR"] = str(backup_data_dir) + env["VAULT_DISK"] = vault_name + env["SOURCES"] = sources_env + cmd = [sys.executable, str(ENGINE_SCRIPT)] + (args or []) + return subprocess.run(cmd, env=env, capture_output=True, text=True, timeout=60) + + +def _read_result(backup_data_dir: Path) -> dict: + """Read the latest run from history.jsonl (last good JSON line).""" + history = backup_data_dir / "logs" / "history.jsonl" + latest = None + for line in history.read_text().splitlines(): + line = line.strip() + if not line: + continue + try: + latest = json.loads(line) + except json.JSONDecodeError: + continue + assert latest is not None, f"history.jsonl had no parseable lines: {history}" + return latest + + +def _has_uchg(path: Path) -> bool: + """True if the BSD uchg flag is set on ``path``. 
``ls -ldO`` shows + the flag inline; we look for ``uchg`` as a whitespace-delimited word in the output.""" + result = subprocess.run(["ls", "-ldO", str(path)], + capture_output=True, text=True) + return "uchg" in result.stdout.split() + + +# ── Engine E2E ───────────────────────────────────────────────────────────── + +class TestEngineSyncE2E: + def test_first_sync_writes_and_locks_files( + self, vault_image, backup_data_dir, fake_sources + ): + name, mount = vault_image + result = _run_engine(backup_data_dir, name, + _sources_env(fake_sources), args=["--no-eject"]) + + assert result.returncode == 0, ( + f"engine failed (exit {result.returncode}):\n" + f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}" + ) + + vault_photos = mount / "data" / "photos-library" + vault_docs = mount / "data" / "docs-media" + assert vault_photos.is_dir() + assert vault_docs.is_dir() + + photos = list(vault_photos.glob("*.jpg")) + assert len(photos) == 15 + for p in photos: + assert _has_uchg(p), f"{p.name} is not uchg-locked" + + data = _read_result(backup_data_dir) + assert data["success"] is True + assert data["dry_run"] is False + photo_result = next(s for s in data["sources"] if s["id"] == "photos/library") + assert photo_result["new_files"] == 15 + assert photo_result["total_files"] == 15 + + def test_immutable_files_resist_modification( + self, vault_image, backup_data_dir, fake_sources + ): + """Sanity-check the kernel actually refuses to modify uchg files. + + If this test fails, something's wrong with the filesystem's + BSD flag enforcement — the whole append-only contract relies + on the kernel honoring ``uchg``.
+ """ + name, mount = vault_image + _run_engine(backup_data_dir, name, _sources_env(fake_sources), + args=["--no-eject"]) + + locked = next((mount / "data" / "photos-library").glob("*.jpg")) + with pytest.raises(PermissionError): + locked.write_bytes(b"tampered") + + def test_second_sync_is_noop( + self, vault_image, backup_data_dir, fake_sources + ): + """``--ignore-existing`` means a re-run with the same sources + touches no files on the vault. No new uchg applications either.""" + name, mount = vault_image + sources = _sources_env(fake_sources) + _run_engine(backup_data_dir, name, sources, args=["--no-eject"]) + + photos = mount / "data" / "photos-library" + before = {p.name: p.stat().st_mtime for p in photos.iterdir()} + + result = _run_engine(backup_data_dir, name, sources, args=["--no-eject"]) + assert result.returncode == 0 + + after = {p.name: p.stat().st_mtime for p in photos.iterdir()} + assert before == after + + data = _read_result(backup_data_dir) + assert all(s["new_files"] == 0 for s in data["sources"]) + + def test_new_source_files_picked_up_on_next_run( + self, vault_image, backup_data_dir, fake_sources + ): + """Add a file to the source between runs. 
The next sync writes + and locks just the new one.""" + name, mount = vault_image + sources = _sources_env(fake_sources) + _run_engine(backup_data_dir, name, sources, args=["--no-eject"]) + + (fake_sources["photos"] / "extra.jpg").write_bytes(b"z" * 256) + + result = _run_engine(backup_data_dir, name, sources, args=["--no-eject"]) + assert result.returncode == 0 + + new_on_vault = mount / "data" / "photos-library" / "extra.jpg" + assert new_on_vault.is_file() + assert _has_uchg(new_on_vault) + + data = _read_result(backup_data_dir) + photo_result = next(s for s in data["sources"] if s["id"] == "photos/library") + assert photo_result["new_files"] == 1 + assert photo_result["total_files"] == 16 + + def test_canary_deletion_between_runs_aborts( + self, vault_image, backup_data_dir, fake_sources + ): + """Silent-rearm defense at the engine boundary. An attacker who + knows about the tripwire mustn't be able to delete it between + runs and slip past with a freshly-created replacement.""" + name, _ = vault_image + sources = _sources_env(fake_sources) + _run_engine(backup_data_dir, name, sources, args=["--no-eject"]) + + canary = backup_data_dir / "canary" + history = backup_data_dir / "logs" / "history.jsonl" + assert canary.exists() and history.exists() + + # Attacker deletes the canary. history.jsonl stays as the + # witness that a real previous run happened. + canary.unlink() + + result = _run_engine(backup_data_dir, name, sources, args=["--no-eject"]) + assert result.returncode != 0 + + data = _read_result(backup_data_dir) + assert data["success"] is False + reason = (data["failure_reason"] or "").lower() + assert "canary" in reason and "missing" in reason + + def test_canary_tamper_aborts_sync( + self, vault_image, backup_data_dir, fake_sources + ): + """The canary file is the precise ransomware tripwire. 
If its + contents change between runs, the sync must abort before + touching the vault.""" + name, mount = vault_image + sources = _sources_env(fake_sources) + _run_engine(backup_data_dir, name, sources, args=["--no-eject"]) + + # Tamper. The backup_data_dir fixture planted the canary with the + # known string; we overwrite it with garbage. + (backup_data_dir / "canary").write_text("tampered\n") + + result = _run_engine(backup_data_dir, name, sources, args=["--no-eject"]) + assert result.returncode != 0 + + data = _read_result(backup_data_dir) + assert data["success"] is False + assert "canary" in (data["failure_reason"] or "").lower() + + def test_refuses_when_source_under_minimum( + self, vault_image, backup_data_dir, fake_sources + ): + """Preflight is the coarse ransomware guard — refuses to sync + a source that's been wiped to fewer files than the declared + minimum. Critically, no vault writes happen.""" + name, mount = vault_image + # photos has 15 files; bump min to 100 so preflight fails + sources = _sources_env(fake_sources, photos_min=100) + + result = _run_engine(backup_data_dir, name, sources, args=["--no-eject"]) + assert result.returncode != 0 + + # The vault stays clean — preflight failure means we never + # mounted (or in this case never wrote, since the disk was + # already mounted by the test fixture). + assert not (mount / "data" / "photos-library").exists() + + data = _read_result(backup_data_dir) + assert data["success"] is False + assert "preflight" in (data["failure_reason"] or "").lower() + + def test_dry_run_writes_no_files( + self, vault_image, backup_data_dir, fake_sources + ): + """``--dry-run`` rsyncs in preview mode and skips locking.
Vault + contents unchanged after.""" + name, mount = vault_image + result = _run_engine(backup_data_dir, name, + _sources_env(fake_sources), args=["--dry-run"]) + assert result.returncode == 0 + + # Vault has nothing — no real writes happened + data_dir = mount / "data" + if data_dir.exists(): + assert not list(data_dir.iterdir()) + + data = _read_result(backup_data_dir) + assert data["dry_run"] is True + assert data["success"] is True + + def test_refuses_non_apfs_filesystem( + self, tmp_path, backup_data_dir, fake_sources + ): + """The probe is what stops us silently degrading to a non-WORM + copy on a filesystem that doesn't honor BSD flags. Verify it + actually fires against a real FAT32 volume.""" + dmg = tmp_path / "fat-vault.dmg" + name = f"famstack-test-fat-{os.getpid()}-{int(time.time() * 1000)}" + + try: + subprocess.run( + ["hdiutil", "create", "-size", "10m", "-fs", "MS-DOS FAT32", + "-volname", name, str(dmg)], + check=True, capture_output=True, + ) + except subprocess.CalledProcessError as e: + pytest.skip(f"Couldn't create FAT32 test image: {e.stderr.decode()}") + + subprocess.run(["hdiutil", "attach", str(dmg)], + check=True, capture_output=True) + mount = Path(f"/Volumes/{name}") + try: + result = _run_engine(backup_data_dir, name, + _sources_env(fake_sources), args=["--no-eject"]) + assert result.returncode != 0 + + data = _read_result(backup_data_dir) + assert data["success"] is False + # The error message should mention the filesystem or + # immutability — we don't pin to exact wording, just the + # category of failure. + reason = (data["failure_reason"] or "").lower() + assert "filesystem" in reason or "immutability" in reason + finally: + _detach_with_retry(mount) + + +# ── Orchestrator E2E ─────────────────────────────────────────────────────── + +class TestOrchestratorE2E: + """End-to-end through the orchestrator: ``./stack backup sync`` with + ``STACK_DIR`` pointed at a fake instance. 
Exercises the full chain + from CLI dispatch through source discovery through engine subprocess + to vault writes.""" + + @pytest.fixture + def test_instance(self, tmp_path, vault_image, fake_sources): + name, _ = vault_image + instance = tmp_path / "instance" + instance.mkdir() + + # Symlink stacklets/ to the real repo — we test against the + # real photos/docs manifests so source discovery exercises + # the actual schema. + (instance / "stacklets").symlink_to(REPO_ROOT / "stacklets") + + # data_dir is wherever fake_sources put the source tree. + # photos/library/library/* and docs/paperless/media/* live + # under tmp_path/data, so data_dir is tmp_path / "data". + data_dir = fake_sources["data_dir"] + + (instance / "stack.toml").write_text( + f'[core]\n' + f'data_dir = "{data_dir}"\n' + f'\n' + f'[backup.targets.{name}]\n' + f'engine = "external-disk"\n' + f'disk = "{name}"\n' + f'schedule = "0 2 * * *"\n' + ) + (instance / "users.toml").write_text("") + + # Simulate the post-install state. Real on_install plants the + # canary alongside the .stack/backup.setup-done marker; the + # silent-rearm check uses setup-done as its witness so missing + # canary + present setup-done = tampering. The test fixture + # has to mirror both, or the very first orchestrator run looks + # like an attack. + backup_state_dir = data_dir / "backup" + backup_state_dir.mkdir(parents=True, exist_ok=True) + (backup_state_dir / "canary").write_text( + "famstack-backup-canary-do-not-delete\n" + ) + + # Setup-done markers gate source discovery — only enabled + # stacklets contribute.
+ stack_dir = instance / ".stack" + stack_dir.mkdir() + (stack_dir / "photos.setup-done").write_text("") + (stack_dir / "docs.setup-done").write_text("") + (stack_dir / "backup.setup-done").write_text("") + (stack_dir / "secrets.toml").write_text("") + + return instance + + def test_stack_backup_sync_runs_engine_end_to_end( + self, vault_image, test_instance + ): + """Invoke ``./stack backup sync`` via subprocess with + ``STACK_DIR`` pointed at the fake instance. Verify: + + * the subprocess returns 0 + * files appear on the vault in the expected locations + * each file has the uchg flag set + + Matrix notification is skipped automatically: the test + instance's secrets file is empty, so the orchestrator's + ``_post_notification`` returns early with "stacker-bot + password not in secrets" and we continue. + """ + name, mount = vault_image + + env = os.environ.copy() + env["STACK_DIR"] = str(test_instance) + + result = subprocess.run( + [str(STACK_BIN), "backup", "sync", "--no-eject"], + env=env, capture_output=True, text=True, timeout=120, + ) + + assert result.returncode == 0, ( + f"./stack backup sync failed (exit {result.returncode}):\n" + f"stdout:\n{result.stdout}\n" + f"stderr:\n{result.stderr}" + ) + + vault_photos = mount / "data" / "photos-library" + vault_docs = mount / "data" / "docs-media" + assert vault_photos.is_dir(), \ + f"orchestrator didn't write photos to vault.\nstdout:\n{result.stdout}" + assert vault_docs.is_dir() + + photos = list(vault_photos.glob("*.jpg")) + docs = list(vault_docs.glob("*.pdf")) + assert len(photos) == 15 + assert len(docs) == 12 + + # Spot-check the uchg flag on a few files + assert _has_uchg(photos[0]) + assert _has_uchg(docs[0]) diff --git a/tests/stacklets/test_backup_engine.py b/tests/stacklets/test_backup_engine.py new file mode 100644 index 0000000..ff43fd5 --- /dev/null +++ b/tests/stacklets/test_backup_engine.py @@ -0,0 +1,444 @@ +"""Unit tests for the external-disk backup engine. 
class TestParseSources:
    """parse_sources() on pipe-delimited, newline-separated records."""

    def test_single_record(self):
        parsed = parse_sources(
            "photos/library|Photos|/data/photos/library|data/photos-library|10"
        )
        assert len(parsed) == 1
        record = parsed[0]
        assert record.id == "photos/library"
        assert record.display == "Photos"
        assert record.src_path == Path("/data/photos/library")
        assert record.vault_subdir == "data/photos-library"
        assert record.min_files == 10

    def test_multiple_records_separated_by_newlines(self):
        raw = (
            "photos/library|Photos|/a|data/p|10\n"
            "docs/media|Documents|/b|data/d|5"
        )
        parsed = parse_sources(raw)
        assert [record.id for record in parsed] == ["photos/library", "docs/media"]
        assert parsed[1].min_files == 5

    def test_blank_lines_ignored(self):
        raw = (
            "\n"
            "photos/library|Photos|/a|data/p|10\n"
            "   \n"
            "docs/media|Documents|/b|data/d|5\n"
        )
        assert len(parse_sources(raw)) == 2

    def test_empty_input_aborts(self):
        # An empty $SOURCES is treated as a misconfiguration rather than
        # "nothing to do": the parser must refuse loudly.
        with pytest.raises(SyncAborted, match="No sources provided"):
            parse_sources("")

    def test_whitespace_only_aborts(self):
        with pytest.raises(SyncAborted, match="No sources provided"):
            parse_sources("   \n\n  ")

    def test_too_few_fields_aborts(self):
        four_fields = "photos/library|Photos|/a|data/p"
        with pytest.raises(SyncAborted, match="Malformed source record"):
            parse_sources(four_fields)

    def test_too_many_fields_aborts(self):
        six_fields = "a|b|c|d|10|extra"
        with pytest.raises(SyncAborted, match="Malformed source record"):
            parse_sources(six_fields)

    def test_non_integer_min_files_aborts(self):
        with pytest.raises(SyncAborted, match="min_files must be an integer"):
            parse_sources("a|b|c|d|many")
class TestPreflightCheckSources:
    """preflight_check_sources() against real directories under tmp_path."""

    def _make_source(self, tmp_path: Path, name: str, file_count: int, min_files: int) -> Source:
        """Create ``tmp_path/name`` holding ``file_count`` one-byte files
        and return a Source that points at it."""
        directory = tmp_path / name
        directory.mkdir()
        for index in range(file_count):
            member = directory / f"file-{index}.txt"
            member.write_text("x")
        return Source(
            id=f"test/{name}",
            display=name.title(),
            src_path=directory,
            vault_subdir=f"data/test-{name}",
            min_files=min_files,
        )

    def test_passes_when_each_source_meets_min(self, tmp_path, capsys):
        first = self._make_source(tmp_path, "a", file_count=20, min_files=10)
        second = self._make_source(tmp_path, "b", file_count=15, min_files=10)
        preflight_check_sources([first, second])  # should not raise

    def test_aborts_when_any_source_under_min(self, tmp_path, capsys):
        healthy = self._make_source(tmp_path, "ok", file_count=20, min_files=10)
        starved = self._make_source(tmp_path, "low", file_count=2, min_files=10)
        with pytest.raises(SyncAborted, match="Preflight failed"):
            preflight_check_sources([healthy, starved])

    def test_aborts_when_source_dir_missing(self, tmp_path, capsys):
        absent = Source(
            id="test/missing",
            display="Missing",
            src_path=tmp_path / "does-not-exist",
            vault_subdir="data/test-missing",
            min_files=1,
        )
        with pytest.raises(SyncAborted, match="Preflight failed"):
            preflight_check_sources([absent])

    def test_exact_min_count_passes(self, tmp_path, capsys):
        # Boundary: file_count == min_files must pass, i.e. the check is
        # not "strictly greater than".
        boundary = self._make_source(tmp_path, "exact", file_count=10, min_files=10)
        preflight_check_sources([boundary])  # should not raise
def test_partial_path_prefix_does_not_match(self):
    """A mount at /Volumes/foobar must not answer for /Volumes/foo."""
    from unittest.mock import patch

    mount_output = self._mount("/dev/disk5s1 on /Volumes/foobar (apfs, local)")
    with patch.object(engine.subprocess, "run") as fake_run:
        completed = fake_run.return_value
        completed.returncode = 0
        completed.stdout = mount_output
        detected = engine._stat_fs_type(Path("/Volumes/foo"))
    assert detected == ""
def test_smbfs_aborts_with_network_message(self, mock_fs_type, tmp_path, capsys):
    """smbfs is refused and stderr mentions restic."""
    mock_fs_type("smbfs")
    with pytest.raises(SyncAborted, match="does not support BSD immutability"):
        probe_filesystem(tmp_path)
    # Read stderr only after the raises block: statements placed after
    # the raising call inside the block would never execute.
    captured = capsys.readouterr()
    # The user-facing message should point at the future restic engine.
    assert "restic" in captured.err
class TestFormatNumber:
    """format_number() renders integers with '.' as thousands separator."""

    def test_under_thousand_unchanged(self):
        for value, rendered in ((0, "0"), (42, "42"), (999, "999")):
            assert format_number(value) == rendered

    def test_thousands_separated_with_dot(self):
        # Dot rather than comma: a comma triggers phone-number
        # linkification in some chat clients (Element among them).
        expectations = {
            1000: "1.000",
            48293: "48.293",
            1_234_567: "1.234.567",
        }
        for value, rendered in expectations.items():
            assert format_number(value) == rendered
def test_empty_failure_reason_serializes_as_null(self, tmp_path):
    """A result without a failure reason is written with JSON null."""
    # Callers should not have to tell "" apart from "no failure
    # recorded"; the written value is expected to be null.
    # NOTE(review): only failure_reason=None is exercised here — the
    # empty-string case named in the test title is not covered.
    history = tmp_path / "history.jsonl"
    result = SyncResult(success=True, failure_reason=None)
    append_to_history(result, history)
    # The file holds exactly one record, so it parses as a single JSON
    # document.
    record = json.loads(history.read_text())
    assert record["failure_reason"] is None
def test_returns_the_last_line_when_multiple_runs(self, tmp_path):
    """With several appended runs, the reader returns the final one."""
    # "Latest" means most recently appended, which in append-only JSONL
    # is the last line.
    history = tmp_path / "history.jsonl"
    for label in ("run-1", "run-2", "run-3"):
        append_to_history(self._result(vault_disk=label), history)
    latest = read_latest_run(history)
    assert latest["vault_disk"] == "run-3"
class TestDetectVaultState:
    """engine.detect_vault_state() for the three reported states."""

    def test_drive_not_connected_takes_priority(self, tmp_path):
        # tmp_path exists, yet the "drive isn't there" signal must win.
        state = engine.detect_vault_state(tmp_path, drive_not_connected=True)
        assert state == "not_connected"

    def test_mounted_when_mount_point_exists(self, tmp_path):
        state = engine.detect_vault_state(tmp_path, drive_not_connected=False)
        assert state == "mounted"

    def test_ejected_when_mount_point_missing(self, tmp_path):
        gone = tmp_path / "missing"
        state = engine.detect_vault_state(gone, drive_not_connected=False)
        assert state == "ejected"
class TestGenerateAppBundle:
    """on_install.generate_app_bundle(): layout, Info.plist, wrapper script."""

    @pytest.fixture
    def bundle(self, tmp_path):
        """Generate an .app bundle into tmp_path and return its path."""
        return on_install.generate_app_bundle(
            target_dir=tmp_path,
            stack_executable=Path("/path/to/stack"),
            log_path=tmp_path / "logs" / "cron.log",
        )

    def test_bundle_directory_structure(self, bundle):
        # Classic macOS .app layout — Contents/{Info.plist, MacOS/}
        assert bundle.is_dir()
        assert bundle.name == "FamstackVaultSync.app"
        assert (bundle / "Contents" / "Info.plist").is_file()
        assert (bundle / "Contents" / "MacOS" / "vault-sync").is_file()

    def test_info_plist_identifies_bundle(self, bundle):
        plist = (bundle / "Contents" / "Info.plist").read_text()
        # macOS looks these up when launching the bundle.
        assert "CFBundleExecutable" in plist
        assert "vault-sync" in plist
        assert "CFBundleIdentifier" in plist
        assert "dev.famstack.backup" in plist

    def test_info_plist_is_background_app(self, bundle):
        # LSUIElement=true keeps the bundle out of the dock and
        # Cmd-Tab — there's no UI, no reason to surface it as a
        # foreground app.
        plist = (bundle / "Contents" / "Info.plist").read_text()
        assert "LSUIElement" in plist
        # The literal here had been reduced to "" (always true for any
        # string), which made the check vacuous. An XML plist spells a
        # true boolean as the <true/> element.
        # NOTE(review): reconstructed literal — confirm against the
        # generator's output.
        assert "<true/>" in plist

    def test_info_plist_has_valid_xml_header(self, bundle):
        plist = (bundle / "Contents" / "Info.plist").read_text()
        # macOS is picky about the DOCTYPE; missing or malformed
        # headers cause silent launch failures.
        # NOTE(review): the original literals were lost (the statement
        # was an unterminated string); these three checks are a
        # reconstruction of an XML-declaration / DOCTYPE / closing-tag
        # assertion — confirm against the generator's output.
        assert plist.startswith("<?xml")
        assert "<!DOCTYPE plist" in plist
        assert plist.rstrip().endswith("</plist>")

    def test_executable_is_executable(self, bundle):
        wrapper = bundle / "Contents" / "MacOS" / "vault-sync"
        mode = wrapper.stat().st_mode
        # User, group, and other all need x bit; cron may run with a
        # narrower umask and the .app must still launch.
        assert mode & stat.S_IXUSR
        assert mode & stat.S_IXGRP
        assert mode & stat.S_IXOTH

    def test_wrapper_invokes_stack_backup_sync(self, bundle):
        wrapper = (bundle / "Contents" / "MacOS" / "vault-sync").read_text()
        assert "/path/to/stack" in wrapper
        assert "backup sync" in wrapper

    def test_wrapper_redirects_output_to_log(self, bundle, tmp_path):
        wrapper = (bundle / "Contents" / "MacOS" / "vault-sync").read_text()
        # Cron output is invisible by default — the wrapper must redirect
        # to a known log path so a misbehaving scheduled run leaves a
        # trail the user can inspect.
        assert str(tmp_path / "logs" / "cron.log") in wrapper
        assert "2>&1" in wrapper

    def test_idempotent_regeneration(self, tmp_path):
        # Running install twice should leave the same bundle, not pile
        # up duplicates or stale state.
        on_install.generate_app_bundle(
            tmp_path, Path("/p/stack"), tmp_path / "logs" / "cron.log"
        )
        on_install.generate_app_bundle(
            tmp_path, Path("/p/stack"), tmp_path / "logs" / "cron.log"
        )
        # Still one .app, structure intact.
        bundles = list(tmp_path.glob("*.app"))
        assert len(bundles) == 1
        assert (bundles[0] / "Contents" / "Info.plist").is_file()
def test_canary_string_matches_engine(self):
    """The installer's canary constant equals the engine's."""
    # Regression guard: the planter must write exactly what the verifier
    # expects, so a redefinition in either file fails here.
    engine_dir = REPO_ROOT.joinpath(
        "stacklets", "backup", "engines", "external-disk"
    )
    # The engine directory is not a package; make its sync.py importable.
    sys.path.insert(0, str(engine_dir))
    import sync as engine_module

    assert on_install.CANARY_STRING == engine_module.CANARY_STRING
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(REPO_ROOT / "stacklets" / "backup" / "cli")) + +import _orchestrator as orch +from _orchestrator import ( + SourceRecord, + Target, + build_engine_command, + discover_archive_sources, + format_notification, + get_targets, + read_latest_run, + serialize_sources_env, +) + + +# ── Fixture helpers ──────────────────────────────────────────────────────── + +def _make_fake_stacklet( + root: Path, + stacklet_id: str, + archives: list, + enabled: bool = True, + name: str | None = None, +) -> None: + """Create a stacklet manifest under root/stacklets//stacklet.toml + with the given [[backup.archive]] entries. Optionally mark the + stacklet as enabled by creating its setup-done marker.""" + stacklets_dir = root / "stacklets" / stacklet_id + stacklets_dir.mkdir(parents=True, exist_ok=True) + + lines = [ + f'id = "{stacklet_id}"', + f'name = "{name or stacklet_id.title()}"', + 'category = "media"', + 'version = "0.1.0"', + ] + for archive in archives: + lines.append("") + lines.append("[[backup.archive]]") + lines.append(f'name = "{archive["name"]}"') + lines.append(f'path = "{archive["path"]}"') + if "min_files" in archive: + lines.append(f'min_files = {archive["min_files"]}') + + (stacklets_dir / "stacklet.toml").write_text("\n".join(lines) + "\n") + + if enabled: + marker_dir = root / ".stack" + marker_dir.mkdir(parents=True, exist_ok=True) + (marker_dir / f"{stacklet_id}.setup-done").write_text("") + + +# ── Source discovery ─────────────────────────────────────────────────────── + +class TestDiscoverArchiveSources: + def test_finds_archive_entries_from_enabled_stacklets(self, tmp_path): + _make_fake_stacklet( + tmp_path, "photos", + [{"name": "library", "path": "{data_dir}/photos/library/library", "min_files": 10}], + name="Photos", + ) + sources = 
def test_skips_unenabled_stacklets(self, tmp_path):
    """Only stacklets with a setup-done marker contribute sources."""
    # photos is enabled and contributes; docs is disabled and does not.
    fixtures = (
        ("photos", "library", "{data_dir}/photos", True),
        ("docs", "media", "{data_dir}/docs", False),
    )
    for stacklet_id, archive_name, archive_path, is_enabled in fixtures:
        _make_fake_stacklet(
            tmp_path, stacklet_id,
            [{"name": archive_name, "path": archive_path, "min_files": 1}],
            enabled=is_enabled,
        )
    found = discover_archive_sources(tmp_path, tmp_path, Path("/d"))
    assert [record.id for record in found] == ["photos/library"]
def test_unknown_template_variable_kept_literal(self, tmp_path):
    """An unknown ``{var}`` in a path stays in the path; discovery does not crash."""
    # A typo'd template variable should not abort discovery; the
    # engine's preflight is expected to report the broken path later.
    _make_fake_stacklet(
        tmp_path, "photos",
        [{"name": "library", "path": "{nonexistent_var}/photos", "min_files": 1}],
    )
    found = discover_archive_sources(tmp_path, tmp_path, Path("/d"))
    rendered = str(found[0].src_path)
    # The original also accepted src_path == Path("{nonexistent_var}/photos"),
    # but that case already satisfies this containment check, so one
    # assertion has the same outcome.
    assert "{nonexistent_var}" in rendered
def test_parses_a_target_block(self):
    """One [backup.targets.NAME] table becomes one Target with its fields."""
    vault_block = {
        "engine": "external-disk",
        "disk": "backup-vault",
        "schedule": "0 2 * * *",
    }
    cfg = {"backup": {"targets": {"vault": vault_block}}}
    targets = get_targets(cfg)
    assert len(targets) == 1
    target = targets[0]
    assert target.name == "vault"
    assert target.engine == "external-disk"
    assert target.disk == "backup-vault"
    assert target.schedule == "0 2 * * *"
class TestSerializeSourcesEnv:
    """serialize_sources_env(): SourceRecords to pipe-delimited lines."""

    def test_single_record_formatted(self):
        record = SourceRecord(
            id="photos/library",
            display="Photos",
            src_path=Path("/var/famstack-data/photos/library/library"),
            vault_subdir="data/photos-library",
            min_files=10,
        )
        expected = (
            "photos/library|Photos|/var/famstack-data/photos/library/library|"
            "data/photos-library|10"
        )
        assert serialize_sources_env([record]) == expected

    def test_multiple_records_newline_joined(self):
        records = [
            SourceRecord("photos/library", "Photos", Path("/a"), "data/p", 10),
            SourceRecord("docs/media", "Documents", Path("/b"), "data/d", 5),
        ]
        rows = serialize_sources_env(records).split("\n")
        assert len(rows) == 2
        assert rows[0].startswith("photos/library|")
        assert rows[1].startswith("docs/media|")

    def test_empty_input_yields_empty_string(self):
        assert serialize_sources_env([]) == ""
def test_all_flags_added(self):
    """Every boolean option set on the namespace shows up as its CLI flag."""
    every_option = self._args(
        dry_run=True, no_eject=True, verbose=True, verify=True
    )
    cmd = build_engine_command(Path("/tmp/sync.py"), every_option)
    for flag in ("--dry-run", "--no-eject", "--verbose", "--verify"):
        assert flag in cmd
+ (tmp_path / "logs").mkdir() + (tmp_path / "logs" / "history.jsonl").write_text( + '{"success": true, "vault_disk": "good"}\n' + '{"corrupted, no closing\n' + ) + loaded = read_latest_run(tmp_path) + assert loaded["vault_disk"] == "good" + + +# ── Notification formatting ──────────────────────────────────────────────── + +def _success_result(**overrides): + """Build a baseline successful result dict; tests override fields.""" + base = { + "success": True, + "dry_run": False, + "failure_reason": None, + "duration_seconds": 125, + "started_at": "2026-05-14T02:00:00Z", + "ended_at": "2026-05-14T02:02:05Z", + "run_context": "cron", + "run_user": "arthur", + "vault_disk": "backup-vault", + "vault_state": "mounted", + "vault_size": "8.2G", + "sources": [ + {"id": "photos/library", "display": "Photos", + "status": "ok", "total_files": 48293, "new_files": 12}, + {"id": "docs/media", "display": "Documents", + "status": "ok", "total_files": 4421, "new_files": 3}, + ], + } + base.update(overrides) + return base + + +class TestFormatNotification: + def test_success_headline_in_both_bodies(self): + plain, html = format_notification("vault", _success_result()) + assert plain.startswith("✅ Backup Sync Completed") + assert "✅ Backup Sync Completed" in html + + def test_failure_headline_includes_reason(self): + plain, html = format_notification("vault", _success_result( + success=False, failure_reason="Canary check failed", + )) + assert plain.startswith("❌ Backup Sync FAILED") + assert "Reason: Canary check failed" in plain + assert "❌ Backup Sync FAILED" in html + assert "Reason: Canary check failed" in html + + def test_dry_run_headline(self): + plain, html = format_notification("vault", _success_result(dry_run=True)) + assert "dry run" in plain.lower() + assert "dry run" in html.lower() + + def test_target_name_appears(self): + plain, html = format_notification("offsite", _success_result()) + assert "Target: offsite" in plain + assert "offsite" in html + + def 
test_duration_formatted_mm_ss(self): + plain, _ = format_notification("vault", _success_result(duration_seconds=125)) + assert "2m 5s" in plain + + def test_source_counts_use_dot_separator(self): + plain, html = format_notification("vault", _success_result()) + # Phone-number linkification in Element is the reason — dot, + # not comma. + assert "48.293" in plain + assert "48.293" in html + + def test_mounted_state_notes_context(self): + # The mounted-after-cron state isn't a failure; it's the + # documented operational truth. + plain, _ = format_notification("vault", _success_result( + vault_state="mounted", run_context="cron" + )) + assert "mounted" in plain + assert "cron" in plain + + def test_ejected_state(self): + plain, _ = format_notification("vault", _success_result(vault_state="ejected")) + assert "ejected" in plain.lower() + + def test_not_connected_state_warns(self): + plain, _ = format_notification("vault", _success_result( + success=False, vault_state="not_connected", + failure_reason="Backup disk not connected", + )) + assert "not connected" in plain.lower() + + def test_failed_source_marked_in_output(self): + result = _success_result( + success=False, + failure_reason="rsync failed for one source", + sources=[ + {"id": "photos/library", "display": "Photos", + "status": "ok", "total_files": 100, "new_files": 5}, + {"id": "docs/media", "display": "Documents", + "status": "FAILED", "total_files": 0, "new_files": 0}, + ], + ) + plain, html = format_notification("vault", result) + assert "Documents — FAILED" in plain + assert "FAILED" in html + + def test_no_sources_renders_cleanly(self): + # Edge case: result with empty sources list (engine aborted + # before sync_data ran). Should still produce a coherent + # message rather than blowing up. 
+ plain, html = format_notification("vault", _success_result(sources=[])) + assert "Target: vault" in plain + assert "Target:" in html From 715831cd57e22601653f5bccc3a5a1e28836cbe4 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 15 May 2026 11:53:31 +0200 Subject: [PATCH 2/3] docs(stack-reference): clarify stacklet vs core boundary --- docs/stack-reference.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/stack-reference.md b/docs/stack-reference.md index 9f45c84..4422402 100644 --- a/docs/stack-reference.md +++ b/docs/stack-reference.md @@ -10,6 +10,19 @@ Think Spring Boot for self-hosted services on a Mac. --- +## Stacklet vs core + +The distinction is **infrastructure vs feature**. Core hosts cross-cutting +infrastructure that every running stack needs: Watchtower (auto-updates), +the bot runtime, the API socket, the LLM tools server. Vertical stacklets +host features a user might rationally decline: photos, documents, backup, +the AI stack. The test: could a user legitimately want this off? If yes, +it belongs in its own stacklet, even when multiple other stacklets opt +into it via manifest contracts (e.g. `[[backup.archive]]` declared by +photos and docs, consumed by the `backup` stacklet). 
+ +--- + ## Directory Structure A stacklet is a directory under `stacklets/` containing at minimum a From 58df414bcd8add9f85da6bcc3aca9f9a0064bdf8 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 15 May 2026 16:40:35 +0200 Subject: [PATCH 3/3] refactor(backup): drop .app wrapper, grant FDA to /usr/sbin/cron --- docs/user-guide.md | 20 +- stacklets/backup/README.md | 12 +- stacklets/backup/_cron.py | 12 +- .../backup/engines/external-disk/README.md | 15 +- stacklets/backup/hooks/on_destroy.py | 54 ++--- stacklets/backup/hooks/on_install.py | 214 ++++++------------ stacklets/backup/hooks/on_start.py | 43 ++-- stacklets/backup/stacklet.toml | 17 +- tests/stacklets/test_backup_install.py | 115 +++------- 9 files changed, 177 insertions(+), 325 deletions(-) diff --git a/docs/user-guide.md b/docs/user-guide.md index 795aa91..f79d53d 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -512,11 +512,11 @@ This is the part everyone skips and regrets. famstack ships an opt-in backup sta ### The backup stacklet -Run `stack up backup`. You need an APFS-formatted external drive plugged in. The setup wizard asks for the disk name, encryption (default: plain APFS), and a nightly time (default 02:00). It then installs a small `.app` wrapper, asks you to grant it Full Disk Access, and adds a cron entry. +Run `stack up backup`. You need an APFS-formatted external drive plugged in. The setup wizard asks for the disk name, encryption (default: plain APFS), and a nightly time (default 02:00). It then installs a cron entry that runs `stack backup sync` at that hour, and walks you through granting Full Disk Access to `/usr/sbin/cron` in System Settings. The FDA grant is what lets the scheduled sync reach the archive disk; without it, cron jobs are sandbox-blocked from `/Volumes/*` on macOS Catalina and later. One drag-and-drop, then every cron job on this Mac inherits the access. 
**What gets backed up** -| Source | Path on the vault disk | +| Source | Path on the archive disk | |---|---| | Immich photo originals | `/Volumes//data/photos-library/` | | Paperless archived PDFs | `/Volumes//data/docs-media/` | @@ -525,22 +525,24 @@ Postgres databases for both are not backed up yet. You get your files back but l **How the protection works** -Every file written to the vault gets the kernel `uchg` flag. macOS refuses to modify or delete uchg files, even with `sudo`. `rsync --ignore-existing` means existing vault files are skipped on every run, so the backup is append-only by design and accidental `rm -rf` on your main system cannot propagate. A canary file is checked before every sync; if ransomware has touched `~/famstack-data`, the canary won't match and the sync aborts before opening the vault. +Every file written to the archive gets the kernel `uchg` flag. macOS refuses to modify or delete uchg files, even with `sudo`. `rsync --ignore-existing` means files already in the archive are skipped on every run, so the backup is append-only by design and accidental `rm -rf` on your main system cannot propagate. + +A **canary file** is a tripwire (named after the canary miners used to take underground to detect bad air). famstack plants a small file with known contents inside `~/famstack-data`; before every sync the engine reads it and refuses to proceed if the contents have changed. If something has been encrypting or modifying files under the data directory, the canary will not match what was planted and the sync aborts before opening the archive, so the corrupted state cannot propagate. 
**Daily operation** ```bash stack backup sync # run a sync now (from any context) stack backup status # last run, source counts, current mount state -stack down backup # remove the cron entry, keep .app and vault data -stack destroy backup # also remove the .app; vault and Keychain are preserved +stack down backup # remove the cron entry; canary, logs and archive contents stay +stack destroy backup # also remove BACKUP_DATA_DIR; archive disk and Keychain are preserved ``` The scheduled nightly run leaves the disk mounted between runs (cron cannot trigger eject under the macOS sandbox; files are kernel-locked regardless). Manual `stack backup sync` from Terminal does eject when it finishes. Results post to the `#famstack` Matrix room via stacker-bot. **Recovery (no special tooling needed)** -Plug the vault disk into any Mac and browse the files in Finder. To copy locked files out: +Plug the archive disk into any Mac and browse the files in Finder. To copy locked files out: ```bash sudo chflags -R nouchg /Volumes//data/photos-library// @@ -554,11 +556,11 @@ A `stack backup restore` command and `on_restore` hooks for database recovery ar | Threat | Covered | |---|---| | Ransomware encrypts your Mac | Yes (uchg + canary) | -| Accidental `rm -rf` on `~/famstack-data` | Yes (rsync never deletes from the vault) | -| You delete a photo on your phone | Yes (Immich propagates the delete to disk; vault keeps the original) | +| Accidental `rm -rf` on `~/famstack-data` | Yes (rsync never deletes from the archive) | +| You delete a photo on your phone | Yes (Immich propagates the delete to disk; the archive keeps the original) | | Vault drive stolen from your house | Only if you opted in to APFS encryption | | Vault drive hardware failure | No (single physical copy; offsite engine planned) | -| Fire or flood | No (vault is in the same building; offsite engine planned) | +| Fire or flood | No (archive is in the same building; offsite engine planned) | **Limitations to know 
about** diff --git a/stacklets/backup/README.md b/stacklets/backup/README.md index a3ea851..3aac270 100644 --- a/stacklets/backup/README.md +++ b/stacklets/backup/README.md @@ -67,14 +67,16 @@ contract has been exercised on at least one production sync. `stack destroy backup` removes the backup *tooling* — never the *backups*. Specifically: -- **Removed:** cron entry, FamstackVaultSync.app bundle, local logs, - canary file under BACKUP_DATA_DIR. -- **Preserved:** every file on the vault disk. The whole point of an +- **Removed:** cron entry, local logs, canary file under BACKUP_DATA_DIR. +- **Preserved:** every file on the archive disk. The whole point of an append-only archive is that it outlives the system that wrote it. - **Preserved:** the macOS Keychain entry for the disk passphrase - (encrypted vaults only). The user may want manual disk access after + (encrypted archives only). The user may want manual disk access after uninstall; the command to remove it is surfaced if they want a fully clean state. +- **Preserved:** the Full Disk Access grant on `/usr/sbin/cron`. It + also covers any other cron jobs on the system; the user can remove + it manually if they prefer. Defensive measure: `on_configure` refuses to let `BACKUP_DATA_DIR` point at a path under `/Volumes/`. That way the framework's automatic @@ -88,7 +90,7 @@ restore CLI exists yet — but you don't need one to get your photos back: ```bash -# 1. Plug the vault disk into any Mac and unlock it (Finder prompts +# 1. Plug the archive disk into any Mac and unlock it (Finder prompts # for the passphrase if encrypted) # 2. Browse to the originals diff --git a/stacklets/backup/_cron.py b/stacklets/backup/_cron.py index d1de9ba..80aa3d7 100644 --- a/stacklets/backup/_cron.py +++ b/stacklets/backup/_cron.py @@ -1,15 +1,13 @@ """Crontab install/remove helpers for the backup stacklet. 
-Backup's nightly run is a cron entry rather than a launchd job because -launchd's sandbox blocks ``diskutil`` operations even with Full Disk -Access — the only combination macOS actually allows is cron invoking -an .app bundle via ``open`` (see -``family-server/backup/docs/MACOS-SANDBOX-BACKUP-SCRIPT.md`` for the -full history of approaches that didn't work). +Backup's nightly run is a cron entry rather than a launchd job: +launchd's sandbox blocks ``diskutil`` and ``/Volumes/`` access even +with Full Disk Access, while cron inherits the FDA grant on +``/usr/sbin/cron`` once the user adds it in System Settings. Entries are identified by an inline marker comment:: - 0 2 * * * open /Volumes/... # famstack-backup-vault + 0 2 * * * /path/to/stack backup sync ... # famstack-backup-vault The marker has the target name appended so multiple targets (vault, offsite) can coexist without one's removal touching the other's entry. diff --git a/stacklets/backup/engines/external-disk/README.md b/stacklets/backup/engines/external-disk/README.md index 1a64c4e..ef71eee 100644 --- a/stacklets/backup/engines/external-disk/README.md +++ b/stacklets/backup/engines/external-disk/README.md @@ -50,12 +50,15 @@ SMB/S3 alike. ## Sandbox notes -`diskutil` operations (mount, eject, unlock) are restricted by macOS TCC -when called from `cron`, `launchd`, or any binary that hasn't been -granted Full Disk Access. The fix is the `.app` wrapper: a minimal app -bundle whose only purpose is to receive the FDA grant. Cron invokes it -via `open /path/to/FamstackVaultSync.app`, which routes through the -proper macOS app lifecycle and inherits the FDA permission. +`diskutil` operations (mount, eject, unlock) and writes to `/Volumes/*` +are restricted by macOS TCC when called from `cron` or `launchd` +unless the calling binary holds Full Disk Access. 
The current +install hook tells the user to grant FDA to `/usr/sbin/cron` itself +— one drag-and-drop in System Settings, after which every cron +job on the system inherits the access. The trade-off is scope: +that grant is system-wide, not scoped to famstack. Scoping it via +a dedicated `.app` bundle is a planned alternative if users +push back on the broad permission. `diskutil eject` from cron is sandbox-blocked even with FDA. The disk stays mounted after scheduled runs (uchg flags still protect the data); diff --git a/stacklets/backup/hooks/on_destroy.py b/stacklets/backup/hooks/on_destroy.py index e7add61..c34d1d3 100644 --- a/stacklets/backup/hooks/on_destroy.py +++ b/stacklets/backup/hooks/on_destroy.py @@ -1,17 +1,14 @@ """on_destroy — tear down host-side state. NEVER touches existing backup data. -The framework calls on_stop FIRST during destroy, then on_destroy. The -cron entry is removed in on_stop; this hook removes it AGAIN as a -defensive measure. Both removals are idempotent (no-op if the entry is -already gone) so the double-call has no cost and protects against +The framework calls on_stop FIRST during destroy, then on_destroy. +The cron entry is removed in on_stop; this hook removes it AGAIN as +a defensive measure. Both removals are idempotent (no-op if the entry +is already gone) so the double-call has no cost and protects against on_stop having been skipped, failed, or never run. What this hook removes (regenerable host-side state): - - The cron entry installed by on_install (defensive re-removal) - - The FamstackVaultSync.app bundle — the framework's data-dir - cleanup will sweep it after this hook, but removing it explicitly - here makes the destroy-time summary accurate + - The cron entry installed by on_install (defensive re-removal). What is explicitly preserved: - The vault disk and every file on it. The whole point of an @@ -20,21 +17,18 @@ backup tooling, not asking us to wipe their photo history. 
- The macOS Keychain entry for the disk passphrase (encrypted vaults only). The user may want manual disk access after uninstall. + - The Full Disk Access grant on ``/usr/sbin/cron``. It also covers + any other cron jobs on the system; we can't remove TCC grants + programmatically anyway, and the user may want to keep it. The vault is on ``/Volumes/``, not under BACKUP_DATA_DIR — the -framework's data-dir cleanup never reaches it. on_configure refuses to -let BACKUP_DATA_DIR be placed under ``/Volumes/`` as a defensive +framework's data-dir cleanup never reaches it. on_configure refuses +to let BACKUP_DATA_DIR be placed under ``/Volumes/`` as a defensive measure against misconfiguration. - -The Full Disk Access entry in System Settings becomes orphaned (the -.app it points at is gone). macOS shows it as "This item refers to an -item that doesn't exist." Users can clean it up manually; we can't -remove TCC entries programmatically — that's the entire point of TCC. """ from __future__ import annotations -import shutil import sys from pathlib import Path @@ -47,38 +41,22 @@ from stack.prompt import dim, done, nl, out # noqa: E402 -APP_BUNDLE_NAME = "FamstackVaultSync.app" - - def run(ctx): - backup_data_dir = Path(ctx.env["BACKUP_DATA_DIR"]) - # Defensive cron sweep — should already be empty after on_stop, but - # cheap to verify. Use the wildcard sweep rather than a per-target - # loop because by destroy time we may have no target config left to - # iterate. + # cheap to verify. Wildcard rather than per-target because by destroy + # time we may have no target config left to iterate. try: removed = cron.remove_all_entries() if removed: done(f"Removed {removed} stale backup cron entr{'y' if removed == 1 else 'ies'}") except RuntimeError as e: - # Loud failure: a stale cron entry pointed at a now-deleted - # .app is the worst-case operational outcome — silently failing - # to clean up is exactly what NOT to do. 
+ # Loud failure: a stale cron entry firing nightly against an + # uninstalled backup is the worst-case operational outcome. raise RuntimeError( f"Could not remove backup cron entries: {e}\n" f"Run 'crontab -e' and delete any line containing 'famstack-backup-'." ) - # Remove the .app bundle explicitly. The framework's destroy will - # also wipe BACKUP_DATA_DIR/ recursively after we return, so this - # is partly cosmetic — but doing it here means the summary printed - # below reflects reality, not promises. - app_path = backup_data_dir / APP_BUNDLE_NAME - if app_path.is_dir(): - shutil.rmtree(app_path, ignore_errors=True) - done(f"Removed app bundle: {app_path}") - nl() out("Preserved:") out(" • Vault disk contents — append-only archive outlives the tooling.") @@ -86,6 +64,6 @@ def run(ctx): dim(" sudo chflags -R nouchg /Volumes//data && rm -rf /Volumes//data") out(" • Keychain passphrase entry (encrypted vaults). Remove with:") dim(" security delete-generic-password -a ''") - out(" • Full Disk Access entry in System Settings (orphaned — remove") - out(" manually via Privacy & Security → Full Disk Access → -)") + out(" • Full Disk Access grant on /usr/sbin/cron. Leave it if you") + out(" have other cron jobs; otherwise remove via Privacy & Security.") nl() diff --git a/stacklets/backup/hooks/on_install.py b/stacklets/backup/hooks/on_install.py index e1b3a37..6414a20 100644 --- a/stacklets/backup/hooks/on_install.py +++ b/stacklets/backup/hooks/on_install.py @@ -1,23 +1,24 @@ -"""on_install — install the FDA-granted .app, walk the user through the -FDA grant, and add the nightly cron entry. +"""on_install — set up host-side state and install the nightly cron entry. -Runs once after on_configure on first ``stack up backup``. Idempotent — -re-running it after a successful install is a no-op (the .app and cron -entry already exist and get rewritten with the same content). +Runs once after on_configure on first ``stack up backup``. Idempotent. 
-Why the .app dance: macOS TCC restricts ``diskutil`` operations from -background processes (cron, launchd) unless the binary has been granted -Full Disk Access. FDA can't be granted to a raw script or symlink — -only to a proper .app bundle. So we generate a minimal .app whose only -job is to receive the FDA grant and shell out to ``stack backup sync``. +Steps: -The cron line invokes the app via ``open``, which routes through the -proper macOS app lifecycle so the FDA permission applies. +* Create ``BACKUP_DATA_DIR`` and its ``logs/`` subdir. +* Plant the canary tripwire. +* Install the cron entry that fires the nightly sync. +* Walk the user through granting Full Disk Access to ``/usr/sbin/cron`` + so the scheduled run can reach the vault disk. + +The cron command invokes ``./stack backup sync`` directly with output +redirected to ``BACKUP_DATA_DIR/logs/cron.log``. No ``.app`` wrapper: +granting FDA to ``/usr/sbin/cron`` covers every cron job on the +system, which is the trade-off the user accepts in exchange for not +maintaining a custom app bundle. """ from __future__ import annotations -import stat import subprocess import sys from pathlib import Path @@ -29,19 +30,17 @@ from _config import read_target # noqa: E402 import _cron as cron # noqa: E402 -# Import the canary constant from the engine so the planter writes -# exactly what the verifier expects. Single source of truth. +# Single source of truth for the canary contents — the engine verifies +# what install plants. _ENGINE_DIR = _BACKUP_DIR / "engines" / "external-disk" sys.path.insert(0, str(_ENGINE_DIR)) from sync import CANARY_STRING # noqa: E402 from stack.prompt import ( # noqa: E402 - ask, bold, confirm, dim, done, nl, out, section, warn, + bold, confirm, dim, done, nl, out, section, warn, ) -APP_BUNDLE_NAME = "FamstackVaultSync.app" -APP_BUNDLE_ID = "dev.famstack.backup" TARGET_NAME = "vault" @@ -59,40 +58,20 @@ def run(ctx): "Did on_configure run successfully?" 
) - section("Backup install", f"FDA wrapper + cron entry for target '{TARGET_NAME}'") + section("Backup install", f"Host state + cron entry for target '{TARGET_NAME}'") nl() - # 1. Make the directories the engine and the .app both need. + # 1. Directories the engine needs. backup_data_dir.mkdir(parents=True, exist_ok=True) (backup_data_dir / "logs").mkdir(parents=True, exist_ok=True) + done(f"State directory: {backup_data_dir}") - # 2. Plant the canary. The engine only *verifies* — it doesn't - # create the tripwire — so install is where it gets seeded. - # Idempotent: only writes if the file isn't already present - # (re-running install must not clobber an existing canary that - # might already have been verified across successful syncs). + # 2. Canary planting. plant_canary(backup_data_dir) - # 3. Generate the .app bundle. - app_path = generate_app_bundle( - target_dir=backup_data_dir, - stack_executable=repo_root / "stack", - log_path=backup_data_dir / "logs" / "cron.log", - ) - done(f"App bundle: {app_path}") - nl() - - # 4. Walk the user through the FDA grant. - if sys.stdin.isatty(): - _fda_walkthrough(app_path) - else: - warn("Non-interactive install — Full Disk Access must be granted manually:") - out(f" System Settings → Privacy & Security → Full Disk Access → + → {app_path}") - nl() - - # 5. Install the cron entry. + # 3. Cron entry. schedule = target.get("schedule", "0 2 * * *") - cron_command = f"open {app_path}" + cron_command = _cron_command(repo_root, backup_data_dir) try: changed = cron.install_entry(schedule, cron_command, TARGET_NAME) except RuntimeError as e: @@ -101,21 +80,38 @@ def run(ctx): f"Add this line to your crontab manually (crontab -e):\n" f" {schedule} {cron_command} # {cron.marker_for(TARGET_NAME)}" ) - if changed: - done(f"Cron entry installed: {schedule} → {app_path}") + done(f"Cron entry installed: {schedule}") else: done("Cron entry already up to date") - nl() + + # 4. FDA walkthrough for cron. 
+ if sys.stdin.isatty(): + _fda_walkthrough() + else: + warn("Non-interactive install — grant Full Disk Access to /usr/sbin/cron") + out(" System Settings → Privacy & Security → Full Disk Access → + → /usr/sbin/cron") + nl() bold("Setup complete.") - out("Run 'stack backup sync' to test now (manual run also tries to eject).") - out("The scheduled run fires nightly per the cron entry. Disk stays") - out("mounted between scheduled runs (sandbox blocks eject from cron);") - out("files are protected by chflags uchg.") + out("Run a sync now: stack backup sync") + out("Check the last run: stack backup status") nl() +# ── Cron command ─────────────────────────────────────────────────────────── + +def _cron_command(repo_root: Path, backup_data_dir: Path) -> str: + """The shell command cron runs. + + Output is appended to ``cron.log`` under the state dir so a + misbehaving scheduled run leaves a trail. + """ + log_path = backup_data_dir / "logs" / "cron.log" + stack_bin = repo_root / "stack" + return f"{stack_bin} backup sync >> {log_path} 2>&1" + + # ── Canary planter ──────────────────────────────────────────────────────── def plant_canary(backup_data_dir: Path) -> None: @@ -137,111 +133,39 @@ def plant_canary(backup_data_dir: Path) -> None: done(f"Canary planted: {canary_path}") -# ── .app bundle generation ──────────────────────────────────────────────── +# ── FDA walkthrough ──────────────────────────────────────────────────────── -def generate_app_bundle( - target_dir: Path, - stack_executable: Path, - log_path: Path, -) -> Path: - """Generate the FamstackVaultSync.app bundle. +def _fda_walkthrough() -> None: + """Walk the user through granting Full Disk Access to ``/usr/sbin/cron``. - A .app bundle is a directory tree macOS treats as a single - "application." Ours is the bare minimum: an ``Info.plist`` that - identifies the bundle and an executable that shells out to - ``stack backup sync``. 
The bundle exists ONLY so macOS TCC can - attach a Full Disk Access grant to it — there's no UI, no dock - icon, no real "app." + Without this grant, cron-invoked processes can't read or write + files under ``/Volumes/*`` and the nightly sync silently fails. + macOS won't let us script the grant — TCC requires user + interaction — so we deep-link to the right Settings pane and + instruct. """ - app_path = target_dir / APP_BUNDLE_NAME - contents_dir = app_path / "Contents" - macos_dir = contents_dir / "MacOS" - macos_dir.mkdir(parents=True, exist_ok=True) - - # Info.plist — minimum keys macOS needs to recognize the bundle. - # LSUIElement=true keeps it out of the dock and Cmd-Tab. - (contents_dir / "Info.plist").write_text(_info_plist()) - - # The executable wrapper. Cron fires `open `, macOS launches - # the bundle, the bundle's executable runs this script. - wrapper = macos_dir / "vault-sync" - wrapper.write_text(_wrapper_script(stack_executable, log_path)) - wrapper.chmod(wrapper.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) - - return app_path - - -def _info_plist() -> str: - return ( - '\n' - '\n' - '\n' - '\n' - ' CFBundleExecutable\n' - ' vault-sync\n' - f' CFBundleIdentifier\n' - f' {APP_BUNDLE_ID}\n' - ' CFBundleName\n' - ' FamstackVaultSync\n' - ' CFBundleVersion\n' - ' 1.0\n' - ' LSUIElement\n' - ' \n' - '\n' - '\n' - ) - - -def _wrapper_script(stack_executable: Path, log_path: Path) -> str: - """The .app's executable. 
Logs go to BACKUP_DATA_DIR/logs/cron.log - so a scheduled run that's gone wrong leaves a trail the user can - inspect without trawling Console.app.""" - return ( - "#!/bin/bash\n" - "# Auto-generated by stacklets/backup/hooks/on_install.py — do not edit.\n" - "# Invoked by the cron entry: `open `.\n" - f'LOG="{log_path}"\n' - 'mkdir -p "$(dirname "$LOG")"\n' - f'exec "{stack_executable}" backup sync >> "$LOG" 2>&1\n' - ) - - -# ── FDA walkthrough ──────────────────────────────────────────────────────── - -def _fda_walkthrough(app_path: Path) -> None: - """Open System Settings to the Full Disk Access pane and walk the - user through adding the .app. We can't programmatically grant TCC - permissions — that's the whole point of TCC — so this is the best - we can do.""" - bold("Full Disk Access grant") - out("The backup script reads from your stacklet data directories and") - out("writes to the external vault disk. Both need Full Disk Access") - out("when the script runs from cron (a sandboxed context).") + bold("Full Disk Access for cron") + out("Cron-invoked processes can't reach the vault disk without an") + out("explicit Full Disk Access grant. macOS won't let us automate") + out("this; you grant it once and every cron job inherits the access.") nl() out("Steps:") - out(f" 1. Settings opens to the Full Disk Access pane.") - out(f" 2. Click + to add an app.") - out(f" 3. Press {bold_text('⌘⇧G')} and paste:") - out(f" {app_path}") - out(f" 4. Select {bold_text('FamstackVaultSync.app')} and turn it on.") + out(" 1. Settings opens to the Full Disk Access pane.") + out(" 2. Click + to add an app.") + out(" 3. Press Cmd+Shift+G and paste: /usr/sbin/cron") + out(" 4. Select cron and turn it on. Authenticate when asked.") + nl() + dim("Scope note: this grants FDA to every cron job on this Mac, not") + dim("just famstack's. 
If you have other cron jobs and prefer to scope") + dim("the grant, the alternative is a dedicated .app wrapper (planned).") nl() - # Deep-link to the FDA pane. This URL works on macOS 13+. Older - # macOS opens to the general Privacy pane. subprocess.run([ "open", "x-apple.systempreferences:com.apple.preference.security?Privacy_AllFiles", ], check=False) if not confirm("Done? (you don't have to relaunch anything)", default=True): - warn("Skipping FDA confirmation — backups may fail until granted.") - dim(" You can run 'stack up backup' again later to re-trigger this prompt.") + warn("Skipping FDA confirmation — scheduled syncs may fail until granted.") + dim(" Run 'stack up backup' again later to re-trigger this prompt.") nl() - - -def bold_text(s: str) -> str: - """Inline bold wrapping. Helper because ``stack.prompt.bold`` prints - on its own line; we need inline emphasis.""" - from stack.prompt import BOLD, RESET - return f"{BOLD}{s}{RESET}" diff --git a/stacklets/backup/hooks/on_start.py b/stacklets/backup/hooks/on_start.py index 4f5b700..30789c3 100644 --- a/stacklets/backup/hooks/on_start.py +++ b/stacklets/backup/hooks/on_start.py @@ -1,13 +1,13 @@ """on_start — ensure the cron entry is present. -Runs on every ``stack up backup``. Idempotent — install_entry replaces -an existing entry if the schedule or command changed (e.g. data_dir -was reconfigured, schedule was edited in stack.toml), and is a no-op -when the entry is already current. +Runs on every ``stack up backup``. Idempotent — ``install_entry`` +replaces an existing entry if the schedule changed (e.g. the user +edited ``[backup.targets.vault].schedule`` in stack.toml), and is a +no-op when the entry is already current. This hook is the natural place to pick up stack.toml edits: a user -changes ``schedule`` from 02:00 to 03:30, runs ``stack up backup``, the -cron entry updates. No need to ``stack destroy`` + reconfigure. 
+changes the schedule from 02:00 to 03:30, runs ``stack up backup``, +the cron entry updates. No need to destroy + reconfigure. """ from __future__ import annotations @@ -24,43 +24,40 @@ TARGET_NAME = "vault" -APP_BUNDLE_NAME = "FamstackVaultSync.app" def run(ctx): instance_dir = Path(ctx.stack.instance_dir) + repo_root = Path(ctx.stack.root) backup_data_dir = Path(ctx.env["BACKUP_DATA_DIR"]) target = read_target(instance_dir / "stack.toml", TARGET_NAME) if target is None: # No target configured. on_configure should have caught this, - # but be defensive — silently skipping here would leave the - # user with no scheduled run and no warning. + # but silent skip would leave the user with no scheduled run + # and no warning. ctx.step(f"No [backup.targets.{TARGET_NAME}] in stack.toml — skipping cron install") return - app_path = backup_data_dir / APP_BUNDLE_NAME - if not app_path.is_dir(): - # The .app should have been installed by on_install. If it's - # missing here, something deleted it after install — re-running - # `stack up backup` should regenerate it via on_install, but - # the framework only runs on_install once. Surface the issue. - ctx.step( - f"App bundle missing at {app_path}. " - f"Run 'stack destroy backup && stack up backup' to reinstall." 
- ) - return - schedule = target.get("schedule", "0 2 * * *") + cron_command = _cron_command(repo_root, backup_data_dir) try: - changed = cron.install_entry(schedule, f"open {app_path}", TARGET_NAME) + changed = cron.install_entry(schedule, cron_command, TARGET_NAME) except RuntimeError as e: raise RuntimeError( f"Cron install failed: {e}\n" f"Add this line to your crontab manually (crontab -e):\n" - f" {schedule} open {app_path} # {cron.marker_for(TARGET_NAME)}" + f" {schedule} {cron_command} # {cron.marker_for(TARGET_NAME)}" ) if changed: ctx.step(f"Cron entry updated for target '{TARGET_NAME}' ({schedule})") else: ctx.step(f"Cron entry already current for target '{TARGET_NAME}'") + + +def _cron_command(repo_root: Path, backup_data_dir: Path) -> str: + """Mirror the command on_install installs, so reinstalls on every + `stack up` keep the entry in sync with current paths.""" + log_path = backup_data_dir / "logs" / "cron.log" + stack_bin = repo_root / "stack" + return f"{stack_bin} backup sync >> {log_path} 2>&1" diff --git a/stacklets/backup/stacklet.toml b/stacklets/backup/stacklet.toml index a538f5b..4e4becb 100644 --- a/stacklets/backup/stacklet.toml +++ b/stacklets/backup/stacklet.toml @@ -29,22 +29,23 @@ category = "infrastructure" type = "host" # Backup runs once per night via a host-installed cron entry. The cron -# entry launches an .app bundle via `open` so that macOS Full Disk Access -# applies — required to read external volumes from a background process. +# command invokes ``stack backup sync`` directly. macOS sandboxes cron +# from reading external volumes unless ``/usr/sbin/cron`` itself has +# Full Disk Access — on_install walks the user through granting it. # See engines/external-disk/README.md for the sandboxing rationale. 
hints = [ "Run 'stack backup sync' to test a backup now", - "Run 'stack backup status' to see the last run and vault size", + "Run 'stack backup status' to see the last run and archive size", ] [env.defaults] # BACKUP_DATA_DIR is this stacklet's own state directory (canary file, -# audit log, run history, the FDA-granted .app bundle). It is NOT the -# source data being backed up, and NOT the target vault disk — both of -# those have separate names elsewhere. The form here follows the -# framework convention {STACKLET}_DATA_DIR = "{data_dir}/{stacklet_id}" -# that other stacklets use (cf. PAPERLESS_DATA_DIR in docs/stacklet.toml). +# audit log, run history). It is NOT the source data being backed up, +# and NOT the archive disk where backups land — both of those have +# separate names elsewhere. The form here follows the framework +# convention {STACKLET}_DATA_DIR = "{data_dir}/{stacklet_id}" that +# other stacklets use (cf. PAPERLESS_DATA_DIR in docs/stacklet.toml). # The name is confusing because "backup" is both a stacklet name and a # verb; the contents are state, not backups. BACKUP_DATA_DIR = "{data_dir}/backup" diff --git a/tests/stacklets/test_backup_install.py b/tests/stacklets/test_backup_install.py index ea49b9c..1a8867a 100644 --- a/tests/stacklets/test_backup_install.py +++ b/tests/stacklets/test_backup_install.py @@ -1,14 +1,13 @@ -"""Unit tests for on_install's pure helpers: canary planting and .app -bundle generation. +"""Unit tests for on_install's pure helpers: canary planting and the +cron command builder. -The interactive FDA walkthrough and crontab plumbing are tested -elsewhere (test_backup_cron). +The interactive FDA walkthrough is integration-only (requires a TTY + +System Settings) and isn't covered here. Crontab plumbing has its own +tests in test_backup_cron. 
""" from __future__ import annotations -import os -import stat import sys from pathlib import Path @@ -21,84 +20,6 @@ import on_install # noqa: E402 -class TestGenerateAppBundle: - @pytest.fixture - def bundle(self, tmp_path): - """Generate an .app bundle into tmp_path and return its path.""" - return on_install.generate_app_bundle( - target_dir=tmp_path, - stack_executable=Path("/path/to/stack"), - log_path=tmp_path / "logs" / "cron.log", - ) - - def test_bundle_directory_structure(self, bundle): - # Classic macOS .app layout — Contents/{Info.plist, MacOS/} - assert bundle.is_dir() - assert bundle.name == "FamstackVaultSync.app" - assert (bundle / "Contents" / "Info.plist").is_file() - assert (bundle / "Contents" / "MacOS" / "vault-sync").is_file() - - def test_info_plist_identifies_bundle(self, bundle): - plist = (bundle / "Contents" / "Info.plist").read_text() - # macOS looks these up when launching the bundle. - assert "CFBundleExecutable" in plist - assert "vault-sync" in plist - assert "CFBundleIdentifier" in plist - assert "dev.famstack.backup" in plist - - def test_info_plist_is_background_app(self, bundle): - # LSUIElement=true keeps the bundle out of the dock and - # Cmd-Tab — there's no UI, no reason to surface it as a - # foreground app. - plist = (bundle / "Contents" / "Info.plist").read_text() - assert "LSUIElement" in plist - assert "" in plist - - def test_info_plist_has_valid_xml_header(self, bundle): - plist = (bundle / "Contents" / "Info.plist").read_text() - # macOS is picky about the DOCTYPE; missing or malformed - # headers cause silent launch failures. - assert plist.startswith('") - - def test_executable_is_executable(self, bundle): - wrapper = bundle / "Contents" / "MacOS" / "vault-sync" - mode = wrapper.stat().st_mode - # User, group, and other all need x bit; cron may run with a - # narrower umask and the .app must still launch. 
- assert mode & stat.S_IXUSR - assert mode & stat.S_IXGRP - assert mode & stat.S_IXOTH - - def test_wrapper_invokes_stack_backup_sync(self, bundle): - wrapper = (bundle / "Contents" / "MacOS" / "vault-sync").read_text() - assert "/path/to/stack" in wrapper - assert "backup sync" in wrapper - - def test_wrapper_redirects_output_to_log(self, bundle, tmp_path): - wrapper = (bundle / "Contents" / "MacOS" / "vault-sync").read_text() - # Cron output is invisible by default — the wrapper must redirect - # to a known log path so a misbehaving scheduled run leaves a - # trail the user can inspect. - assert str(tmp_path / "logs" / "cron.log") in wrapper - assert "2>&1" in wrapper - - def test_idempotent_regeneration(self, tmp_path): - # Running install twice should leave the same bundle, not pile - # up duplicates or stale state. - on_install.generate_app_bundle( - tmp_path, Path("/p/stack"), tmp_path / "logs" / "cron.log" - ) - on_install.generate_app_bundle( - tmp_path, Path("/p/stack"), tmp_path / "logs" / "cron.log" - ) - # Still one .app, structure intact. - bundles = list(tmp_path.glob("*.app")) - assert len(bundles) == 1 - assert (bundles[0] / "Contents" / "Info.plist").is_file() - - class TestPlantCanary: def test_writes_canary_with_expected_content(self, tmp_path): on_install.plant_canary(tmp_path) @@ -123,3 +44,29 @@ def test_canary_string_matches_engine(self): sys.path.insert(0, str(engine_dir)) from sync import CANARY_STRING as ENGINE_CANARY assert on_install.CANARY_STRING == ENGINE_CANARY + + +class TestCronCommand: + def test_invokes_stack_backup_sync(self): + cmd = on_install._cron_command(Path("/repo"), Path("/data/backup")) + # The cron command must call the right CLI on the right repo. + assert "/repo/stack" in cmd + assert "backup sync" in cmd + + def test_redirects_output_to_cron_log(self): + # Cron output is invisible by default; the redirect ensures a + # misbehaving scheduled run leaves a trail the user can inspect. 
+ cmd = on_install._cron_command(Path("/repo"), Path("/data/backup")) + assert "/data/backup/logs/cron.log" in cmd + assert ">>" in cmd + assert "2>&1" in cmd + + def test_uses_absolute_paths(self): + # cron's PATH is minimal; relative paths break. Both the binary + # and the log destination must be absolute. + cmd = on_install._cron_command(Path("/repo"), Path("/data/backup")) + for token in cmd.split(): + # Skip the redirect operators and 2>&1 + if token in (">>", "2>&1", "backup", "sync"): + continue + assert token.startswith("/"), f"non-absolute token in cron line: {token!r}"