From 64f5c060150bb5f64bda4955b17de5dbaca29df6 Mon Sep 17 00:00:00 2001 From: serprex <159546+serprex@users.noreply.github.com> Date: Wed, 24 Jun 2026 21:52:54 +0000 Subject: [PATCH] optimize non-summary delta backups no longer an order of magnitude slower than wal-g: parallel wal walk, prefer local files to s3, filter blocks by LSN when delta or summary missing fallback to walking wal before doing full backup --- .github/workflows/pg-compat.yml | 10 + Cargo.lock | 54 +- Cargo.toml | 2 +- bench/.gitignore | 9 +- bench/README.md | 112 +++- bench/config.env.example | 29 +- bench/matrix.sh | 5 +- bench/op_matrix.sh | 25 +- bench/run.sh | 24 +- bench/run_op.sh | 194 +++--- bench/scripts/driver/pgbench_init.sh | 16 +- bench/scripts/driver/run_workload.sh | 9 +- bench/scripts/driver/workload_burst.sh | 58 +- bench/scripts/lib.sh | 57 +- bench/scripts/make_source_tarball.sh | 31 + bench/scripts/sut/00_mount_nvme.sh | 19 +- bench/scripts/sut/01_install_pg18.sh | 8 +- bench/scripts/sut/03_build_walrus.sh | 52 +- bench/scripts/sut/05_install_pgbackrest.sh | 27 +- bench/scripts/sut/10_init_pg.sh | 22 +- bench/scripts/sut/11_write_walg_env.sh | 82 +-- bench/scripts/sut/30_select_daemon.sh | 11 +- bench/scripts/sut/40_smoke_test.sh | 10 +- bench/setup.sh | 26 +- bench/terraform/config.env.tftpl | 33 + bench/terraform/deploy.tf | 83 +++ bench/terraform/iam.tf | 4 +- bench/terraform/instances.tf | 5 +- bench/terraform/network.tf | 3 +- bench/terraform/outputs.tf | 12 + bench/terraform/s3.tf | 4 +- bench/terraform/variables.tf | 44 +- bench/tools/Cargo.toml | 8 + bench/tools/src/bin/analyze.rs | 497 +-------------- bench/tools/src/bin/compare.rs | 589 ++++++++++++++++++ bench/tools/src/bin/sampler.rs | 80 +-- bench/tools/src/lib.rs | 3 + bench/tools/src/viz.rs | 446 ++++++++++++++ ci/cross_tool_encryption.sh | 4 +- ci/cross_tool_forward.sh | 2 +- ci/cross_tool_lzma.sh | 4 +- ci/cross_tool_retention.sh | 2 +- ci/cross_tool_stream.sh | 21 + ci/delta_sidecar.sh | 102 +++ ci/lib.sh | 23 +- src/config/mod.rs | 33 + src/crypto/libsodium.rs | 127 ++++ src/daemon/mod.rs | 63 ++ src/daemon/protocol.rs | 64 ++ src/daemon/uploader.rs | 77 +++ src/main.rs | 2 +- src/pg/backup/copy.rs | 6 +- src/pg/backup/delete.rs | 4 +- src/pg/backup/delta.rs | 585 +++++++++++++++--- src/pg/backup/fs_push.rs | 329 +++++++++- src/pg/backup/list.rs | 10 +- src/pg/backup/mod.rs | 87 ++- src/pg/backup/push.rs | 329 ++++++++-- src/pg/backup/show.rs | 10 +- src/pg/backup/tar_streamer.rs | 143 ++++- src/pg/backup/wal_delta.rs | 684 +++++++++++++++++++++ src/pg/replication/server.rs | 2 +- src/pg/wal/receive.rs | 61 +- src/pg/wal/show.rs | 97 ++- src/pg/wal/verify.rs | 16 +- src/pg/wal_summaries.rs | 87 +-- src/pg/walparser/mod.rs | 5 +- src/pg/walparser/state.rs | 185 +++++- src/retry.rs | 28 + src/storage/creds.rs | 60 ++ tests/backup_roundtrip.rs | 27 +- tests/cli_bin.rs | 5 +- tests/retention.rs | 7 +- tests/vm_live.rs | 27 +- tests/wal_roundtrip.rs | 85 ++- 75 files changed, 4835 insertions(+), 1271 deletions(-) create mode 100755 bench/scripts/make_source_tarball.sh create mode 100644 bench/terraform/config.env.tftpl create mode 100644 bench/terraform/deploy.tf create mode 100644 bench/tools/src/bin/compare.rs create mode 100644 bench/tools/src/lib.rs create mode 100644 bench/tools/src/viz.rs create mode 100755 ci/cross_tool_stream.sh create mode 100755 ci/delta_sidecar.sh diff --git a/.github/workflows/pg-compat.yml b/.github/workflows/pg-compat.yml index 443b811..d71af78 100644 --- a/.github/workflows/pg-compat.yml +++ b/.github/workflows/pg-compat.yml @@ -83,18 +83,28 @@ jobs: - backup_show - wal_overwrite - daemon + - delta_sidecar - cross_tool_forward - cross_tool_reverse + - cross_tool_stream - cross_tool_encryption - cross_tool_retention - cross_tool_lzma - cross_tool_delta + exclude: + # wal-g v3.0.8 streams only the pre-PG15 BASE_BACKUP protocol; the + # PG15+ tagged-CopyData parser landed post-release (wal-g#2262), so + # streaming interop stays on 13/14 until the next wal-g release + - { pg: 15, test: cross_tool_stream } + - { pg: 16, test: cross_tool_stream } + - { pg: 17, test: cross_tool_stream } include: - { pg: 18, test: full_backup } - { pg: 18, test: backup_mark } - { pg: 18, test: backup_show } - { pg: 18, test: wal_overwrite } - { pg: 18, test: daemon } + - { pg: 18, test: delta_sidecar } steps: - uses: actions/checkout@v7 diff --git a/Cargo.lock b/Cargo.lock index 0981273..7261473 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -264,9 +264,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chacha20" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +checksum = "d524456ba66e72eb8b115ff89e01e497f8e6d11d78b70b1aa13c0fbd97540a81" dependencies = [ "cfg-if", "cipher", @@ -1129,9 +1129,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.102" +version = "0.3.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" +checksum = "53b44bfcdb3f8d5837a46dae1ca9660a837176eee74a28b229bc626816589102" dependencies = [ "cfg-if", "futures-util", @@ -1184,9 +1184,9 @@ checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" [[package]] name = "log" -version = "0.4.32" +version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" +checksum = "0ceec5bc11778974d1bcb055b18002eba7f4b3518b6a0081b3af5f21666da9ad" [[package]] name = "lru-slab" @@ -1375,9 +1375,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.9" +version = "0.11.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +checksum = "0c1a41e437b6bbd489372cd4971de128e85c855f56c57f283d20ff016cf7c0a8" dependencies = [ "bytes", "cfg_aliases", @@ -1395,9 +1395,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.14" +version = "0.11.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +checksum = "4fcb935c5bec503c2f0e306bdd3e58bb9029dcb14fa8d9ac76e3a5256ac0763e" dependencies = [ "aws-lc-rs", "bytes", @@ -1431,9 +1431,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.45" +version = "1.0.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +checksum = "dfbc457d0c7a0759a614551b11a6409e5951f6c7537be1f1b7682b9ae9230368" dependencies = [ "proc-macro2", ] @@ -1609,9 +1609,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.40" +version = "0.23.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" +checksum = "6b92b125634d9b795e7beca796cc790df15a7fb38323bf3196fda83292d06b1f" dependencies = [ "aws-lc-rs", "once_cell", @@ -2283,7 +2283,7 @@ checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" [[package]] name = "wal-rus" -version = "0.1.1" +version = "0.2.0" dependencies = [ "anyhow", "astral-tokio-tar", @@ -2358,9 +2358,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.125" +version = "0.2.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" +checksum = "4b067c0c11094aef6b7a801c1e34a26affafdf3d051dba08456b868789aaf9a4" dependencies = [ "cfg-if", "once_cell", @@ -2371,9 +2371,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.75" +version = "0.4.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280" +checksum = "c62df1340f32221cb9c54d6a27b030e3dba64361d4a95bed55f9aacb44da291d" dependencies = [ "js-sys", "wasm-bindgen", @@ -2381,9 +2381,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.125" +version = "0.2.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" +checksum = "167ce5e579f6bcf889c4f7175a8a5a585de84e8ff93976ce393efa5f2837aab1" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2391,9 +2391,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.125" +version = "0.2.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" +checksum = "f3997c7839262f4ef12cf90b818d6340c18e80f263f1a94bf157d0ec4420380e" dependencies = [ "bumpalo", "proc-macro2", @@ -2404,9 +2404,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.125" +version = "0.2.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" +checksum = "dc1b4cb0cc549fcf58d7dfc081778139b3d283a081644e833e84682ad71cea24" dependencies = [ "unicode-ident", ] @@ -2426,9 +2426,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.102" +version = "0.3.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d" +checksum = "8622dcb61c0bcc9fffa6938bed81210af2da9a7e4a1a834b2e37a59b6dfb6141" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 16d0753..bc15f34 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "wal-rus" -version = "0.1.1" +version = "0.2.0" edition = "2024" rust-version = "1.89" description = "Rust port of wal-g for PostgreSQL, optimized for no-overcommit hosts" diff --git a/bench/.gitignore b/bench/.gitignore index c22fb29..98dca7d 100644 --- a/bench/.gitignore +++ b/bench/.gitignore @@ -4,12 +4,15 @@ /.venv/ __pycache__/ -# rust build artifacts +# rust build artifacts + source tarballs from make_source_tarball.sh /tools/target/ +/walrus-src-*.tar.gz -# terraform state + generated SSH key + run log (never commit) +# terraform state + generated SSH key + run log + local var overrides (never commit) +/terraform/tfplan /terraform/.terraform/ /terraform/*.tfstate /terraform/*.tfstate.* +/terraform/*.tfvars /terraform/walrus_bench_key.pem -/terraform/apply.log +/terraform/*.log diff --git a/bench/README.md b/bench/README.md index 5ee4d1c..363c247 100644 --- a/bench/README.md +++ b/bench/README.md @@ -25,15 +25,19 @@ matrix.sh loop pgbackrest/walg/walrus × repeats run_op.sh run ONE operation cell (op × tool × run-id) op_matrix.sh loop backup-send/backup-fetch/wal-receive × tools × repeats scripts/lib.sh shared driver scaffolding sourced by run.sh + run_op.sh +scripts/make_source_tarball.sh git archive walrus src for a checkout-less SUT scripts/sut/ per-host bootstrap steps (00..40, systemd units) scripts/driver/ pgbench workload: schema, seed, burst (FPI storm) -tools/ Rust crate (standalone): bench-sampler + bench-analyze bins +tools/ Rust crate (standalone): bench-sampler + bench-analyze + + bench-compare bins; shared colorscheme/render lib (viz) ``` `bench-sampler` (1 Hz mem/CPU/WAL/backlog sampler; `--daemon walg|walrus|pgbackrest` -picks unit-MainPID vs proc-match) and `bench-analyze` (aggregated plots + -self-describing CSV/JSON exports) are built from `tools/` by `setup.sh` and installed -to `/usr/local/bin`. +picks unit-MainPID vs proc-match), `bench-analyze` (aggregated time-series plots + +self-describing CSV/JSON exports), and `bench-compare` (grouped-bar cross-tool backup +comparison) are built from `tools/` by `setup.sh` and installed to `/usr/local/bin`. +All three draw through one `bench_tools::viz` module, so the dark palette (walrus +yellow, wal-g red, pgbackrest blue) is identical across every chart. ## Prerequisites @@ -43,12 +47,46 @@ to `/usr/local/bin`. Debian 13; any PGDG-supported release should work. - An **S3 bucket** and credentials. `walrus` reads credentials from the **environment only** (no IMDS, no shared-config profiles), so `config.env` must carry explicit keys. +- The scripts build `walrus` from an **in-repo checkout** by default (`bench/` must + sit inside the repo). For a fresh SUT with no checkout, package the source on the dev + box with `bench/scripts/make_source_tarball.sh` (a `git archive`; the commit id rides + in the tarball so provenance is preserved), get it onto the box, and set + `WALRUS_SRC_TARBALL` in `config.env` — `03_build_walrus.sh` unpacks and builds from + it. `terraform/` can do all of this for you (see [Provisioning with terraform](#provisioning-with-terraform)). - Conventional paths the scripts assume: `PGDATA=/dat/18/data`, daemon env file `/etc/postgresql/wal-g.env`, daemon socket `/tmp/wal-g`, PG binaries under `/usr/lib/postgresql/18/bin`. A spare NVMe is mounted at `/dat` by `scripts/sut/00_mount_nvme.sh` (AWS instance-store oriented) — on other hosts set `SKIP_MOUNT=1` and provide `/dat` on a fast disk yourself. +## Provisioning with terraform + +`terraform/` stands up the EC2 SUT (VPC, instance-store box, S3 bucket, IAM +instance profile, SSH key). It can also ship the source and bootstrap the box, so a +single `apply` goes from nothing to ready-to-bench: + +```sh +t=$(bench/scripts/make_source_tarball.sh) # git archive of HEAD +terraform -chdir=bench/terraform apply \ + -var my_ip="$(curl -s ifconfig.me)/32" \ + -var walrus_src_tarball="$t" \ + -var run_setup=true \ + -var pg_password='change-me' +``` + +- `walrus_src_tarball` (alone) uploads + unpacks the self-contained archive (whole repo, + harness included) to `~/walrus`, keeping the tarball at `~/walrus-src.tar.gz`. +- `run_setup=true` then writes `~/walrus/bench/config.env` (from + `config.env.tftpl`) and runs `setup.sh` over SSH — installs PG18, builds all three + tools, starts the units. Needs `pg_password`. S3 creds come from the **instance + profile via IMDS**, so no AWS keys are written. Re-runs when the source or a config + knob changes (a password-only change won't retrigger — `terraform taint` to force). +- Omit both to get a bare box and deploy by hand. See `terraform/deploy.tf`. + +Workload **sizing** (`SCALE`, `CHURN_*`, `*_SECONDS`) is templated at defaults; tune it +by editing `config.env` on the box (or `config.env.tftpl` before apply). After apply, +`terraform output next_steps` prints the SSH line and what to run. + ## Run it ```sh @@ -111,8 +149,8 @@ rest of walrus's data movement, single-host, reusing the same 1 Hz sampler — h attached by `--proc-match` on the tool's process name, since these are one-shot CLI runs, not daemons. `backup-fetch` and `wal-receive` run with the archive daemons stopped, so the sample is the op process alone. The backup-push ops -(`backup-send`/`backup-delta`/`backup-delta-summaries`) keep the tool's own archive -daemon live — a base backup's `pg_backup_stop` blocks on `BackupWaitWalArchive` until +(`backup-send`/`backup-delta`/`backup-delta-sidecar`/`backup-delta-summaries`) keep the +tool's own archive daemon live — a base backup's `pg_backup_stop` blocks on `BackupWaitWalArchive` until its WAL is archived — so for those the sample is the op process plus the mostly-idle daemon (~27 MB for walrus; wal-g's fan-out daemon adds more baseline). @@ -120,33 +158,44 @@ daemon (~27 MB for walrus; wal-g's fan-out daemon adds more baseline). |---|---|---|---| | `backup-send` | `backup-push --full` | `backup --type=full` | full base backup → S3 | | `backup-fetch` | `backup-fetch LATEST` | `restore` | restore ← S3 | -| `backup-delta` | `backup-push` (delta, `wi1`) | `backup --type=incr` | delta backup → S3 | +| `backup-delta` | `backup-push` (delta, `wi1`) | `backup --type=incr` | delta backup → S3 (map from raw archived WAL) | +| `backup-delta-sidecar` | `backup-push` (delta, `wi1`), `WALG_USE_WAL_DELTA=1` on the daemon | — (no peer) | delta backup → S3 (map folds `_delta` sidecars) | | `backup-delta-summaries` | `backup-push --delta-from-wal-summaries` | — (walrus-only) | delta from PG17 WAL summaries → S3 | | `wal-receive` | `wal-receive ` | — (no equivalent) | stream WAL from PG | walrus's walsender (serving WAL over the replication protocol) has no CLI entry point yet, so `wal-send` is intentionally absent. -The two delta cells exercise walrus's incremental backup. `backup-delta` builds the -delta map by walking **archived WAL** (the default source; wal-g-comparable `wi1` -wire format, so it stays cross-tool). `backup-delta-summaries` instead sources the map -from `$PGDATA/pg_wal/summaries` (PG17 `summarize_wal=on`, enabled by `10_init_pg.sh`); -wal-g and pgbackrest have no WAL-summary delta, so it is walrus-only. Both first force -a checkpoint, then drive a `DELTA_CHURN_SECONDS` burst — with the archiver live so the -churn WAL is in the repo — drain, keep the archiver live, then time the delta push. -They need a parent full, so -`backup-send` must precede them. `DELTA_ORIGIN=LATEST_FULL` keeps each delta cell -anchored to the chain root by default; `DELTA_MAX_STEPS` still caps chain depth. +The three delta cells exercise walrus's incremental backup; they differ only in the +**changed-block map source** (the `backup-push` command and `wi1` wire format are +identical, so all stay cross-tool): + +- `backup-delta` — walks **archived raw WAL**, reparsed in full. The cold worst case + (no sidecars). +- `backup-delta-sidecar` — same source, but `WALG_USE_WAL_DELTA=1` is written to the + daemon env file, so the archiver records `_delta` sidecars during the churn; + the push then folds whole 16-segment groups and reparses only the trailing partial + group. A walrus/wal-g daemon feature (both honor the var); pgbackrest has no peer. +- `backup-delta-summaries` — sources the map from `$PGDATA/pg_wal/summaries` (PG17 + `summarize_wal=on`, enabled by `10_init_pg.sh`); wal-g and pgbackrest have no + WAL-summary delta, so it is walrus-only. + +All three first force a checkpoint, then drive a `DELTA_CHURN_SECONDS` burst — with the +archiver live so the churn WAL is in the repo — drain, keep the archiver live, then +time the delta push. They need a parent full, so `backup-send` must precede them. +`DELTA_ORIGIN=LATEST_FULL` keeps each delta cell anchored to the chain root by default; +`DELTA_MAX_STEPS` still caps chain depth. ```sh # one cell (assumes setup.sh ran; non-fetch ops need the seeded DB) ./run_op.sh backup-send walrus r1 ./run_op.sh backup-fetch walrus r1 # fetches LATEST; run a backup-send first ./run_op.sh backup-delta walrus r1 # churn → delta push; needs a parent full +./run_op.sh backup-delta-sidecar walrus r1 # WALG_USE_WAL_DELTA=1 archiver; folds sidecars ./run_op.sh backup-delta-summaries walrus r1 # walrus-only; needs summarize_wal=on ./run_op.sh wal-receive walrus r1 # streams for WAL_RECEIVE_SECONDS -# whole sweep: send → fetch → delta → delta-summaries → wal-receive +# whole sweep: send → fetch → delta → delta-sidecar → delta-summaries → wal-receive ./op_matrix.sh ``` @@ -155,7 +204,7 @@ Each cell writes the sampler CSVs plus `op_metrics.txt`: | Field | Meaning | |---|---| | `elapsed_s` | wall-clock of the operation | -| `bytes_processed` | backup-send: on-disk cluster size (excl. `pg_wal`); backup-fetch: restored bytes; backup-delta(-summaries): S3-inventory byte growth across the push (the delta's stored size); wal-receive: S3-inventory byte growth while receiver drains | +| `bytes_processed` | backup-send: on-disk cluster size (excl. `pg_wal`); backup-fetch: restored bytes; backup-delta / -sidecar / -summaries: S3-inventory byte growth across the push (the delta's stored size); wal-receive: S3-inventory byte growth while receiver drains | | `throughput_mb_s` | `bytes_processed / elapsed_s / 1e6` | | `checkpoint_before_workload` | `1` when cell forced a checkpoint before FPI-sensitive work (backup-send, delta churn, wal-receive); else `0` | | `delta_origin` | delta parent policy passed as `WALG_DELTA_ORIGIN` for walrus / wal-g delta cells | @@ -171,6 +220,31 @@ Notes: - pgbackrest `backup` (full or `incr`) needs live archiving, so those cells point `archive_command` at pgbackrest and drain first, as the archive bench does. +### Comparing ops across tools (`bench-compare`) + +`bench-compare` renders one grouped-bar chart from the per-op result dirs — bars +grouped by op, one bar per tool/variant (same colors as `bench-analyze`). It pulls +size + elapsed from `op_metrics.txt`, peak CPU from `cpu.csv`, and peak RSS (`VmHWM`) +from `mem.csv`; duration is log-scaled (op timings span ~20 s to ~500 s). Replicas of a +variant on one op are aggregated to the median; absent cells (e.g. walrus-only +`delta-summaries`) just leave a gap. + +```sh +# one --run/--label per result dir; variant = label sans -b +bench-compare \ + --run results/backup-send-walrus-v020 --label walrus \ + --run results/backup-send-walg-v020 --label walg \ + --run results/backup-send-pgbackrest-v020 --label pgbackrest \ + --run results/backup-delta-walrus-v020 --label walrus \ + --run results/backup-delta-sidecar-walrus-v020 --label walrus \ + ... \ + --out results/plots --stamp v020 +``` + +Outputs `backup_compare.png` (size / duration / CPU / memory panels), `ops_summary.md`, +and `ops_compare_.csv`. Restrict/reorder the x groups with `--ops` +(comma-separated op names; default is the backup family, `wal-receive` excluded). + ## Config knobs See `config.env.example`. Common ones: `UPLOAD_CONCURRENCY` (wal-g concurrency / diff --git a/bench/config.env.example b/bench/config.env.example index 8f1c057..cce4746 100644 --- a/bench/config.env.example +++ b/bench/config.env.example @@ -1,18 +1,13 @@ -# bench config — copy to config.env (gitignored) and fill in. -# Sourced by setup.sh, run.sh, matrix.sh. Plain bash; no quoting tricks needed. +# bench config, copy to config.env # --- S3 target --------------------------------------------------------------- BUCKET=my-wal-bench-bucket AWS_REGION=us-east-1 -# AWS credentials. walrus (wal-rs) reads credentials from the ENVIRONMENT ONLY -# — no IMDS, no shared-config profiles — so explicit keys are REQUIRED here; -# 11_write_walg_env.sh writes them into the daemons' env file. The same keys -# are used by the `aws` CLI for the run's S3 inventory + smoke test. -# (AWS_PROFILE would cover the CLI but NOT walrus, so set the keys.) +# Static AWS keys for off-EC2; delete on EC2 to use IMDS AWS_ACCESS_KEY_ID=AKIA... AWS_SECRET_ACCESS_KEY=... -# Only for temporary/STS credentials; leave unset for long-lived keys. +# Temporary/STS only #AWS_SESSION_TOKEN= # --- Postgres bench role ----------------------------------------------------- @@ -30,20 +25,24 @@ BURST_SECONDS=300 # high-WAL burst phase (the heavy-load measuremen #BURST_WORKERS= # defaults to driver nproc # --- operation benchmarks (run_op.sh / op_matrix.sh) ------------------------- -# backup-send / backup-fetch / backup-delta / backup-delta-summaries / -# wal-receive, cross-tool. Both dirs are wiped per run; point them at the fast -# disk (under /dat by convention). +# Point transient dirs at fast disk #RESTORE_DIR=/dat/restore # backup-fetch restores here #WAL_RECV_DIR=/dat/walrecv # wal-receive assembles segments here +# Sampler runs as postgres; keep results off a 0750 $HOME it can't traverse +#RESULTS_ROOT=/dat/bench-results # per-cell sampler CSVs + op_metrics land here WAL_RECEIVE_SECONDS=300 # wal-receive streaming window (burst drives WAL) -# Delta cells: churn window that dirties pages between the parent full and the -# delta push, and the delta-chain depth (WALG_DELTA_MAX_STEPS) for walrus/wal-g. +# Delta churn window + chain depth DELTA_CHURN_SECONDS=300 DELTA_MAX_STEPS=7 DELTA_ORIGIN=LATEST_FULL # --- host conventions (override only if your box differs) -------------------- -# BUILD_USER owns the rust/cargo toolchain (defaults to sudo invoker, then ubuntu). +# BUILD_USER owns rust/cargo toolchain #BUILD_USER=ubuntu -# CIDR allowed in pg_hba; single-host driver == this box. +# Build walrus from a shipped source tarball instead of an in-repo checkout +# (fresh SUT with no .git). Make it on the dev box with +# bench/scripts/make_source_tarball.sh, scp it over, point this at it. +#WALRUS_SRC_TARBALL=/tmp/walrus-src.tar.gz +#WALRUS_SRC_DIR=/opt/walbench/src # where the tarball is unpacked +# CIDR allowed in pg_hba #DRIVER_CIDR=127.0.0.1/32 diff --git a/bench/matrix.sh b/bench/matrix.sh index f57ad90..800d5d1 100755 --- a/bench/matrix.sh +++ b/bench/matrix.sh @@ -4,10 +4,7 @@ # # RUN_ID - label for this sweep's result dirs (default r1). # -# Sweeps comparison on this host: pgbackrest, wal-g, walrus, once each, -# calling run.sh per cell. Single-host counterpart of the external fleet's -# orchestrate/run_matrix.sh (no SSH, and no GOMEMLIMIT-cap cell — that was a -# GC-policy experiment, not intrinsic footprint). +# Sweep archive-command cells across daemons set -euo pipefail SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" diff --git a/bench/op_matrix.sh b/bench/op_matrix.sh index b3190c4..5214650 100755 --- a/bench/op_matrix.sh +++ b/bench/op_matrix.sh @@ -4,24 +4,21 @@ # # RUN_ID - label for this sweep's result dirs (default r1). # -# Sweeps data-movement operation benchmarks (run_op.sh) on this host: -# backup-send, backup-fetch, backup-delta, backup-delta-summaries, then -# wal-receive — each across the tools that implement it, once. Op order matters: -# backup-send runs first so a parent full exists for backup-fetch to restore and -# for the delta cells to extend. +# Sweep data-movement ops across tools that implement them +# backup-send runs first so backup-fetch has something to restore +# Delta cells take their own parent full, avoiding cross-tool WAL gaps # -# Skipped cells: pgbackrest has no wal-receive equivalent; backup-delta-summaries -# is walrus-only (no wal-g / pgbackrest WAL-summary delta). Override OPS / TOOLS -# via env. Counterpart of matrix.sh (archive path). +# Skipped cells: pgbackrest has no wal-receive equivalent; backup-delta-sidecar +# has no pgbackrest peer (WALG_USE_WAL_DELTA is a walrus/wal-g daemon feature); +# backup-delta-summaries is walrus-only (no wal-g / pgbackrest WAL-summary delta). +# Override OPS / TOOLS via env # -# backup-delta-chain (DELTA_MAX_STEPS-deep chain + leaf restore) is omitted from -# the default sweep — it churns once per step, so its cost scales with depth. Opt -# in with OPS="backup-send backup-delta-chain" (backup-send must precede it). +# backup-delta-chain is opt-in: OPS="backup-delta-chain" set -euo pipefail SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" RUN_ID="${1:-${RUN_ID:-r1}}" -read -r -a OPS <<< "${OPS:-backup-send backup-fetch backup-delta backup-delta-summaries wal-receive}" +read -r -a OPS <<< "${OPS:-backup-send backup-fetch backup-delta backup-delta-sidecar backup-delta-summaries wal-receive}" read -r -a TOOLS <<< "${TOOLS:-pgbackrest walg walrus}" log() { printf '[op-matrix %s] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2; } @@ -38,6 +35,10 @@ for op in "${OPS[@]}"; do log "skip ${op}/${tool} (walrus-only)" continue fi + if [[ "${op}" == "backup-delta-sidecar" && "${tool}" == "pgbackrest" ]]; then + log "skip ${op}/${tool} (no WALG_USE_WAL_DELTA peer)" + continue + fi log "=== run ${op} ${tool} ${RUN_ID} ===" "${SCRIPT_DIR}/run_op.sh" "${op}" "${tool}" "${RUN_ID}" done diff --git a/bench/run.sh b/bench/run.sh index eb3c8a4..2bf0f9b 100755 --- a/bench/run.sh +++ b/bench/run.sh @@ -5,22 +5,13 @@ # DAEMON - walg | walrus | pgbackrest (which archiver to exercise) # RUN_ID - free-form label, e.g. r1 / 2026-06-22 # -# Drives ONE benchmark cell end to end on THIS host (PG + daemon + pgbench all -# local). Single-host counterpart of the external fleet's orchestrate/run_one.sh, -# with SSH and IMDS removed: credentials come from config.env, the workload runs -# against the local cluster. +# Drive one archive-command benchmark cell on this host # -# 1. Select the daemon: write wal-g.env (this cell's concurrency), start its -# unit, point archive_command at the tool's own client, pre-drain backlog. -# (pgbackrest is daemonless: set process-max, stanza, archive-push, drain.) -# 2. Reset pg_stat_archiver, start the 1 Hz sampler into the results dir. -# 3. Run the workload (high-WAL burst) against local PG. -# 4. Stop the sampler, capture the S3 inventory + provenance. +# Select archiver, drain backlog, sample high-WAL burst, capture inventory # # Results land under bench/results/-/ (override RESULTS_ROOT). # Assumes ./setup.sh has run and the bench DB is seeded (pgbench_init.sh). -# Run as a normal user (it uses sudo for the root steps); do not run pgbench as -# root. +# Run as normal user, sudo handles root steps set -euo pipefail SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" @@ -51,8 +42,7 @@ REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." >/dev/null 2>&1 && pwd)" PG_ENV_FILE="/etc/postgresql/wal-g.env" AWS_REGION="${AWS_REGION:-us-east-1}" COMPRESSION="${WALG_COMPRESSION_METHOD:-lz4}" -# Scope the archive prefix per daemon+run (same bucket = same destination storage, -# fair comparison) so cells of a sweep do not pile WAL into one shared prefix. +# Isolate each daemon+run under one bucket WALG_PREFIX="s3://${BUCKET}/walg-bench/${DAEMON}/${RUN_ID}" PGBACKREST_REPO_PATH="/pgbackrest-bench/${DAEMON}/${RUN_ID}" PGBACKREST_STANZA="walbench" @@ -116,11 +106,7 @@ REMOTE drain_backlog 10 300 else log "writing ${PG_ENV_FILE} + selecting ${DAEMON} (own daemon-client)" - # 11 writes the shared env file from the env credentials; 30 stops the other - # unit, starts this one, points archive_command at its own client, waits the - # socket. Both inherit BUCKET/creds/region from this shell. ENV_FILE is pinned - # to the daemon env path so a caller-set ENV_FILE (config selector) preserved - # by sudo -E cannot redirect 11's OUTPUT onto the config file. + # Pin ENV_FILE so sudo -E cannot turn config selector into env-file output ENV_FILE="${PG_ENV_FILE}" \ BUCKET="${BUCKET}" UPLOAD_CONCURRENCY="${UPLOAD_CONCURRENCY}" \ WALG_S3_PREFIX="${WALG_PREFIX}" \ diff --git a/bench/run_op.sh b/bench/run_op.sh index 2b51427..59ccb09 100755 --- a/bench/run_op.sh +++ b/bench/run_op.sh @@ -2,61 +2,54 @@ # # run_op.sh OP TOOL RUN_ID # -# OP - backup-send | backup-fetch | backup-delta | backup-delta-summaries | -# backup-delta-chain | wal-receive (data-movement operation) +# OP - backup-send | backup-fetch | backup-delta | backup-delta-sidecar | +# backup-delta-summaries | backup-delta-chain | wal-receive +# (data-movement operation) # TOOL - walrus | walg | pgbackrest (implementation) # RUN_ID - free-form label, e.g. r1 / 2026-06-22 # -# Benchmarks ONE data-movement operation with ONE tool, single-host (PG + tool -# local), cross-tool where an equivalent exists. Counterpart of run.sh, which -# benches the archive_command (wal-push) path; this covers the rest of walrus: +# Benchmark one data-movement operation with one local tool # # backup-send base backup -> S3 walrus/wal-g backup-push ... --full | # pgbackrest backup --type=full # backup-fetch restore <- S3 walrus/wal-g backup-fetch | pgbackrest restore # backup-delta delta backup -> S3 walrus/wal-g backup-push (wi1) | pgbackrest backup --type=incr +# backup-delta-sidecar delta, archiver walrus/wal-g backup-push (wi1), | (no pgbackrest peer) +# pre-records WALG_USE_WAL_DELTA=1 on the daemon +# _delta # backup-delta-summaries delta from WAL walrus backup-push | (walrus-only) # summaries -> S3 --delta-from-wal-summaries # backup-delta-chain N-deep delta chain walrus/wal-g backup-push xN | pgbackrest backup --type=incr xN # + restore of leaf (origin=LATEST), then backup-fetch LATEST # wal-receive stream WAL from PG walrus/wal-g wal-receive | (no pgbackrest peer) # -# Delta cells need a parent full backup (backup-send must precede them) and a -# churn phase: they configure the tool, checkpoint, drive a DELTA_CHURN_SECONDS -# burst with the archiver live (the default delta map walks archived WAL), -# drain, then time the delta push while archiver stays live. backup-delta-summaries -# instead sources the delta map from $PGDATA/pg_wal/summaries (needs -# summarize_wal=on, set by 10_init_pg.sh) and is walrus-only (no wal-g / -# pgbackrest peer). DELTA_ORIGIN defaults to LATEST_FULL so both delta paths -# anchor to chain root. Delta size is S3-inventory byte growth across the push, -# not on-disk cluster size. +# Delta cells take a fresh parent full, churn, drain, then time delta push +# with tool archiver still live. Variants differ only by changed-block source: +# backup-delta archived raw WAL, reparsed in full (no sidecars) -- the +# cold worst case +# backup-delta-sidecar same, but WALG_USE_WAL_DELTA=1 on the archiver pre-records +# _delta sidecars during the churn, so the push folds +# whole groups and reparses only the trailing partial group +# (walrus + wal-g; pgbackrest has no peer) +# backup-delta-summaries $PGDATA/pg_wal/summaries (needs summarize_wal=on, set by +# 10_init_pg.sh); walrus-only +# DELTA_ORIGIN defaults to LATEST_FULL, the in-cell full +# Delta size is S3-inventory byte growth across the push, not on-disk cluster size. # -# backup-delta-chain builds a real DELTA_MAX_STEPS-deep chain: each step churns, -# drains, then pushes a delta with WALG_DELTA_ORIGIN=LATEST so it extends the -# PREVIOUS delta (LATEST_FULL would re-anchor each to the root, leaving restore -# depth 2). Every step is timed + sized on its own (chain_metrics.txt), then a -# backup-fetch LATEST walks full + all N deltas to exercise restore-time replay. -# Its churn is per-step and INSIDE the sampler window, so the daemon's archiving -# during churn is sampled too; the per-step push timings isolate the push. +# backup-delta-chain uses WALG_DELTA_ORIGIN=LATEST, then restores leaf backup +# chain_metrics.txt records per-step push timing + size # -# walrus's walsender (serving WAL via the replication protocol) has no CLI entry -# point yet, so wal-send is intentionally absent. +# wal-send is absent until walrus exposes walsender CLI # -# The 1 Hz sampler is reused, here attached with --proc-match : these -# ops are one-shot CLI processes, not systemd units. Both archive daemons are -# stopped first; backup-push ops (NEEDS_ARCHIVE) then start ONLY the tool's own -# daemon and leave it up across the push (pg_backup_stop blocks on WAL archival), -# so for those the sample is the op process plus the mostly-idle daemon (~27 MB -# for walrus). backup-fetch / wal-receive run with no daemon — op process only. +# Sampler attaches with --proc-match . Backup-push samples op + daemon +# because pg_backup_stop waits for WAL archival # -# Results: bench/results/--/ — sampler CSVs, op_metrics.txt +# Results: bench/results/--/, sampler CSVs, op_metrics.txt # (elapsed, bytes processed, MB/s), provenance.txt, s3_inventory.txt. Override # RESULTS_ROOT to relocate. # -# Assumes ./setup.sh has run. backup-send and wal-receive also assume the bench -# DB is seeded (pgbench_init.sh); backup-fetch assumes a compatible backup-send -# already produced a backup to fetch. Run as a normal user (uses sudo for root -# steps); do not run pgbench as root. +# Assumes setup + seeded DB, backup-fetch needs prior compatible backup-send +# Run as normal user, sudo handles root steps set -euo pipefail SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" @@ -66,7 +59,7 @@ LOG_TAG=op load_config if [[ $# -ne 3 ]]; then - echo "usage: $0 " >&2 + echo "usage: $0 " >&2 exit 2 fi OP="$1" @@ -74,8 +67,8 @@ TOOL="$2" RUN_ID="$3" case "${OP}" in - backup-send|backup-fetch|backup-delta|backup-delta-summaries|backup-delta-chain|wal-receive) ;; - *) echo "error: OP must be backup-send|backup-fetch|backup-delta|backup-delta-summaries|backup-delta-chain|wal-receive, got '${OP}'" >&2; exit 2 ;; + backup-send|backup-fetch|backup-delta|backup-delta-sidecar|backup-delta-summaries|backup-delta-chain|wal-receive) ;; + *) echo "error: OP must be backup-send|backup-fetch|backup-delta|backup-delta-sidecar|backup-delta-summaries|backup-delta-chain|wal-receive, got '${OP}'" >&2; exit 2 ;; esac case "${TOOL}" in walrus|walg|pgbackrest) ;; @@ -85,26 +78,31 @@ if [[ "${OP}" == "wal-receive" && "${TOOL}" == "pgbackrest" ]]; then echo "error: pgbackrest has no wal-receive equivalent (skip this cell)" >&2 exit 2 fi -# WAL-summary-sourced delta is a walrus-only path (no wal-g / pgbackrest peer). +# WAL-summary delta is walrus-only if [[ "${OP}" == "backup-delta-summaries" && "${TOOL}" != "walrus" ]]; then echo "error: backup-delta-summaries is walrus-only (skip this cell)" >&2 exit 2 fi +# WALG_USE_WAL_DELTA has no pgbackrest peer +if [[ "${OP}" == "backup-delta-sidecar" && "${TOOL}" == "pgbackrest" ]]; then + echo "error: backup-delta-sidecar has no pgbackrest equivalent (skip this cell)" >&2 + exit 2 +fi -# Single-delta ops drive one churn phase, then one delta push; group for branches. +# Single-delta ops churn once, then push once IS_DELTA=0 -[[ "${OP}" == "backup-delta" || "${OP}" == "backup-delta-summaries" ]] && IS_DELTA=1 -# Chain op churns + pushes per step inside the timed loop (not the single step 1b). +[[ "${OP}" == "backup-delta" || "${OP}" == "backup-delta-sidecar" || "${OP}" == "backup-delta-summaries" ]] && IS_DELTA=1 +# Sidecar variant records _delta during churn +USE_WAL_DELTA="" +[[ "${OP}" == "backup-delta-sidecar" ]] && USE_WAL_DELTA=1 +# Chain op churns + pushes inside timed loop IS_CHAIN=0 [[ "${OP}" == "backup-delta-chain" ]] && IS_CHAIN=1 -# Backup-push ops (full + delta) take a base backup, whose pg_backup_stop blocks -# on BackupWaitWalArchive until the backup's WAL is archived. So the tool's -# archiver MUST stay live across these cells (the sampler then sees the op -# process plus the mostly-idle daemon; for walrus that baseline is ~27 MB). +# Backup-push needs live archiver, pg_backup_stop waits for WAL archival # backup-fetch (restore) and wal-receive need no archiver. NEEDS_ARCHIVE=0 -case "${OP}" in backup-send|backup-delta|backup-delta-summaries|backup-delta-chain) NEEDS_ARCHIVE=1 ;; esac +case "${OP}" in backup-send|backup-delta|backup-delta-sidecar|backup-delta-summaries|backup-delta-chain) NEEDS_ARCHIVE=1 ;; esac : "${BUCKET:?set BUCKET in config.env}" : "${PGUSER:?set PGUSER in config.env}" @@ -115,9 +113,7 @@ case "${OP}" in backup-send|backup-delta|backup-delta-summaries|backup-delta-cha REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." >/dev/null 2>&1 && pwd)" AWS_REGION="${AWS_REGION:-us-east-1}" COMPRESSION="${WALG_COMPRESSION_METHOD:-lz4}" -# Scope the prefix per tool+run (same bucket = same destination storage, fair -# comparison) so fetch LATEST / implicit delta-parent resolve only within current -# tool/run backups, never another tool's or a prior sweep's. +# Isolate LATEST and delta parents per tool+run WALG_PREFIX="s3://${BUCKET}/walg-bench/${TOOL}/${RUN_ID}" PGBACKREST_REPO_PATH="/pgbackrest-bench/${TOOL}/${RUN_ID}" PGBACKREST_STANZA="walbench" @@ -129,12 +125,11 @@ WALG_BIN="/usr/bin/wal-g" RESULTS_ROOT="${RESULTS_ROOT:-${SCRIPT_DIR}/results}" RESULT_DIR="${RESULTS_ROOT}/${OP}-${TOOL}-${RUN_ID}" SAMPLER="/usr/local/bin/bench-sampler" -# Where backup-fetch restores into and wal-receive assembles segments. +# Restore and WAL-receive staging dirs RESTORE_DIR="${RESTORE_DIR:-/dat/restore}" WAL_RECV_DIR="${WAL_RECV_DIR:-/dat/walrecv}" WAL_RECEIVE_SECONDS="${WAL_RECEIVE_SECONDS:-300}" -# Delta cells: churn window that dirties pages between the parent full and the -# delta push, and the delta-chain depth handed to walrus/wal-g (WALG_DELTA_MAX_STEPS). +# Delta churn window + max chain depth DELTA_CHURN_SECONDS="${DELTA_CHURN_SECONDS:-300}" DELTA_MAX_STEPS="${DELTA_MAX_STEPS:-3}" DELTA_ORIGIN="${DELTA_ORIGIN:-LATEST_FULL}" @@ -150,9 +145,7 @@ else INV_PREFIX="${WALG_PREFIX}" fi -# Run a walrus/wal-g command as postgres with the daemon env file sourced -# (WALG_S3_PREFIX, AWS creds, region, compression, PGHOST). Absolute paths, so -# no reliance on the postgres login PATH. +# Run walrus/wal-g as postgres with daemon env run_tool() { sudo -u postgres bash -c ' set -a @@ -162,41 +155,52 @@ run_tool() { ' _ "$@" } -# Current WAL position as an absolute byte offset (for wal-receive throughput). +# Current WAL position, absolute bytes lsn_bytes() { PGPASSWORD="${PGPASSWORD}" psql -h "${PGHOST_DRIVER}" -U "${PGUSER}" -d walbench \ -tAc "SELECT pg_wal_lsn_diff(pg_current_wal_lsn(),'0/0')" } -# Total bytes stored under the tool's S3 prefix (delta cells diff before/after -# the push to size the increment). Empty/zero when the prefix has no objects. +# Total stored bytes under tool S3 prefix inv_size() { sudo aws s3 ls --recursive --summarize "${INV_PREFIX}/" --region "${AWS_REGION}" 2>/dev/null \ | awk '/Total Size:/ {print $3}' | tail -1 } -# Fail fast if no parent backup exists for a delta to anchor to. Without one, -# backup-push silently emits a FULL (mislabeled as a delta) and inv-growth sizing -# reports a full's bytes. op_matrix runs backup-send first; this guards lone runs. +# Verify parent full under tool prefix +# Without one, backup-push silently emits full backup assert_delta_parent() { local roots if [[ "${TOOL}" == "pgbackrest" ]]; then - # full backup-set dirs end in 'F/'; incr (delta) dirs end in 'I/' + # Full backup-set dirs end in F/, incr dirs end in I/ roots="$(sudo aws s3 ls "s3://${BUCKET}${PGBACKREST_REPO_PATH}/backup/${PGBACKREST_STANZA}/" \ --region "${AWS_REGION}" 2>/dev/null | awk '/ PRE / && /F\/$/ {n++} END{print n+0}')" else - # walrus/wal-g chain root = base_ without the _D_ delta suffix + # Chain root is base_ without _D_ suffix roots="$(sudo aws s3 ls "${WALG_PREFIX}/basebackups_005/" \ --region "${AWS_REGION}" 2>/dev/null | awk '/ PRE base_/ && !/_D_/ {n++} END{print n+0}')" fi if [[ "${roots:-0}" -eq 0 ]]; then - echo "error: no parent full backup under ${INV_PREFIX}; run backup-send ${TOOL} ${RUN_ID} first" >&2 + echo "error: no parent full under ${INV_PREFIX} after take_parent_full (the --full push may have failed)" >&2 echo " (a delta with no parent silently becomes a full, corrupting the measurement)" >&2 exit 1 fi log "parent check: ${roots} full backup(s) under ${INV_PREFIX}" } +# Take parent full inside delta cell +# Keeps parent-to-push WAL under current tool prefix +take_parent_full() { + log "delta-prep: fresh parent full via ${TOOL} (anchors delta inside this cell, archiver live)" + case "${TOOL}" in + walrus) run_tool "${WALRUS_BIN}" backup-push "${PGDATA_DIR}" --full ;; + walg) run_tool "${WALG_BIN}" backup-push "${PGDATA_DIR}" --full ;; + pgbackrest) sudo -u postgres pgbackrest --stanza="${PGBACKREST_STANZA}" backup --type=full ;; + esac + log "delta-prep: draining the parent full's WAL before churn" + drain_backlog 5 600 +} + # --- pre-flight: DB seeded? (backup-send + wal-receive need a populated DB) --- [[ "${OP}" == "backup-fetch" ]] || require_seeded @@ -204,8 +208,7 @@ log "op=${OP} tool=${TOOL} run_id=${RUN_ID} concurrency=${UPLOAD_CONCURRENCY}" CHECKPOINT_BEFORE_WORKLOAD=0 # --- step 1: tool config ----------------------------------------------------- -# Stop both archive daemons so neither pollutes proc-match (they share the -# 'walrus'/'wal-g' comm with the op process) and so they do not race archiving. +# Stop daemons before proc-match sampling and archive selection run_root <<'REMOTE' set -euo pipefail systemctl stop wal-g.service walrus.service 2>/dev/null || true @@ -230,9 +233,7 @@ echo "process-max -> $(grep -E '^process-max=' "${CONF}")" echo "repo1-path -> $(grep -E '^repo1-path=' "${CONF}")" sudo -u postgres pgbackrest --stanza="${STANZA}" stanza-create || true -# backup (full or incr) needs WAL archiving live (pgbackrest blocks on the -# start-WAL archive), so point archive_command at pgbackrest and drain. restore -# reads only the repo. backup-delta (incr) churns + drains in the delta-prep step. +# pgbackrest backup needs archive_command live, restore reads repo only if [[ "${OP}" == "backup-send" || "${OP}" == "backup-delta" || "${OP}" == "backup-delta-chain" ]]; then ARCHIVE_CMD="pgbackrest --stanza=${STANZA} archive-push %p" sudo -u postgres "${PGBIN}/psql" -p 5432 -tA \ @@ -247,20 +248,17 @@ REMOTE [[ "${NEEDS_ARCHIVE}" -eq 1 ]] && { log "pre-drain leftover backlog"; drain_backlog 10 300; } else log "writing /etc/postgresql/wal-g.env for ${TOOL}" - # Pin ENV_FILE to the daemon env path: 11_write_walg_env.sh reads ENV_FILE as - # its OUTPUT target, and sudo -E would otherwise leak a caller-set ENV_FILE - # (our config-file selector) and clobber it. + # Pin ENV_FILE so sudo -E cannot redirect env output to config file ENV_FILE="/etc/postgresql/wal-g.env" \ BUCKET="${BUCKET}" UPLOAD_CONCURRENCY="${UPLOAD_CONCURRENCY}" \ WALG_S3_PREFIX="${WALG_PREFIX}" \ AWS_REGION="${AWS_REGION}" WALG_COMPRESSION_METHOD="${COMPRESSION}" \ + WALG_USE_WAL_DELTA="${USE_WAL_DELTA}" \ AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:-}" AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:-}" \ AWS_SESSION_TOKEN="${AWS_SESSION_TOKEN:-}" \ sudo -E bash "${SCRIPT_DIR}/scripts/sut/11_write_walg_env.sh" - # backup-push ops need a live archiver (see NEEDS_ARCHIVE): start the tool's - # daemon and point archive_command at its own client, then pre-drain leftover - # backlog. backup-fetch / wal-receive skip this (no archiving needed). + # backup-push needs live tool archiver; fetch/receive do not if [[ "${NEEDS_ARCHIVE}" -eq 1 ]]; then log "starting ${TOOL} archive daemon (backup-push waits on WAL archival at stop)" sudo bash "${SCRIPT_DIR}/scripts/sut/30_select_daemon.sh" "${TOOL}" @@ -274,16 +272,15 @@ if [[ "${OP}" == "backup-send" || "${OP}" == "wal-receive" ]]; then CHECKPOINT_BEFORE_WORKLOAD=1 fi -# Delta ops must extend an existing full; bail before churning if none exists. -[[ "${IS_DELTA}" -eq 1 || "${IS_CHAIN}" -eq 1 ]] && assert_delta_parent +# Delta/chain ops take parent full inside current tool prefix +if [[ "${IS_DELTA}" -eq 1 || "${IS_CHAIN}" -eq 1 ]]; then + take_parent_full + assert_delta_parent +fi -# --- step 1b: delta prep — churn between the parent full and the delta push --- -# The default delta map walks ARCHIVED WAL, so the churn WAL must reach the repo -# before the push. The tool's archiver is already live (step 1, NEEDS_ARCHIVE) -# and STAYS up through the push: pg_backup_stop blocks on WAL archival, so a push -# without a live archiver hangs. Just churn, then drain so the map is complete. -# (backup-delta-summaries sources the map from local pg_wal/summaries instead, -# but archiving the churn still lets pg_wal recycle and keeps the parent valid.) +# --- step 1b: delta prep, churn between parent full and delta push ------------ +# Default delta map reads archived WAL, so drain churn WAL before push +# Summaries path still archives churn so pg_wal can recycle if [[ "${IS_DELTA}" -eq 1 ]]; then log "delta-prep: checkpoint before churn" checkpoint_pg @@ -315,12 +312,12 @@ case "${OP}" in walg) run_tool "${WALG_BIN}" backup-push "${PGDATA_DIR}" --full ;; pgbackrest) sudo -u postgres pgbackrest --stanza="${PGBACKREST_STANZA}" backup --type=full ;; esac - # bytes processed = on-disk cluster size, excluding WAL (the backup payload) + # bytes processed = on-disk cluster size, excluding WAL BYTES="$(sudo du -sb --exclude=pg_wal "${PGDATA_DIR}" | awk '{print $1}')" ;; - backup-delta) + backup-delta|backup-delta-sidecar) inv_before="$(inv_size)"; inv_before="${inv_before:-0}" - log "delta backup -> ${INV_PREFIX} (wi1; origin=${DELTA_ORIGIN}; parent inventory ${inv_before} B)" + log "delta backup -> ${INV_PREFIX} (wi1; origin=${DELTA_ORIGIN}; sidecars=${USE_WAL_DELTA:-0}; parent inventory ${inv_before} B)" case "${TOOL}" in walrus) run_tool env WALG_DELTA_MAX_STEPS="${DELTA_MAX_STEPS}" \ WALG_DELTA_ORIGIN="${DELTA_ORIGIN}" \ @@ -330,7 +327,7 @@ case "${OP}" in "${WALG_BIN}" backup-push "${PGDATA_DIR}" ;; pgbackrest) sudo -u postgres pgbackrest --stanza="${PGBACKREST_STANZA}" backup --type=incr ;; esac - # bytes processed = inventory growth = the delta's stored (compressed) size + # bytes processed = inventory growth, compressed inv_after="$(inv_size)"; inv_after="${inv_after:-0}" BYTES=$(( inv_after - inv_before )); (( BYTES < 0 )) && BYTES=0 ;; @@ -344,10 +341,7 @@ case "${OP}" in BYTES=$(( inv_after - inv_before )); (( BYTES < 0 )) && BYTES=0 ;; backup-delta-chain) - # Build a DELTA_MAX_STEPS-deep chain (origin=LATEST: each delta extends the - # prior one). Per step: churn, drain, then time + size the push alone. BYTES - # accumulates per-step delta payloads (not END-START inventory: that would - # also count the inter-step churn WAL). chain_metrics.txt holds the breakdown. + # Each delta extends prior one; count pushed delta bytes, not inter-step WAL DELTA_ORIGIN=LATEST CHAIN_METRICS="${RESULT_DIR}/chain_metrics.txt" push_s_total=0 @@ -455,15 +449,12 @@ rm -rf "${WAL_RECV_DIR}" install -d -o postgres -g postgres "${WAL_RECV_DIR}" REMOTE if [[ "${TOOL}" == "walrus" ]]; then - # archive_dir is a rotation buffer: walrus uploads each rotated segment to - # WALG_S3_PREFIX, the SAME S3 destination wal-g streams to. Both are scored - # by what lands in storage (below), not where they stage locally. + # walrus stages locally, both tools are scored by S3 bytes recv_cmd=("${WALRUS_BIN}" wal-receive "${WAL_RECV_DIR}") else recv_cmd=("${WALG_BIN}" wal-receive) fi - # Launch as postgres with the env file sourced; redirect INSIDE sudo so the - # log lands in the postgres-owned results dir. Background the sudo wrapper. + # Redirect inside sudo so postgres owns receiver log sudo -u postgres bash -c ' set -a; . /etc/postgresql/wal-g.env; set +a log="$1"; shift @@ -487,10 +478,8 @@ REMOTE fi lsn_end="$(lsn_bytes)" - # Throughput = WAL that actually LANDED in the S3 destination, not WAL - # generated by PG (pg_current_wal_lsn advances regardless of receiver lag). - # Uploads are async, so keep the receiver alive and poll the inventory until - # it stops growing before sizing receipt. + # Throughput = WAL stored in S3, not WAL generated by PG + # Poll async uploads before sizing receipt log "draining receiver uploads into ${INV_PREFIX}" recv_after="${recv_before}"; prev="" for _ in $(seq 1 30); do @@ -504,7 +493,7 @@ REMOTE gen=$(( lsn_end - lsn_start )) log "wal-receive: generated=${gen} B (uncompressed) received=${BYTES} B (stored)" - # WAL generated but nothing stored => measured generation, not receipt. + # WAL generated but nothing stored means receipt was not measured if (( gen > 0 && BYTES == 0 )); then mark_invalid "wal-receive stored 0 B to ${INV_PREFIX} while ${gen} B WAL generated" fi @@ -559,6 +548,7 @@ write_provenance "${RESULT_DIR}" "${INV_PREFIX}" "${AWS_REGION}" \ "run_id=${RUN_ID}" \ "checkpoint_before_workload=${CHECKPOINT_BEFORE_WORKLOAD}" \ "delta_origin=${DELTA_ORIGIN}" \ + "use_wal_delta=${USE_WAL_DELTA:-0}" \ "harness_git=${HARNESS_GIT}" log "DONE: ${OP}-${TOOL}-${RUN_ID}" diff --git a/bench/scripts/driver/pgbench_init.sh b/bench/scripts/driver/pgbench_init.sh index 7982efe..af81352 100755 --- a/bench/scripts/driver/pgbench_init.sh +++ b/bench/scripts/driver/pgbench_init.sh @@ -2,13 +2,9 @@ # # pgbench_init.sh # -# Initialize the benchmark database on the SUT (driven over the network): -# 1. create the 'walbench' database if it is absent -# 2. pgbench -i -s "$SCALE" to lay down the standard TPC-B tables -# 3. apply gen_schema.sql to add the WAL-churn / bulk-COPY workload tables +# Initialize benchmark DB over network # -# All connection parameters come from libpq env vars so no IPs/passwords are -# hardcoded. Set PGHOST to the SUT private IP before running. +# Connection comes from libpq env vars # # Env vars (with defaults): # PGHOST (required) SUT private IP / host @@ -42,8 +38,7 @@ fi echo "==> Target: ${PGUSER}@${PGHOST}:${PGPORT}, database '${PGDATABASE}', scale ${SCALE}" -# 1. Create the database if it does not already exist. Connect to 'postgres' -# for the existence check / CREATE DATABASE. +# Create database if absent db_exists="$(psql -d postgres -At -c \ "SELECT 1 FROM pg_database WHERE datname = '${PGDATABASE}'")" @@ -54,12 +49,11 @@ else createdb "${PGDATABASE}" fi -# 2. Standard pgbench TPC-B tables at the requested scale. Init mode is -# single-threaded — pgbench rejects -j here ("cannot be used in init mode"). +# Standard pgbench TPC-B tables echo "==> pgbench -i -s ${SCALE} (this can take a while at large scale)." pgbench -i -s "${SCALE}" "${PGDATABASE}" -# 3. WAL-churn workload schema. CHURN_ROWS is passed as the :rows variable. +# WAL-churn workload schema echo "==> Applying gen_schema.sql with ${CHURN_ROWS} churn rows." psql -d "${PGDATABASE}" -v ON_ERROR_STOP=1 \ -v rows="${CHURN_ROWS}" \ diff --git a/bench/scripts/driver/run_workload.sh b/bench/scripts/driver/run_workload.sh index dffd64b..b77c871 100755 --- a/bench/scripts/driver/run_workload.sh +++ b/bench/scripts/driver/run_workload.sh @@ -2,13 +2,10 @@ # # run_workload.sh # -# Driver-side orchestrator for ONE benchmark cell. run.sh invokes it locally as: +# Driver workload wrapper for one benchmark cell: # PGHOST=.. PGUSER=.. PGPASSWORD=.. RUN_ID=.. bash scripts/driver/run_workload.sh # -# Runs the measured workload: the high-WAL burst phase, and BLOCKS until it -# finishes. The burst is the heavy-load phase we care about. Assumes the -# 'walbench' DB is already initialized (pgbench_init.sh ran once during setup) — -# it does NOT re-init. +# Runs high-WAL burst, assumes pgbench_init.sh already ran # # Env (with defaults): # PGHOST/PGUSER/PGPASSWORD (required; passed by run.sh) @@ -18,7 +15,7 @@ # CHURN_ROWS 2000000 must match pgbench_init.sh CHURN_ROWS # RUN_ID label, for logs only # -# Runs from scripts/driver/, next to the workload_burst.sh phase script. +# Runs next to workload_burst.sh set -euo pipefail SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" diff --git a/bench/scripts/driver/workload_burst.sh b/bench/scripts/driver/workload_burst.sh index 437a05f..1be2125 100755 --- a/bench/scripts/driver/workload_burst.sh +++ b/bench/scripts/driver/workload_burst.sh @@ -2,21 +2,12 @@ # # workload_burst.sh # -# High-WAL burst load, driven over the network at the SUT's PostgreSQL. -# Goal: generate WAL >= ~2x the single-daemon archive drain rate so the -# pg_wal *.ready backlog climbs into the hundreds/thousands and we can measure -# how each archiver daemon keeps up (or falls behind). +# High-WAL burst load against SUT PostgreSQL +# Target WAL >= ~2x single-daemon drain rate so the .ready backlog climbs, +# exposing how each archiver keeps up (or falls behind) # -# Strategy: -# * UPDATE storm -- random-row UPDATEs on wal_churn that mutate *indexed* -# columns (k1,k2,k3,tag,updated_at). After each checkpoint, the first touch -# of every heap + index page emits a full-page image (FPI), so random -# scatter across a wide, 5-B-tree table maximizes WAL bytes per row. -# * COPY storm -- a fraction of workers run large COPY batches into the -# unindexed wal_bulk table for bursty heap-insert WAL on top of the UPDATEs. -# -# Both are expressed as pgbench custom scripts run by N concurrent workers -# (one pgbench process per worker so COPY \. blocks behave) for DURATION. +# UPDATE workers dirty indexed rows; COPY workers bulk-insert unindexed rows +# One pgbench process per worker runs for DURATION # # Env vars (with defaults): # PGHOST (required) SUT private IP / host @@ -41,7 +32,7 @@ DURATION="${DURATION:-600}" CHURN_ROWS="${CHURN_ROWS:-2000000}" UPDATE_BATCH="${UPDATE_BATCH:-25}" COPY_ROWS="${COPY_ROWS:-50000}" -COPY_BLOB_REPEAT="${COPY_BLOB_REPEAT:-8}" # md5 repeats per blob; raises WAL bytes/row at ~same CPU +COPY_BLOB_REPEAT="${COPY_BLOB_REPEAT:-8}" # md5 repeats per blob PROTOCOL="${PROTOCOL:-prepared}" host_cpus="$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 8)" @@ -66,7 +57,7 @@ echo "==> update_batch=${UPDATE_BATCH} copy_rows=${COPY_ROWS} churn_rows=${CHURN # --- temp pgbench script files --------------------------------------------- WORKDIR="$(mktemp -d)" cleanup() { - # Kill any still-running worker pgbench processes, then remove temp scripts. + # Kill any remaining workers, then remove temp scripts if [[ -n "${WORKER_PIDS:-}" ]]; then # shellcheck disable=SC2086 kill ${WORKER_PIDS} 2>/dev/null || true @@ -78,9 +69,8 @@ trap cleanup EXIT INT TERM UPDATE_SQL="${WORKDIR}/update.sql" COPY_SQL="${WORKDIR}/copy.sql" -# UPDATE worker script: one random id per :rid, repeated UPDATE_BATCH times in a -# single transaction. Every mutated column is indexed so each row dirties the -# heap page plus several index pages -> heavy FPI WAL. +# UPDATE worker mutates indexed columns: post-checkpoint first page touch emits +# an FPI, so random scatter across the wide 5-index table maximizes WAL bytes/row { echo "\\set rmax ${CHURN_ROWS}" echo "BEGIN;" @@ -100,9 +90,7 @@ SQL echo "END;" } > "${UPDATE_SQL}" -# COPY worker script: build a COPY_ROWS-row batch on the server with -# generate_series feeding an INSERT...SELECT. This is the COPY-equivalent bulk -# heap-insert burst into the unindexed wal_bulk table, fully WAL-logged. +# COPY worker bulk-inserts fully logged rows into wal_bulk { echo "\\set batch random(1, 1000000000)" cat < "${COPY_SQL}" -# Reset wal_bulk so the COPY storm does not accumulate across cells/runs and -# fill the data volume — at COPY_ROWS=50000 a 10-min burst adds tens of GB, and -# nothing reclaimed it, so a multi-cell matrix on a finite NVMe eventually hit -# ENOSPC (PG crash + aborted workload). wal_bulk is pure churn (only there to -# emit heap-insert WAL), so truncating loses nothing and each cell starts clean. +# Reset COPY churn table: wal_bulk is pure churn, unreclaimed it adds tens of GB +# per burst until a multi-cell matrix hits ENOSPC (PG crash). Truncate loses +# nothing, each cell starts clean echo "==> TRUNCATE wal_bulk (bound data-volume growth across cells/runs)" psql -X -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE wal_bulk;" # --- launch workers --------------------------------------------------------- -# One pgbench process per worker (-c 1 -j 1), each looping its script for the -# whole DURATION. Running them as separate processes keeps a clean 1:1 mapping -# between workers and backends and isolates COPY workers from UPDATE workers. +# One pgbench process per worker keeps COPY and UPDATE isolated WORKER_PIDS="" launch_worker() { @@ -143,21 +127,20 @@ for i in $(seq 1 "${UPDATE_WORKERS}"); do launch_worker "update-${i}" "${UPDATE_SQL}" "${PROTOCOL}" done for i in $(seq 1 "${COPY_WORKERS}"); do - # COPY/INSERT-SELECT workers use the simple protocol; no benefit from prepared. + # COPY/INSERT-SELECT gains nothing from prepared protocol launch_worker "copy-${i}" "${COPY_SQL}" "simple" done echo "==> Launched ${WORKERS} workers; running for ${DURATION}s ..." -# Wait for all workers; tally failures without aborting mid-burst so every log is -# still summarized below. +# Wait for all workers, then summarize all logs failed_workers=0 for pid in ${WORKER_PIDS}; do if ! wait "${pid}"; then failed_workers=$(( failed_workers + 1 )) fi done -WORKER_PIDS="" # all reaped; nothing for cleanup() to kill +WORKER_PIDS="" # all reaped echo "==> Per-worker pgbench summaries:" for log in "${WORKDIR}"/*.log; do @@ -166,16 +149,13 @@ for log in "${WORKDIR}"/*.log; do grep -E "tps|number of transactions actually processed|failed" "${log}" || cat "${log}" done -# pgbench prints "number of failed transactions" only when >0 (deadlock / -# serialization / mid-run disconnect): a worker can exit 0 yet still drop work, so -# a clean exit code alone does not prove a full-strength workload. +# pgbench can exit 0 with failed transactions failed_txns="$(grep -hoE 'number of failed transactions: [0-9]+' "${WORKDIR}"/*.log 2>/dev/null \ | awk '{s+=$NF} END{print s+0}' || true)" echo "==> Burst finished: failed_workers=${failed_workers} failed_txns=${failed_txns}" -# A degraded burst means this cell saw a weaker workload than its peers and is not -# comparable. Exit non-zero; callers record an explicit invalid-run marker. +# Degraded bursts are not comparable if (( failed_workers != 0 || failed_txns != 0 )); then echo "FATAL: burst degraded (${failed_workers} worker(s) failed, ${failed_txns} failed txn(s))" >&2 exit 1 diff --git a/bench/scripts/lib.sh b/bench/scripts/lib.sh index 9c47a6b..9ed98fb 100644 --- a/bench/scripts/lib.sh +++ b/bench/scripts/lib.sh @@ -1,19 +1,13 @@ # shellcheck shell=bash # -# lib.sh — shared scaffolding for the single-host benchmark drivers. +# lib.sh, shared benchmark driver helpers # -# Sourced (not executed) by run.sh (archive_command path) and run_op.sh -# (data-movement ops). Holds only the plumbing both share verbatim: config load, -# logging, the local root-exec wrapper, the seeded-DB preflight, archive-backlog -# drain, sampler start/stop, and inventory+provenance capture. The two drivers -# keep their own measurement models (daemon-as-signal vs daemon-as-noise, burst -# vs one-shot op); this file is scaffolding, not policy. +# Sourced by run.sh and run_op.sh # -# Relies on globals set by sourcing driver before each call: SCRIPT_DIR, -# LOG_TAG, PGUSER, PGPASSWORD, PGHOST_DRIVER, PGDATA_DIR, PGBIN, RESULT_DIR, -# SAMPLER, AWS_REGION. +# Requires driver globals: SCRIPT_DIR, LOG_TAG, PGUSER, PGPASSWORD, +# PGHOST_DRIVER, PGDATA_DIR, PGBIN, RESULT_DIR, SAMPLER, AWS_REGION -# Source config.env (or ENV_FILE) with auto-export so child sudo blocks inherit. +# Source config.env with auto-export for child sudo blocks load_config() { set -a # shellcheck source=../config.env.example @@ -23,11 +17,10 @@ load_config() { log() { printf '[%s %s] %s\n' "${LOG_TAG}" "$(date -u +%H:%M:%S)" "$*" >&2; } -# Run a bash snippet as root locally (fed on stdin; positional args after --). +# Run stdin script as root run_root() { sudo bash -s -- "$@"; } -# Abort unless the bench DB is seeded (wal_churn present). Callers that do not -# need a populated DB (e.g. restore) skip this. +# Abort unless bench DB is seeded require_seeded() { local seeded seeded="$(PGPASSWORD="${PGPASSWORD}" psql -h "${PGHOST_DRIVER}" -U "${PGUSER}" \ @@ -40,11 +33,8 @@ require_seeded() { fi } -# drain_backlog THRESHOLD ITERS — wait until the .ready archive backlog falls to -# THRESHOLD, polling every 2s up to ITERS times. Settles leftover WAL before a -# measured window so the sample is not contaminated by prior load. Aborts the -# cell (exit nonzero) if backlog still exceeds THRESHOLD after ITERS: a timed-out -# drain leaks prior load into the measured start, so fail rather than sample it. +# Wait for .ready archive backlog to fall below threshold +# Abort on timeout to avoid sampling prior load drain_backlog() { local threshold="$1" iters="$2" run_root "${PGDATA_DIR}" "${threshold}" "${iters}" <<'REMOTE' @@ -64,7 +54,7 @@ echo "drain complete: ready backlog = ${rb}" REMOTE } -# Normalize FPI state before burst workloads. CHECKPOINT is superuser-only. +# Normalize FPI state before bursts checkpoint_pg() { run_root "${PGBIN:-/usr/lib/postgresql/18/bin}" <<'REMOTE' set -euo pipefail @@ -75,10 +65,8 @@ echo "checkpoint complete" REMOTE } -# Reset archiver stats and launch the 1 Hz sampler as postgres into RESULT_DIR. -# MODE_FLAG/MODE_VALUE select the attach mode: --daemon (run.sh, the -# daemon IS the measurement) or --proc-match (run_op.sh, match the op -# process). Aborts if the sampler does not come up. +# Reset archiver stats and launch sampler +# MODE_FLAG/MODE_VALUE select --daemon or --proc-match start_sampler() { local mode_flag="$1" mode_value="$2" log "starting sampler (${mode_flag} ${mode_value}) -> ${RESULT_DIR}" @@ -86,6 +74,15 @@ start_sampler() { set -euo pipefail RESULT_DIR="$1"; SAMPLER="$2"; MODE_FLAG="$3"; MODE_VALUE="$4"; PGDATA="$5" install -d -o postgres -g postgres "${RESULT_DIR}" +# Sampler runs as postgres, so RESULT_DIR must be postgres-traversable; the +# default results/ under a 0750 home is not. Fail clear, not as a cryptic +# sampler start failure. Relocate via RESULTS_ROOT (e.g. /dat/bench-results). +if ! sudo -u postgres test -w "${RESULT_DIR}"; then + echo "error: ${RESULT_DIR} unwritable by postgres (sampler runs as postgres)" >&2 + echo " ancestor likely not traversable (e.g. a 0750 home dir); set" >&2 + echo " RESULTS_ROOT to a postgres-traversable path, e.g. /dat/bench-results" >&2 + exit 1 +fi sudo -u postgres psql -X -q -c "SELECT pg_stat_reset_shared('archiver');" >/dev/null 2>&1 || true sudo -u postgres bash -c " nohup '${SAMPLER}' ${MODE_FLAG} '${MODE_VALUE}' --pgdata '${PGDATA}' \ @@ -103,8 +100,7 @@ echo "sampler running pid=${SPID}" REMOTE } -# Stop the sampler (TERM, then KILL after 10s grace). Safe to call twice and from -# an EXIT trap; never fails the caller. +# Stop sampler, safe from EXIT trap stop_sampler() { log "stopping sampler" run_root "${RESULT_DIR}" <<'REMOTE' || true @@ -119,9 +115,7 @@ fi REMOTE } -# Record an explicit invalid-run marker in RESULT_DIR. bench-analyze skips any run -# dir containing INVALID, so a degraded cell is excluded from comparison instead -# of being silently averaged in. Reason is free text. Relies on RESULT_DIR. +# Mark degraded run so analysis skips it mark_invalid() { log "INVALID run: $*" run_root "${RESULT_DIR}" "$*" <<'REMOTE' || true @@ -134,10 +128,7 @@ chown postgres:postgres "${RESULT_DIR}/INVALID" REMOTE } -# Capture the S3 inventory and write provenance.txt. Args: RESULT_DIR INV_PREFIX -# REGION then any number of leading "key=value" lines (driver-specific identity: -# daemon/op/tool, run_id, sizing, harness_git). The shared tool version/sha block -# and captured_at are appended. +# Capture S3 inventory and provenance write_provenance() { run_root "$@" <<'REMOTE' set -euo pipefail diff --git a/bench/scripts/make_source_tarball.sh b/bench/scripts/make_source_tarball.sh new file mode 100755 index 0000000..759afcb --- /dev/null +++ b/bench/scripts/make_source_tarball.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Package walrus source into a tarball for a fresh SUT that has no git checkout. +# +# git archive embeds the commit id in the tarball's pax header, so +# 03_build_walrus.sh recovers it via `git get-tar-commit-id` and provenance +# survives the transfer even though the extracted tree carries no .git. +# +# Usage: make_source_tarball.sh [REF] # REF defaults to HEAD +# Prints the tarball path on stdout (progress on stderr), so it composes: +# t=$(bench/scripts/make_source_tarball.sh) && scp "$t" sut:/tmp/ +set -euo pipefail + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +# bench/scripts -> repo root +REPO_ROOT="${WALRUS_REPO:-$(cd -- "${SCRIPT_DIR}/../.." >/dev/null 2>&1 && pwd)}" +REF="${1:-${WALRUS_REF:-HEAD}}" + +if [[ ! -d "${REPO_ROOT}/.git" ]]; then + echo "ERROR: ${REPO_ROOT} is not a git checkout; nothing to archive." >&2 + exit 1 +fi + +SHA="$(git -C "${REPO_ROOT}" rev-parse "${REF}")" +OUT="${WALRUS_SRC_TARBALL:-${REPO_ROOT}/bench/walrus-src-${SHA:0:12}.tar.gz}" + +echo "=== git archive ${REF} (${SHA}) -> ${OUT} ===" >&2 +git -C "${REPO_ROOT}" archive --format=tar.gz --prefix=walrus/ -o "${OUT}" "${REF}" +echo "=== wrote $(du -h "${OUT}" | cut -f1) ===" >&2 + +# Path on stdout for scripting +printf '%s\n' "${OUT}" diff --git a/bench/scripts/sut/00_mount_nvme.sh b/bench/scripts/sut/00_mount_nvme.sh index 0c8f971..9eff369 100755 --- a/bench/scripts/sut/00_mount_nvme.sh +++ b/bench/scripts/sut/00_mount_nvme.sh @@ -1,6 +1,5 @@ #!/usr/bin/env bash -# Detect the non-root NVMe instance-store device, format it ext4 (only if not -# already formatted), mount it at /dat, and prepare the PG18 data parent dir. +# Mount non-root NVMe instance store at /dat set -euo pipefail MOUNT_POINT=/dat @@ -11,24 +10,23 @@ if [[ $EUID -ne 0 ]]; then exit 1 fi -# The root filesystem's backing device. lsblk -no PKNAME gives the parent disk -# of whatever device hosts "/", e.g. nvme0n1. +# Root filesystem backing disk root_src="$(findmnt -no SOURCE / )" root_disk="$(lsblk -no PKNAME "${root_src}" | head -n1)" if [[ -z "${root_disk}" ]]; then - # Some setups mount / directly on the whole disk; fall back to basename. + # Some setups mount / on whole disk root_disk="$(basename "${root_src}")" fi echo "Root device: ${root_src} (disk: ${root_disk})" -# Pick the first NVMe whole-disk that is not the root disk and has no children. +# Pick first non-root NVMe whole disk target="" while read -r name type; do [[ "${type}" == "disk" ]] || continue [[ "${name}" == "${root_disk}" ]] && continue case "${name}" in nvme*) - # Skip disks that already have partitions/children mounted as root. + # Skip root disk target="${name}" break ;; @@ -44,7 +42,7 @@ fi dev="/dev/${target}" echo "Selected instance-store device: ${dev}" -# Guard: only mkfs if there is no existing filesystem on the device. +# Create filesystem only when absent fstype="$(blkid -o value -s TYPE "${dev}" 2>/dev/null || true)" if [[ -z "${fstype}" ]]; then echo "No filesystem detected on ${dev}; creating ext4..." @@ -63,10 +61,7 @@ else fi mkdir -p "${PG_PARENT}" -# The postgres user is created later by 01_install_pg18.sh, and 10_init_pg.sh -# sets ${PG_PARENT} ownership. Keep /dat root-owned but world-traversable so -# postgres can reach PGDATA underneath it; chown to postgres only once it exists -# (e.g. on a re-run after PG is installed). +# Keep /dat traversable until postgres user exists chmod 755 "${MOUNT_POINT}" if id -u postgres >/dev/null 2>&1; then chown postgres:postgres "${MOUNT_POINT}" "${PG_PARENT}" diff --git a/bench/scripts/sut/01_install_pg18.sh b/bench/scripts/sut/01_install_pg18.sh index a51f20c..96759f8 100755 --- a/bench/scripts/sut/01_install_pg18.sh +++ b/bench/scripts/sut/01_install_pg18.sh @@ -1,14 +1,12 @@ #!/usr/bin/env bash -# Install PostgreSQL 18 (PGDG), build toolchain, Go 1.26.2, and Rust 1.89 for -# building wal-g and walrus on Ubuntu 24.04 (noble). +# Install PG18, build deps, Go, and Rust set -euo pipefail GO_VERSION="${GO_VERSION:-1.26.2}" RUST_VERSION="${RUST_VERSION:-1.89.0}" GO_TARBALL="go${GO_VERSION}.linux-amd64.tar.gz" GO_URL="https://go.dev/dl/${GO_TARBALL}" -# User who will run cargo to build walrus. Defaults to the sudo invoker (dev box, -# any distro), then ubuntu (provisioned EC2 SUT) — matches the other sut scripts. +# User owning cargo toolchain BUILD_USER="${BUILD_USER:-${SUDO_USER:-ubuntu}}" if [[ $EUID -ne 0 ]]; then @@ -66,7 +64,7 @@ if [[ ! -x /usr/local/go/bin/go ]] || ! /usr/local/go/bin/go version | grep -q " tar -C /usr/local -xzf "${tmp}/${GO_TARBALL}" rm -rf "${tmp}" fi -# Make go available on PATH for all login shells. +# Put Go on login-shell PATH cat > /etc/profile.d/go.sh <<'EOF' export PATH=$PATH:/usr/local/go/bin EOF diff --git a/bench/scripts/sut/03_build_walrus.sh b/bench/scripts/sut/03_build_walrus.sh index dfe26ba..8fb0586 100755 --- a/bench/scripts/sut/03_build_walrus.sh +++ b/bench/scripts/sut/03_build_walrus.sh @@ -1,34 +1,25 @@ #!/usr/bin/env bash -# Build walrus (wal-rs) from the in-repo source and install to -# /usr/local/bin/walrus. +# Build walrus from in-repo source, or from a shipped source tarball +# (WALRUS_SRC_TARBALL) when the SUT has no git checkout. # -# bench/ lives inside the wal-rs repo, so the source is right here: build the -# repo working tree directly (no clone, no uploaded tarball). The git SHA is -# read from the repo when present. +# Records git SHA when present set -euo pipefail SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" -# bench/scripts/sut -> repo root is three levels up. -REPO_ROOT="${WALRUS_REPO:-$(cd -- "${SCRIPT_DIR}/../../.." >/dev/null 2>&1 && pwd)}" INSTALL_BIN="/usr/local/bin/walrus" SHA_FILE="/opt/walbench/walrus.sha" -# User that owns the rustup/cargo toolchain installed by 01_install_pg18.sh. -# Defaults to the sudo invoker (dev box) then ubuntu (provisioned SUT). +# User owning rustup/cargo toolchain BUILD_USER="${BUILD_USER:-${SUDO_USER:-ubuntu}}" if [[ $EUID -ne 0 ]]; then echo "ERROR: must run as root (use sudo) so installs to /usr/local/bin succeed." >&2 exit 1 fi -# Idempotent: skip the (slow) cargo build if walrus is already installed. +# Skip build when installed, unless FORCE_REBUILD=1 if [[ -z "${FORCE_REBUILD:-}" && -x "${INSTALL_BIN}" ]]; then echo "walrus already installed; skipping build (FORCE_REBUILD=1 to rebuild)." exit 0 fi -if [[ ! -f "${REPO_ROOT}/Cargo.toml" ]]; then - echo "ERROR: wal-rs source not found at ${REPO_ROOT} (no Cargo.toml)." >&2 - exit 1 -fi if ! id "${BUILD_USER}" >/dev/null 2>&1; then echo "ERROR: build user '${BUILD_USER}' does not exist." >&2 exit 1 @@ -40,11 +31,38 @@ if [[ ! -x "${cargo_bin}" ]]; then exit 1 fi -WALRUS_SHA="$(git -C "${REPO_ROOT}" rev-parse HEAD 2>/dev/null || echo unknown)" +# Source: shipped tarball (fresh SUT) or in-repo tree. make_source_tarball.sh +# builds the tarball via git archive; the commit id rides in its pax header and +# is recovered here, so provenance survives a transfer with no .git. +if [[ -n "${WALRUS_SRC_TARBALL:-}" ]]; then + if [[ ! -f "${WALRUS_SRC_TARBALL}" ]]; then + echo "ERROR: WALRUS_SRC_TARBALL=${WALRUS_SRC_TARBALL} not found." >&2 + exit 1 + fi + SRC_DIR="${WALRUS_SRC_DIR:-/opt/walbench/src}" + echo "=== Unpacking ${WALRUS_SRC_TARBALL} -> ${SRC_DIR} ===" + rm -rf "${SRC_DIR}" + mkdir -p "${SRC_DIR}" + tar -C "${SRC_DIR}" -xzf "${WALRUS_SRC_TARBALL}" + # --prefix=walrus/ nests the tree; fall back when archived without it + REPO_ROOT="${SRC_DIR}/walrus" + [[ -f "${REPO_ROOT}/Cargo.toml" ]] || REPO_ROOT="${SRC_DIR}" + WALRUS_SHA="$(gzip -dc "${WALRUS_SRC_TARBALL}" | git get-tar-commit-id 2>/dev/null || echo unknown)" + # Extracted as root; build user needs to write target/ + chown -R "${BUILD_USER}:${BUILD_USER}" "${SRC_DIR}" +else + # bench/scripts/sut -> repo root + REPO_ROOT="${WALRUS_REPO:-$(cd -- "${SCRIPT_DIR}/../../.." >/dev/null 2>&1 && pwd)}" + WALRUS_SHA="$(git -C "${REPO_ROOT}" rev-parse HEAD 2>/dev/null || echo unknown)" +fi +if [[ ! -f "${REPO_ROOT}/Cargo.toml" ]]; then + echo "ERROR: wal-rs source not found at ${REPO_ROOT} (no Cargo.toml)." >&2 + exit 1 +fi + echo "=== Building walrus from ${REPO_ROOT} (SHA ${WALRUS_SHA}) ===" -# cargo runs as BUILD_USER, which must own the tree (target/ is gitignored, so -# building in place is fine). The binary is installed by root afterwards. +# Build as BUILD_USER, install binary as root echo "=== cargo build --release ===" sudo -u "${BUILD_USER}" -H bash -c "cd '${REPO_ROOT}' && '${cargo_bin}' build --release" diff --git a/bench/scripts/sut/05_install_pgbackrest.sh b/bench/scripts/sut/05_install_pgbackrest.sh index f09302e..ae9e8da 100755 --- a/bench/scripts/sut/05_install_pgbackrest.sh +++ b/bench/scripts/sut/05_install_pgbackrest.sh @@ -1,17 +1,7 @@ #!/usr/bin/env bash -# Install pgBackRest (PGDG apt) and configure it as the third WAL archiver under -# test. Unlike wal-g/walrus there is NO long-running daemon: PG's archiver runs -# `pgbackrest archive-push` per segment; in async mode the first invocation forks -# a transient async process that pushes ready segments with up to `process-max` -# worker processes, then exits. Footprint is therefore a process TREE, sampled by -# bench-sampler's --proc-match mode (--daemon pgbackrest), not a systemd MainPID. +# Install pgBackRest and configure async archive-push # -# Credentials: on EC2, pgBackRest speaks IMDSv2 natively (repo1-s3-key-type=auto, -# since pgBackRest 2.39) and reads the instance-profile role directly. Off-AWS -# (dev / Debian, no IMDS) it falls back to static keys from the environment -# (repo1-s3-key-type=shared) — the same AWS_ACCESS_KEY_ID/SECRET path wal-g and -# walrus use via wal-g.env. Repo lives in the SAME bucket as wal-g under a -# SEPARATE prefix (repo1-path) so the two never collide. +# Uses env static keys off-AWS, otherwise pgBackRest IMDS auth # # Usage: # BUCKET=my-bucket [UPLOAD_CONCURRENCY=4] sudo ./05_install_pgbackrest.sh @@ -27,8 +17,7 @@ PGDATA="${PGDATA:-/dat/18/data}" PGBIN="${PGBIN:-/usr/lib/postgresql/18/bin}" CONF_DIR="/etc/pgbackrest" CONF="${CONF_DIR}/pgbackrest.conf" -# Spool + logs on the data NVMe (not tmpfs, not the small root volume); async -# only writes tiny ack files to the spool, so disk pressure is negligible. +# Put spool + logs on data NVMe SPOOL_PATH="${PGBACKREST_SPOOL_PATH:-/dat/pgbackrest/spool}" LOG_PATH="${PGBACKREST_LOG_PATH:-/dat/pgbackrest/log}" @@ -55,7 +44,7 @@ install -d -o postgres -g postgres -m 0750 "${CONF_DIR}" install -d -o postgres -g postgres -m 0750 "${SPOOL_PATH}" install -d -o postgres -g postgres -m 0750 "${LOG_PATH}" -# S3 auth: static keys from env when present (off-AWS / Debian), else IMDS role. +# Static keys from env off-AWS, otherwise IMDS if [[ -n "${AWS_ACCESS_KEY_ID:-}" && -n "${AWS_SECRET_ACCESS_KEY:-}" ]]; then echo "=== pgbackrest S3 auth: shared (static keys from environment) ===" S3_AUTH="repo1-s3-key-type=shared @@ -69,9 +58,8 @@ else fi echo "=== Writing ${CONF} (process-max=${UPLOAD_CONCURRENCY}, bucket=${BUCKET}) ===" -# process-max matches WALG_UPLOAD_CONCURRENCY so pgbackrest's async parallelism is -# the same knob as wal-g's background uploader — the throughput<->memory tradeoff -# is compared at equal fan-out. compress-type=lz4 matches WALG_COMPRESSION_METHOD. +# Match process-max to WALG_UPLOAD_CONCURRENCY +# Use lz4 to match WALG_COMPRESSION_METHOD umask 077 tmp="$(mktemp)" cat > "${tmp}" </dev/null 2>&1 \ && pg_lsclusters -h 2>/dev/null | awk '{print $1"/"$2}' | grep -qx '18/main'; then echo "=== Removing PGDG default cluster 18/main (frees port 5432) ===" @@ -45,22 +41,18 @@ fi echo "=== Writing postgresql.conf ===" sudo -u postgres tee "${PGDATA}/postgresql.conf" >/dev/null < [UPLOAD_CONCURRENCY] set -euo pipefail BUCKET="${BUCKET:-${1:-}}" UPLOAD_CONCURRENCY="${UPLOAD_CONCURRENCY:-${2:-4}}" -# backup-fetch / wal-fetch download fan-out; defaults to upload concurrency so a -# single concurrency sweep tunes both directions (override DOWNLOAD_CONCURRENCY -# to decouple). +# Download fan-out defaults to upload fan-out DOWNLOAD_CONCURRENCY="${DOWNLOAD_CONCURRENCY:-${UPLOAD_CONCURRENCY}}" ENV_FILE="${ENV_FILE:-/etc/postgresql/wal-g.env}" AWS_REGION="${AWS_REGION:-us-east-1}" COMPRESSION_METHOD="${WALG_COMPRESSION_METHOD:-lz4}" -IMDS="http://169.254.169.254/latest" +# Pre-record _delta sidecars during wal-push +USE_WAL_DELTA="${WALG_USE_WAL_DELTA:-}" if [[ $EUID -ne 0 ]]; then echo "ERROR: must run as root (use sudo) to write ${ENV_FILE}." >&2 @@ -34,8 +29,7 @@ if [[ -z "${BUCKET}" ]]; then exit 1 fi -# Storage prefix both daemons archive into. run.sh / run_op.sh scope it per -# tool+run for isolation; default keeps the shared bench prefix for setup/smoke. +# Storage prefix, scoped by drivers per tool+run WALG_S3_PREFIX="${WALG_S3_PREFIX:-s3://${BUCKET}/walg-bench}" ACCESS_KEY="${AWS_ACCESS_KEY_ID:-}" @@ -43,46 +37,12 @@ SECRET_KEY="${AWS_SECRET_ACCESS_KEY:-}" SESSION_TOKEN="${AWS_SESSION_TOKEN:-}" if [[ -n "${ACCESS_KEY}" && -n "${SECRET_KEY}" ]]; then - echo "=== Using AWS credentials from environment ===" -else - echo "=== Fetching temporary credentials via IMDSv2 ===" - TOKEN="$(curl -sf -X PUT "${IMDS}/api/token" \ - -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600')" - if [[ -z "${TOKEN}" ]]; then - echo "ERROR: no env credentials and failed to obtain IMDSv2 token." >&2 - exit 1 - fi - - ROLE="$(curl -sf -H "X-aws-ec2-metadata-token: ${TOKEN}" \ - "${IMDS}/meta-data/iam/security-credentials/")" - if [[ -z "${ROLE}" ]]; then - echo "ERROR: no IAM role attached to this instance." >&2 - exit 1 - fi - echo "IAM role: ${ROLE}" - - CREDS_JSON="$(curl -sf -H "X-aws-ec2-metadata-token: ${TOKEN}" \ - "${IMDS}/meta-data/iam/security-credentials/${ROLE}")" - - read_field() { - local key="$1" - if command -v jq >/dev/null 2>&1; then - printf '%s' "${CREDS_JSON}" | jq -r ".${key}" - else - printf '%s' "${CREDS_JSON}" \ - | python3 -c "import sys,json;print(json.load(sys.stdin)['${key}'])" - fi - } - - ACCESS_KEY="$(read_field AccessKeyId)" - SECRET_KEY="$(read_field SecretAccessKey)" - SESSION_TOKEN="$(read_field Token)" - echo "Credentials expire at: $(read_field Expiration)" -fi - -if [[ -z "${ACCESS_KEY}" || -z "${SECRET_KEY}" ]]; then - echo "ERROR: incomplete credentials (need AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY)." >&2 + echo "=== Using static AWS credentials from environment ===" +elif [[ -n "${ACCESS_KEY}" || -n "${SECRET_KEY}" ]]; then + echo "ERROR: incomplete credentials; set BOTH AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY, or neither (IMDS)." >&2 exit 1 +else + echo "=== No static credentials; daemons resolve EC2 instance-role creds via IMDS ===" fi echo "=== Writing ${ENV_FILE} (UPLOAD_CONCURRENCY=${UPLOAD_CONCURRENCY} DOWNLOAD_CONCURRENCY=${DOWNLOAD_CONCURRENCY}) ===" @@ -97,16 +57,20 @@ WALG_UPLOAD_CONCURRENCY=${UPLOAD_CONCURRENCY} WALG_DOWNLOAD_CONCURRENCY=${DOWNLOAD_CONCURRENCY} PGHOST=/var/run/postgresql PGDATA=/dat/18/data -AWS_ACCESS_KEY_ID=${ACCESS_KEY} -AWS_SECRET_ACCESS_KEY=${SECRET_KEY} EOF -# Session token only for temporary (IMDS / STS) credentials. -if [[ -n "${SESSION_TOKEN}" ]]; then - printf 'AWS_SESSION_TOKEN=%s\n' "${SESSION_TOKEN}" >> "${tmp}" +# Static keys only off-AWS; absent means IMDS +if [[ -n "${ACCESS_KEY}" ]]; then + printf 'AWS_ACCESS_KEY_ID=%s\n' "${ACCESS_KEY}" >> "${tmp}" + printf 'AWS_SECRET_ACCESS_KEY=%s\n' "${SECRET_KEY}" >> "${tmp}" + [[ -n "${SESSION_TOKEN}" ]] && printf 'AWS_SESSION_TOKEN=%s\n' "${SESSION_TOKEN}" >> "${tmp}" +fi +# Omit unset sidecar flag, empty value fails walrus bool parse +if [[ -n "${USE_WAL_DELTA}" ]]; then + printf 'WALG_USE_WAL_DELTA=%s\n' "${USE_WAL_DELTA}" >> "${tmp}" fi install -o postgres -g postgres -m 0600 "${tmp}" "${ENV_FILE}" rm -f "${tmp}" echo "Done. ${ENV_FILE}:" -# Show keys only, never secret values. +# Redact secret values sed -E 's/^(AWS_ACCESS_KEY_ID|AWS_SECRET_ACCESS_KEY|AWS_SESSION_TOKEN)=.*/\1=/' "${ENV_FILE}" diff --git a/bench/scripts/sut/30_select_daemon.sh b/bench/scripts/sut/30_select_daemon.sh index de2352f..dec19d2 100755 --- a/bench/scripts/sut/30_select_daemon.sh +++ b/bench/scripts/sut/30_select_daemon.sh @@ -1,7 +1,5 @@ #!/usr/bin/env bash -# Select exactly one archive daemon (wal-g or walrus). Stops both units, -# removes the shared socket, installs (if needed) and starts the chosen unit, -# waits for the socket to appear, and prints the active PID/cgroup. +# Select one archive daemon # # Usage: sudo ./30_select_daemon.sh walg|walrus set -euo pipefail @@ -55,10 +53,7 @@ if [[ ! -S "${SOCKET}" ]]; then exit 1 fi -# Point archive_command at the chosen daemon's OWN client (walg_archive's -# extension does not interoperate with the walrus daemon; each tool's own -# daemon-client does). walrus needs the absolute WAL path; wal-g uses %f. -# Best-effort here (PG is up by this point in setup); run_one.sh sets it per cell. +# archive_command must use chosen tool's own daemon client PGDATA_DIR="/dat/18/data" if [[ "${CHOICE}" == "walg" ]]; then archive_cmd="/usr/bin/walg-daemon-client ${SOCKET} wal-push %f" @@ -66,7 +61,7 @@ else archive_cmd="/usr/local/bin/walrus daemon-client --socket ${SOCKET} wal-push ${PGDATA_DIR}/%p" fi echo "=== Setting archive_command for ${CHOICE} (and clearing archive_library) ===" -# Each ALTER SYSTEM in its own -c (cannot run inside a transaction block). +# ALTER SYSTEM cannot run inside transaction block sudo -u postgres /usr/lib/postgresql/18/bin/psql -p 5432 -tA \ -c "ALTER SYSTEM SET archive_library = '';" \ -c "ALTER SYSTEM SET archive_command = '${archive_cmd}';" \ diff --git a/bench/scripts/sut/40_smoke_test.sh b/bench/scripts/sut/40_smoke_test.sh index 12417a6..00b1d69 100755 --- a/bench/scripts/sut/40_smoke_test.sh +++ b/bench/scripts/sut/40_smoke_test.sh @@ -1,7 +1,5 @@ #!/usr/bin/env bash -# Smoke test the currently-active archive daemon: force a few WAL switches, -# insert a tiny table, wait for archiving, then confirm WAL objects landed under -# s3:///walg-bench/wal_005/. FAIL loudly if nothing appears. +# Smoke test active archive daemon # # Usage: BUCKET=my-bucket sudo ./40_smoke_test.sh (or pass BUCKET as $1) set -euo pipefail @@ -29,9 +27,7 @@ if [[ ! -S "${SOCKET}" ]]; then fi echo "=== Baseline object count under ${S3_WAL_PREFIX} (before this daemon archives) ===" -# The prefix is SHARED by both daemons. Counting >0 would false-pass walrus on -# objects wal-g left earlier (and vice versa). Capture a baseline and require an -# INCREASE so the test proves THIS daemon archived. +# Shared prefix, require count increase from baseline before="$(aws s3 ls "${S3_WAL_PREFIX}" --region "${AWS_REGION}" 2>/dev/null | grep -c . || true)" echo "baseline=${before}" @@ -47,7 +43,7 @@ for _ in $(seq 1 "${WAL_SWITCHES}"); do run_psql -d postgres -tAc \ "INSERT INTO walg_smoke (payload) SELECT repeat('y',256) FROM generate_series(1,500);" done -# Final switch so the last populated segment becomes archivable. +# Make last populated segment archivable run_psql -d postgres -tAc "SELECT pg_switch_wal();" >/dev/null echo "=== Waiting up to ${WAIT_SECONDS}s for NEW archived WAL under ${S3_WAL_PREFIX} ===" diff --git a/bench/setup.sh b/bench/setup.sh index 213da38..c7491d4 100755 --- a/bench/setup.sh +++ b/bench/setup.sh @@ -1,22 +1,14 @@ #!/usr/bin/env bash # -# setup.sh — bootstrap THIS host as a single-box benchmark System-Under-Test. +# setup.sh, bootstrap this host as single-box benchmark SUT # -# Runs the numbered scripts/sut steps in order: mount NVMe (optional) -> install -# PG18 + toolchains -> build wal-g, walrus (from this repo), walg_archive -> init -# the PG18 cluster -> install + stanza pgbackrest -> create the bench role -> -# deploy the sampler -> write wal-g.env -> install both systemd units. -# -# Single-host counterpart of the external AWS fleet's setup_sut.sh: no SSH, no -# source upload (walrus builds straight from this repo), driver == this box. -# -# Run as root: sudo ./setup.sh (config from ./config.env) +# Run as root: sudo ./setup.sh, config from ./config.env set -euo pipefail SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" SUT="${SCRIPT_DIR}/scripts/sut" -# Export everything sourced so the sub-scripts inherit it (BUCKET, creds, etc). +# Export config for child setup scripts set -a # shellcheck source=config.env.example . "${ENV_FILE:-${SCRIPT_DIR}/config.env}" @@ -32,7 +24,7 @@ fi : "${PGPASSWORD:?set PGPASSWORD in config.env}" : "${UPLOAD_CONCURRENCY:?set UPLOAD_CONCURRENCY in config.env}" -# Toolchain owner + pg_hba CIDR. Single-host driver is loopback. +# Toolchain owner + pg_hba CIDR export BUILD_USER="${BUILD_USER:-${SUDO_USER:-ubuntu}}" export DRIVER_CIDR="${DRIVER_CIDR:-127.0.0.1/32}" @@ -41,9 +33,8 @@ log() { printf '[setup %s] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2; } cd "${SUT}" chmod +x ./*.sh -# 00 is AWS instance-store specific: it formats+mounts a spare NVMe at /dat. -# Skip on a box that already has /dat (or set SKIP_MOUNT=1 to point PGDATA at an -# existing fast disk yourself). +# 00 formats + mounts spare NVMe at /dat +# Skip when /dat already points at fast storage if [[ -n "${SKIP_MOUNT:-}" ]]; then log "SKIP_MOUNT set — skipping 00_mount_nvme.sh (ensure /dat exists)" else @@ -65,14 +56,15 @@ else ${PSQL} -c "CREATE ROLE \"${PGUSER}\" LOGIN PASSWORD '${PGPASSWORD}' CREATEDB;" fi -log "build bench-tools (bench-sampler + bench-analyze) from ${SCRIPT_DIR}/tools" +log "build bench-tools (bench-sampler + bench-analyze + bench-compare) from ${SCRIPT_DIR}/tools" build_home="$(getent passwd "${BUILD_USER}" | cut -d: -f6)" cargo_bin="${build_home}/.cargo/bin/cargo" sudo -u "${BUILD_USER}" -H bash -c "cd '${SCRIPT_DIR}/tools' && '${cargo_bin}' build --release" -log "deploy bench-sampler + bench-analyze to /usr/local/bin" +log "deploy bench-sampler + bench-analyze + bench-compare to /usr/local/bin" install -m 0755 "${SCRIPT_DIR}/tools/target/release/bench-sampler" /usr/local/bin/bench-sampler install -m 0755 "${SCRIPT_DIR}/tools/target/release/bench-analyze" /usr/local/bin/bench-analyze +install -m 0755 "${SCRIPT_DIR}/tools/target/release/bench-compare" /usr/local/bin/bench-compare log "11 write wal-g.env"; bash ./11_write_walg_env.sh log "install both systemd units (via 30, starts walg)"; bash ./30_select_daemon.sh walg diff --git a/bench/terraform/config.env.tftpl b/bench/terraform/config.env.tftpl new file mode 100644 index 0000000..41e2c8c --- /dev/null +++ b/bench/terraform/config.env.tftpl @@ -0,0 +1,33 @@ +# Generated by terraform (deploy.tf). Edit on the box to retune sizing per run. +# S3 creds resolve from the instance profile via IMDS, so no AWS keys here. + +# --- S3 target --------------------------------------------------------------- +BUCKET=${bucket} +AWS_REGION=${region} + +# --- Postgres bench role ----------------------------------------------------- +PGUSER=${pg_user} +PGPASSWORD=${pg_password} + +# --- daemon / archiver tuning ------------------------------------------------ +UPLOAD_CONCURRENCY=${upload_concurrency} +WALG_COMPRESSION_METHOD=lz4 + +# --- workload sizing --------------------------------------------------------- +SCALE=5000 +CHURN_ROWS=2000000 +BURST_SECONDS=300 + +# --- operation benchmarks (run_op.sh / op_matrix.sh) ------------------------- +WAL_RECEIVE_SECONDS=300 +DELTA_CHURN_SECONDS=300 +DELTA_MAX_STEPS=7 +DELTA_ORIGIN=LATEST_FULL + +# Sampler runs as postgres; ubuntu $HOME is 0750 so postgres can't reach the +# default results/ under it. /dat (NVMe) is root-owned 0755, traversable. +RESULTS_ROOT=/dat/bench-results + +# Build walrus from the uploaded tarball; 03_build_walrus.sh recovers the +# commit id from it so provenance survives the checkout-less box. +WALRUS_SRC_TARBALL=/home/ubuntu/walrus-src.tar.gz diff --git a/bench/terraform/deploy.tf b/bench/terraform/deploy.tf new file mode 100644 index 0000000..4c5d34e --- /dev/null +++ b/bench/terraform/deploy.tf @@ -0,0 +1,83 @@ +# Ship the walrus source tarball to the box over SSH and unpack it. +# +# The git-archive tarball is self-contained: the whole repo (walrus source + +# this bench harness) under a walrus/ prefix, so the box needs no git checkout. +# After apply: cd ~/walrus/bench, set WALRUS_SRC_TARBALL=~/walrus-src.tar.gz in +# config.env (03_build_walrus.sh recovers the commit id from it), run setup.sh. +# +# Enabled only when var.walrus_src_tarball is set; re-runs when its bytes change. +resource "terraform_data" "walrus_src" { + count = var.walrus_src_tarball != "" ? 1 : 0 + + triggers_replace = { + instance = aws_instance.bench.id + tarball = filesha256(var.walrus_src_tarball) + } + + connection { + type = "ssh" + host = aws_instance.bench.public_ip + user = "ubuntu" + private_key = tls_private_key.bench.private_key_pem + } + + provisioner "file" { + source = var.walrus_src_tarball + destination = "/home/ubuntu/walrus-src.tar.gz" + } + + # Unpack for the harness; keep the tarball for the SHA-preserving build + provisioner "remote-exec" { + inline = [ + "rm -rf /home/ubuntu/walrus", + "tar -xzf /home/ubuntu/walrus-src.tar.gz -C /home/ubuntu", + ] + } +} + +# Bootstrap the unpacked box: write config.env, run setup.sh (PG18 + build all +# tools + systemd units). Opt-in via run_setup; re-runs when source or a config +# knob changes (a password-only change won't retrigger — taint to force). +resource "terraform_data" "bootstrap" { + count = var.run_setup && var.walrus_src_tarball != "" ? 1 : 0 + + lifecycle { + precondition { + condition = var.pg_password != "" + error_message = "run_setup requires pg_password (PGPASSWORD for the bench role)." + } + } + + triggers_replace = { + src = terraform_data.walrus_src[0].id + bucket = aws_s3_bucket.bench.id + region = var.region + pg_user = var.pg_user + upload_concurrency = var.upload_concurrency + } + + connection { + type = "ssh" + host = aws_instance.bench.public_ip + user = "ubuntu" + private_key = tls_private_key.bench.private_key_pem + } + + provisioner "file" { + content = templatefile("${path.module}/config.env.tftpl", { + bucket = aws_s3_bucket.bench.id + region = var.region + pg_user = var.pg_user + pg_password = var.pg_password + upload_concurrency = var.upload_concurrency + }) + destination = "/home/ubuntu/walrus/bench/config.env" + } + + # m5d local NVMe is mounted by 00_mount_nvme.sh, so no SKIP_MOUNT + provisioner "remote-exec" { + inline = [ + "cd /home/ubuntu/walrus/bench && sudo bash setup.sh", + ] + } +} diff --git a/bench/terraform/iam.tf b/bench/terraform/iam.tf index 01f5896..c887339 100644 --- a/bench/terraform/iam.tf +++ b/bench/terraform/iam.tf @@ -1,6 +1,4 @@ -# Instance role scoped to the bench bucket. 11_write_walg_env.sh bridges these -# IMDSv2 creds into wal-g.env (walrus has no IMDS credential chain); the aws CLI -# and pgbackrest read the instance role directly. +# Instance role scoped to bench bucket data "aws_iam_policy_document" "assume_role" { statement { diff --git a/bench/terraform/instances.tf b/bench/terraform/instances.tf index e5a3f55..3829f6e 100644 --- a/bench/terraform/instances.tf +++ b/bench/terraform/instances.tf @@ -39,10 +39,7 @@ resource "local_sensitive_file" "ssh_key" { file_permission = "0600" } -# All-in-one bench box: PG18 + wal-g/walrus daemons + local pgbench driver. -# The 'd' instance family ships a local NVMe instance-store; 00_mount_nvme.sh -# detects the non-root NVMe, mkfs.ext4, and mounts it at /dat (PGDATA + WAL + -# restore). Root holds the Go/Rust toolchains + build trees. +# Bench box with local NVMe for /dat resource "aws_instance" "bench" { ami = data.aws_ami.ubuntu_noble.id instance_type = var.instance_type diff --git a/bench/terraform/network.tf b/bench/terraform/network.tf index 4ad2fbd..f07fd16 100644 --- a/bench/terraform/network.tf +++ b/bench/terraform/network.tf @@ -1,5 +1,4 @@ -# Dedicated VPC + public subnet so the box is isolated from any existing VPCs. -# Single host, so no intra-SG PG rule is needed (pgbench talks to PG over loopback). +# Dedicated VPC + public subnet for bench box data "aws_availability_zones" "available" { state = "available" diff --git a/bench/terraform/outputs.tf b/bench/terraform/outputs.tf index 83325be..fb96cef 100644 --- a/bench/terraform/outputs.tf +++ b/bench/terraform/outputs.tf @@ -22,3 +22,15 @@ output "region" { description = "AWS region of the bench resources." value = var.region } + +output "walrus_src_remote" { + description = "Where walrus_src_tarball landed on the box (unpacked tree + kept tarball), or a hint when disabled." + value = var.walrus_src_tarball != "" ? "unpacked to /home/ubuntu/walrus; tarball at /home/ubuntu/walrus-src.tar.gz (point WALRUS_SRC_TARBALL there)" : "(set -var walrus_src_tarball=... to upload source)" +} + +output "next_steps" { + description = "What to do after apply." + value = var.run_setup ? "Box bootstrapped. SSH in: ssh -i ${local_sensitive_file.ssh_key.filename} ubuntu@${aws_instance.bench.public_ip}; then cd walrus/bench && bash scripts/sut/40_smoke_test.sh, seed the DB, run matrix.sh." : ( + var.walrus_src_tarball != "" ? "Source uploaded. SSH in, cd walrus/bench, fill config.env, run sudo ./setup.sh (or re-apply with -var run_setup=true)." : "Bare box. Get the harness onto it (set walrus_src_tarball, or scp/clone), then run setup.sh." + ) +} diff --git a/bench/terraform/s3.tf b/bench/terraform/s3.tf index d7510d5..5c1f334 100644 --- a/bench/terraform/s3.tf +++ b/bench/terraform/s3.tf @@ -2,9 +2,7 @@ resource "random_id" "suffix" { byte_length = 4 } -# Private bench bucket; force_destroy + a 7-day lifecycle so a forgotten teardown -# does not leak storage. setup/smoke default to s3:///walg-bench; runs -# scope walrus/wal-g and pgbackrest below tool-specific prefixes by run. +# Private bench bucket with short object lifecycle resource "aws_s3_bucket" "bench" { bucket = "walrus-bench-${random_id.suffix.hex}" force_destroy = true diff --git a/bench/terraform/variables.tf b/bench/terraform/variables.tf index 0f62dfe..52f7787 100644 --- a/bench/terraform/variables.tf +++ b/bench/terraform/variables.tf @@ -1,6 +1,4 @@ -# Single-box IaC for the in-repo bench (driver == SUT == one host). The external -# multi-replica fleet lives elsewhere; this provisions exactly one all-in-one box -# that runs PG18 + the archivers + pgbench locally, per the bench/ README. +# Single-box IaC for in-repo bench variable "region" { description = "AWS region for all bench resources." @@ -9,9 +7,9 @@ variable "region" { } variable "profile" { - description = "AWS CLI named profile used for provisioning." + description = "AWS CLI named profile used for provisioning. Override per your setup with terraform.tfvars." type = string - default = "pg-dev-postgresqladmindev" + default = "default" } variable "instance_type" { @@ -29,3 +27,39 @@ variable "my_ip" { error_message = "my_ip must be a valid CIDR, e.g. 203.0.113.4/32." } } + +variable "walrus_src_tarball" { + description = "Local path to a walrus source tarball (from bench/scripts/make_source_tarball.sh) to upload + unpack on the box, so it needs no git checkout. Empty disables upload (build from an in-place checkout instead)." + type = string + default = "" + + validation { + condition = var.walrus_src_tarball == "" || fileexists(var.walrus_src_tarball) + error_message = "walrus_src_tarball must be empty or point at an existing file; build it with bench/scripts/make_source_tarball.sh." + } +} + +variable "run_setup" { + description = "After uploading the tarball, generate config.env and run setup.sh to bootstrap the box (PG18 + build all tools + systemd units). Requires walrus_src_tarball and pg_password. S3 creds come from the instance profile via IMDS, so no AWS keys are written." + type = bool + default = false +} + +variable "pg_user" { + description = "Bench Postgres role created by setup.sh (PGUSER in config.env)." + type = string + default = "walbench" +} + +variable "pg_password" { + description = "Password for the bench Postgres role (PGPASSWORD). Required when run_setup is true; never echoed." + type = string + default = "" + sensitive = true +} + +variable "upload_concurrency" { + description = "UPLOAD_CONCURRENCY in config.env (wal-g concurrency / pgbackrest process-max)." + type = number + default = 4 +} diff --git a/bench/tools/Cargo.toml b/bench/tools/Cargo.toml index 1c7cd81..c1463ac 100644 --- a/bench/tools/Cargo.toml +++ b/bench/tools/Cargo.toml @@ -11,6 +11,10 @@ description = "walrus bench harness: 1 Hz resource sampler + CSV/plot analyzer" # parent package (which has no [workspace.members]). [workspace] +[lib] +name = "bench_tools" +path = "src/lib.rs" + [[bin]] name = "bench-sampler" path = "src/bin/sampler.rs" @@ -19,6 +23,10 @@ path = "src/bin/sampler.rs" name = "bench-analyze" path = "src/bin/analyze.rs" +[[bin]] +name = "bench-compare" +path = "src/bin/compare.rs" + [dependencies] ab_glyph = "0.2.32" anyhow = "1.0.102" diff --git a/bench/tools/src/bin/analyze.rs b/bench/tools/src/bin/analyze.rs index 9b0b006..968c8d0 100644 --- a/bench/tools/src/bin/analyze.rs +++ b/bench/tools/src/bin/analyze.rs @@ -1,15 +1,13 @@ -//! bench-analyze — plot + summarize walrus vs wal-g vs pgbackrest runs -//! (Rust port of plot.py; tiny-skia rasterizes the canvas, ab_glyph draws text, -//! tiny-skia's png-format writes the PNG). +//! bench-analyze, plot + summarize walrus vs wal-g vs pgbackrest runs //! -//! Reads the 1 Hz sampler CSVs for one or more run dirs and emits, into --out: +//! Reads sampler CSVs and emits: //! mem_over_time.png two panels: VmRSS (top), VmPeak (bottom) //! backlog.png *.ready backlog over time (archive keep-up) //! upload_rate.png tx_bytes upload rate (MB/s) //! cpu.png daemon CPU % over time -//! Replicas of a variant are aggregated: bold = median, band = min..max. +//! Replicas aggregate to median, band = min..max //! -//! Plus self-describing raw exports (every row carries run metadata): +//! Raw exports: //! samples_.csv long table, one row per sample per run //! summary_.csv one row per run: metadata + aggregates //! summary.json same per-run aggregates as JSON @@ -17,17 +15,19 @@ use std::collections::{BTreeMap, HashMap}; use std::path::Path; -use ab_glyph::{Font, FontRef, PxScale, ScaleFont, point}; use anyhow::{Context, Result}; use clap::Parser; -use tiny_skia::{ - Color, FillRule, LineCap, LineJoin, Paint, Path as SkPath, PathBuilder, Pixmap, Rect, Stroke, - StrokeDash, Transform, +use tiny_skia::{Color, Pixmap}; + +use bench_tools::viz::{ + FG, Fonts, HEADER_H, MUTED, SEL, Style, W, fill_poly, fmt_num, load_fonts, median, nice_ticks, + stroke_poly, style_for, text_at, text_center, text_left_mid, text_right, text_vert, text_width, + variant_of, variants_ordered, }; const KB: f64 = 1024.0; const MB: f64 = 1024.0 * 1024.0; -const WAL_SEG_MB: f64 = 16.0; // archived_count -> MB approximation +const WAL_SEG_MB: f64 = 16.0; // archived_count to MB const META_KEYS: [&str; 7] = [ "daemon", @@ -59,43 +59,16 @@ const SAMPLE_COLS: [&str; 17] = [ "archived_mb", ]; -#[derive(Clone, Copy)] -struct Rgb(u8, u8, u8); - -const BG: Rgb = Rgb(0x29, 0x25, 0x22); -const FLOAT: Rgb = Rgb(0x34, 0x30, 0x2C); -const SEL: Rgb = Rgb(0x40, 0x3A, 0x36); -const MUTED: Rgb = Rgb(0xC1, 0xA7, 0x8E); -const FG: Rgb = Rgb(0xEC, 0xE1, 0xD7); - -fn variant_color(variant: &str) -> Option { - match variant { - "walrus" | "walrus-serial" => Some(Rgb(0xFA, 0xFF, 0x69)), - "walg" => Some(Rgb(0xFC, 0x3F, 0x1D)), - "pgbackrest" => Some(Rgb(0x27, 0x68, 0x9D)), - _ => None, - } -} - -const FALLBACK: [Rgb; 6] = [ - Rgb(0xA3, 0xA9, 0xCE), - Rgb(0x85, 0xB6, 0x95), - Rgb(0xCF, 0x9B, 0xC2), - Rgb(0x89, 0xB3, 0xB6), - Rgb(0xE4, 0x9B, 0x5D), - Rgb(0xB3, 0x80, 0xB0), -]; - #[derive(Parser)] #[command(about = "plot + summarize bench runs")] struct Args { - /// run directory (repeatable; pair with --label in order) + /// run directory, repeatable #[arg(long = "run", required = true)] runs: Vec, - /// label for the matching --run (repeatable) + /// label matching --run, repeatable #[arg(long = "label", required = true)] labels: Vec, - /// output directory for plots + exports + /// output directory #[arg(long)] out: String, /// timestamp tag for output filenames (default: now, UTC) @@ -111,27 +84,6 @@ fn pf(s: Option<&String>) -> Option { if t.is_empty() { None } else { t.parse().ok() } } -fn variant_of(label: &str) -> String { - // strip a trailing -b - if let Some(idx) = label.rfind("-b") { - let suffix = &label[idx + 2..]; - if !suffix.is_empty() && suffix.bytes().all(|b| b.is_ascii_digit()) { - return label[..idx].to_string(); - } - } - label.to_string() -} - -fn box_of(label: &str) -> String { - if let Some(idx) = label.rfind("-b") { - let suffix = &label[idx + 2..]; - if !suffix.is_empty() && suffix.bytes().all(|b| b.is_ascii_digit()) { - return suffix.to_string(); - } - } - "0".to_string() -} - fn read_provenance(dir: &Path) -> HashMap { let mut meta = HashMap::new(); if let Ok(text) = std::fs::read_to_string(dir.join("provenance.txt")) { @@ -225,7 +177,7 @@ impl Sample { } } -/// Extracts a plotted metric from a sample (None = absent at that tick). +/// Plotted metric getter type Getter = Box Option>; struct Run { @@ -239,8 +191,7 @@ struct Run { fn load_run(dir: &str, label: &str) -> Option { let d = Path::new(dir); - // Degraded cells (failed burst workers, receiver shipped nothing) are stamped - // INVALID by the drivers; exclude them so a weaker workload is not averaged in. + // Skip degraded cells if d.join("INVALID").exists() { eprintln!("warning: {dir} marked INVALID, skipping"); return None; @@ -275,7 +226,7 @@ fn load_run(dir: &str, label: &str) -> Option { }) .collect(); - // tx upload rate: derivative over consecutive net samples, keyed by later ts. + // Upload rate from consecutive net samples let mut net: HashMap> = HashMap::new(); let (mut prev_ts, mut prev_tx): (Option, Option) = (None, None); for r in read_csv(d, "net.csv").unwrap_or_default() { @@ -293,7 +244,7 @@ fn load_run(dir: &str, label: &str) -> Option { } let variant = variant_of(label); - let boxid = box_of(label); + let boxid = bench_tools::viz::box_of(label); let ts0 = pf(mem[0].get("ts")).unwrap_or(0.0); let div = |r: &HashMap, k: &str, denom: f64| pf(r.get(k)).map(|v| v / denom); @@ -341,41 +292,6 @@ fn load_run(dir: &str, label: &str) -> Option { // -------------------------------------------------------------------------- // Aggregation // -------------------------------------------------------------------------- -#[derive(Clone)] -struct Style { - color: Rgb, - dashed: bool, - z: i32, -} - -fn style_for(variant: &str, idx: usize) -> Style { - Style { - color: variant_color(variant).unwrap_or(FALLBACK[idx % FALLBACK.len()]), - dashed: variant.ends_with("-serial"), - z: if variant.starts_with("walrus") { 10 } else { 4 }, - } -} - -fn variants_ordered(labels: &[String]) -> Vec { - let mut vs: Vec = labels.iter().map(|l| variant_of(l)).collect(); - vs.sort(); - vs.dedup(); - vs.sort_by_key(|v| (!v.starts_with("walrus"), v.clone())); - vs -} - -fn median(xs: &mut [f64]) -> f64 { - xs.sort_by(f64::total_cmp); - let n = xs.len(); - if n == 0 { - 0.0 - } else if n % 2 == 1 { - xs[n / 2] - } else { - (xs[n / 2 - 1] + xs[n / 2]) / 2.0 - } -} - struct Series { label: String, xs: Vec, @@ -391,7 +307,7 @@ fn panel_series( style_map: &HashMap, get: impl Fn(&Sample) -> Option, ) -> Vec { - // variant -> elapsed-second -> values + // variant -> elapsed second -> values let mut buckets: HashMap<&str, BTreeMap>> = HashMap::new(); for s in samples { if let Some(v) = get(s) { @@ -429,12 +345,7 @@ fn panel_series( } // -------------------------------------------------------------------------- -// Rendering (tiny-skia raster + ab_glyph text) -// -// We own a small canvas: tiny-skia gives AA strokes/fills and the PNG encoder -// (png-format feature); ab_glyph rasterizes glyph coverage that we blend in. -// No plotters, no `image`. Dashing is tiny-skia's native StrokeDash (pixel -// space), so the old data-space dash_segments hack is gone. +// Rendering // -------------------------------------------------------------------------- struct Panel { title: String, @@ -442,311 +353,8 @@ struct Panel { series: Vec, } -const W: u32 = 1180; -const HEADER_H: u32 = 48; const PANEL_H: u32 = 340; -struct Fonts { - regular: FontRef<'static>, - bold: FontRef<'static>, -} - -fn load_fonts() -> Result { - let load = |b: &'static [u8]| { - FontRef::try_from_slice(b).map_err(|_| anyhow::anyhow!("embedded font parse failed")) - }; - Ok(Fonts { - regular: load(dejavu::sans::regular())?, - bold: load(dejavu::sans::bold())?, - }) -} - -fn paint_rgb(c: Rgb, a: u8) -> Paint<'static> { - let mut p = Paint::default(); - p.set_color_rgba8(c.0, c.1, c.2, a); - p.anti_alias = true; - p -} - -fn polyline(pts: &[(f32, f32)]) -> Option { - let mut pb = PathBuilder::new(); - let (x0, y0) = *pts.first()?; - pb.move_to(x0, y0); - for &(x, y) in &pts[1..] { - pb.line_to(x, y); - } - pb.finish() -} - -fn stroke_poly( - pm: &mut Pixmap, - pts: &[(f32, f32)], - c: Rgb, - a: u8, - width: f32, - dash: Option<[f32; 2]>, -) { - let Some(path) = polyline(pts) else { return }; - let mut stroke = Stroke { - width, - line_cap: LineCap::Round, - line_join: LineJoin::Round, - ..Default::default() - }; - if let Some([on, off]) = dash { - stroke.dash = StrokeDash::new(vec![on, off], 0.0); - } - pm.stroke_path( - &path, - &paint_rgb(c, a), - &stroke, - Transform::identity(), - None, - ); -} - -fn fill_poly(pm: &mut Pixmap, pts: &[(f32, f32)], c: Rgb, a: u8) { - let mut pb = PathBuilder::new(); - let Some(&(x0, y0)) = pts.first() else { return }; - pb.move_to(x0, y0); - for &(x, y) in &pts[1..] { - pb.line_to(x, y); - } - pb.close(); - let Some(path) = pb.finish() else { return }; - pm.fill_path( - &path, - &paint_rgb(c, a), - FillRule::Winding, - Transform::identity(), - None, - ); -} - -fn fill_rect(pm: &mut Pixmap, x: f32, y: f32, w: f32, h: f32, c: Rgb, a: u8) { - if let Some(r) = Rect::from_xywh(x, y, w, h) { - pm.fill_rect(r, &paint_rgb(c, a), Transform::identity(), None); - } -} - -fn stroke_rect(pm: &mut Pixmap, x: f32, y: f32, w: f32, h: f32, c: Rgb, width: f32) { - stroke_poly( - pm, - &[(x, y), (x + w, y), (x + w, y + h), (x, y + h), (x, y)], - c, - 255, - width, - None, - ); -} - -/// "Nice" axis ticks: a round step (1/2/5 x 10^k) spanning [lo, hi]. -fn nice_ticks(lo: f64, hi: f64, target: usize) -> Vec { - if hi <= lo || target == 0 { - return vec![lo]; - } - let raw = (hi - lo) / target as f64; - let mag = 10f64.powf(raw.log10().floor()); - let norm = raw / mag; - let step = mag - * if norm < 1.5 { - 1.0 - } else if norm < 3.0 { - 2.0 - } else if norm < 7.0 { - 5.0 - } else { - 10.0 - }; - let mut t = (lo / step).ceil() * step; - let mut out = Vec::new(); - while t <= hi + step * 1e-9 { - out.push(t); - t += step; - } - out -} - -// --- text: rasterize a string into a premultiplied pixmap, then blit ---------- -fn text_width(font: &FontRef<'static>, px: f32, text: &str) -> f32 { - let sf = font.as_scaled(PxScale::from(px)); - let mut w = 0.0; - let mut prev = None; - for ch in text.chars() { - let g = font.glyph_id(ch); - if let Some(p) = prev { - w += sf.kern(p, g); - } - w += sf.h_advance(g); - prev = Some(g); - } - w -} - -fn text_pixmap(font: &FontRef<'static>, px: f32, text: &str, c: Rgb) -> Option { - let sf = font.as_scaled(PxScale::from(px)); - let w = (text_width(font, px, text).ceil() as u32 + 2).max(1); - let h = ((sf.ascent() - sf.descent()).ceil() as u32 + 2).max(1); - let mut pm = Pixmap::new(w, h)?; - let baseline = sf.ascent() + 1.0; - let data = pm.data_mut(); - let mut caret = 1.0f32; - let mut prev = None; - for ch in text.chars() { - let gid = font.glyph_id(ch); - if let Some(p) = prev { - caret += sf.kern(p, gid); - } - let glyph = gid.with_scale_and_position(PxScale::from(px), point(caret, baseline)); - caret += sf.h_advance(gid); - prev = Some(gid); - let Some(og) = font.outline_glyph(glyph) else { - continue; - }; - let bb = og.px_bounds(); - og.draw(|gx, gy, cov| { - let x = bb.min.x as i32 + gx as i32; - let y = bb.min.y as i32 + gy as i32; - if x < 0 || y < 0 || x as u32 >= w || y as u32 >= h { - return; - } - let a = (cov.clamp(0.0, 1.0) * 255.0).round() as u8; - if a == 0 { - return; - } - let i = ((y as u32 * w + x as u32) * 4) as usize; - // glyph boxes can overlap; keep the stronger coverage. premultiplied. - if a >= data[i + 3] { - let pre = |v: u8| ((v as u16 * a as u16) / 255) as u8; - data[i] = pre(c.0); - data[i + 1] = pre(c.1); - data[i + 2] = pre(c.2); - data[i + 3] = a; - } - }); - } - Some(pm) -} - -/// Blit a premultiplied src onto an opaque dst (src-over keeps dst opaque). -/// rotate_ccw=true places src rotated 90° counter-clockwise (vertical y-labels). -fn blit(dst: &mut Pixmap, src: &Pixmap, dx: i32, dy: i32, rotate_ccw: bool) { - let (dw, dh) = (dst.width(), dst.height()); - let (sw, sh) = (src.width(), src.height()); - let s = src.data(); - let d = dst.data_mut(); - for sy in 0..sh { - for sx in 0..sw { - let si = ((sy * sw + sx) * 4) as usize; - let a = s[si + 3]; - if a == 0 { - continue; - } - let (rx, ry) = if rotate_ccw { - (sy as i32, (sw - 1 - sx) as i32) - } else { - (sx as i32, sy as i32) - }; - let (px_, py_) = (dx + rx, dy + ry); - if px_ < 0 || py_ < 0 || px_ as u32 >= dw || py_ as u32 >= dh { - continue; - } - let di = ((py_ as u32 * dw + px_ as u32) * 4) as usize; - let inv = (255 - a) as u16; - for k in 0..3 { - d[di + k] = (s[si + k] as u16 + d[di + k] as u16 * inv / 255) as u8; - } - d[di + 3] = 255; - } - } -} - -fn text_at( - pm: &mut Pixmap, - font: &FontRef<'static>, - x: f32, - top: f32, - px: f32, - text: &str, - c: Rgb, -) { - if let Some(t) = text_pixmap(font, px, text, c) { - blit(pm, &t, x.round() as i32, top.round() as i32, false); - } -} - -fn text_center( - pm: &mut Pixmap, - font: &FontRef<'static>, - cx: f32, - top: f32, - px: f32, - text: &str, - c: Rgb, -) { - if let Some(t) = text_pixmap(font, px, text, c) { - blit( - pm, - &t, - (cx - t.width() as f32 / 2.0).round() as i32, - top.round() as i32, - false, - ); - } -} - -fn text_right( - pm: &mut Pixmap, - font: &FontRef<'static>, - right: f32, - cy: f32, - px: f32, - text: &str, - c: Rgb, -) { - if let Some(t) = text_pixmap(font, px, text, c) { - let x = (right - t.width() as f32).round() as i32; - let y = (cy - t.height() as f32 / 2.0).round() as i32; - blit(pm, &t, x, y, false); - } -} - -fn text_left_mid( - pm: &mut Pixmap, - font: &FontRef<'static>, - x: f32, - cy: f32, - px: f32, - text: &str, - c: Rgb, -) { - if let Some(t) = text_pixmap(font, px, text, c) { - blit( - pm, - &t, - x.round() as i32, - (cy - t.height() as f32 / 2.0).round() as i32, - false, - ); - } -} - -fn text_vert( - pm: &mut Pixmap, - font: &FontRef<'static>, - left: f32, - cy: f32, - px: f32, - text: &str, - c: Rgb, -) { - if let Some(t) = text_pixmap(font, px, text, c) { - // rotated box is t.height() wide x t.width() tall; center on cy. - let top = (cy - t.width() as f32 / 2.0).round() as i32; - blit(pm, &t, left.round() as i32, top, true); - } -} - fn render( panels: &[Panel], out_path: &Path, @@ -765,7 +373,12 @@ fn render( .max(1.0); let mut pm = Pixmap::new(W, height).context("alloc pixmap")?; - pm.fill(Color::from_rgba8(BG.0, BG.1, BG.2, 255)); + pm.fill(Color::from_rgba8( + bench_tools::viz::BG.0, + bench_tools::viz::BG.1, + bench_tools::viz::BG.2, + 255, + )); text_at(&mut pm, &fonts.bold, 24.0, 8.0, 20.0, header, FG); if !suffix.is_empty() { text_at(&mut pm, &fonts.regular, 24.0, 30.0, 14.0, suffix, MUTED); @@ -834,7 +447,7 @@ fn draw_panel( MUTED, ); } - // axis spines (left + bottom), slightly heavier + // Axis spines stroke_poly( pm, &[(left, top), (left, bottom), (right, bottom)], @@ -863,7 +476,7 @@ fn draw_panel( MUTED, ); - // bands first (under every line) + // Bands below lines for s in &panel.series { if s.xs.len() < 2 { continue; @@ -877,7 +490,7 @@ fn draw_panel( fill_poly(pm, &poly, s.style.color, 36); } - // median lines by ascending z (walrus rides on top) + // Higher z draws on top let mut ordered: Vec<&Series> = panel.series.iter().collect(); ordered.sort_by_key(|s| s.style.z); for s in ordered { @@ -903,8 +516,8 @@ fn draw_legend(pm: &mut Pixmap, series: &[Series], x: f32, y: f32, fonts: &Fonts .fold(0.0_f32, f32::max); let bw = pad + swatch + gap + tw + pad; let bh = pad * 2.0 + row * series.len() as f32; - fill_rect(pm, x, y, bw, bh, FLOAT, 255); - stroke_rect(pm, x, y, bw, bh, SEL, 1.0); + bench_tools::viz::fill_rect(pm, x, y, bw, bh, bench_tools::viz::FLOAT, 255); + bench_tools::viz::stroke_rect(pm, x, y, bw, bh, SEL, 1.0); for (i, s) in series.iter().enumerate() { let cy = y + pad + row * i as f32 + row / 2.0; let lx = x + pad; @@ -924,17 +537,6 @@ fn draw_legend(pm: &mut Pixmap, series: &[Series], x: f32, y: f32, fonts: &Fonts // -------------------------------------------------------------------------- // Summary + formatting // -------------------------------------------------------------------------- -fn fmt_num(v: f64) -> String { - if v.is_finite() && v.fract() == 0.0 && v.abs() < 1e15 { - format!("{}", v as i64) - } else { - // %g-ish: trim trailing zeros from a 6-sig-figure rendering. - let s = format!("{v:.6}"); - let s = s.trim_end_matches('0').trim_end_matches('.'); - s.to_string() - } -} - fn summarize(run: &Run) -> Vec<(String, serde_json::Value)> { use serde_json::json; let collect = |get: &dyn Fn(&Sample) -> Option| -> Vec { @@ -1155,7 +757,7 @@ fn main() -> Result<()> { w.flush()?; let rows: Vec> = runs.iter().map(summarize).collect(); - // CSV field order = first-seen union across rows. + // CSV field order follows first-seen union let mut fields: Vec = Vec::new(); for row in &rows { for (k, _) in row { @@ -1178,7 +780,7 @@ fn main() -> Result<()> { } sw.flush()?; - // summary.json: BTreeMap-backed objects => keys sorted (matches sort_keys). + // BTreeMap keeps summary.json keys sorted let runs_json: Vec = rows .iter() .map(|row| { @@ -1198,36 +800,3 @@ fn main() -> Result<()> { ); Ok(()) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn variant_and_box() { - assert_eq!(variant_of("walrus-serial-b0"), "walrus-serial"); - assert_eq!(box_of("walrus-serial-b0"), "0"); - assert_eq!(variant_of("walrus-r1"), "walrus-r1"); // -r1 is not -b - assert_eq!(box_of("walrus"), "0"); - assert_eq!(variant_of("pgbackrest-b12"), "pgbackrest"); - assert_eq!(box_of("pgbackrest-b12"), "12"); - } - - #[test] - fn median_odd_even() { - assert_eq!(median(&mut [3.0, 1.0, 2.0]), 2.0); - assert_eq!(median(&mut [4.0, 1.0, 3.0, 2.0]), 2.5); - } - - #[test] - fn fmt_num_int_vs_float() { - assert_eq!(fmt_num(42.0), "42"); - assert_eq!(fmt_num(2.5), "2.5"); - } - - #[test] - fn variant_order_walrus_first() { - let v = variants_ordered(&["walg-b0".into(), "walrus-b0".into(), "pgbackrest-b0".into()]); - assert_eq!(v[0], "walrus"); - } -} diff --git a/bench/tools/src/bin/compare.rs b/bench/tools/src/bin/compare.rs new file mode 100644 index 0000000..69e1faf --- /dev/null +++ b/bench/tools/src/bin/compare.rs @@ -0,0 +1,589 @@ +//! bench-compare, grouped-bar comparison across tools +//! +//! Reads op_metrics.txt, cpu.csv, and mem.csv from each --run. Emits: +//! backup_compare.png stacked bar panels: size, duration, CPU, RSS, VM +//! ops_summary.md per-op table (size / elapsed / CPU / RSS) +//! ops_compare_.csv one row per (op, variant): every metric +//! +//! Replicas aggregate to median + +use std::collections::HashMap; +use std::path::Path; + +use anyhow::{Context, Result}; +use clap::Parser; +use tiny_skia::{Color, Pixmap}; + +use bench_tools::viz::{ + BG, FG, Fonts, HEADER_H, MUTED, SEL, Style, W, fill_rect, fmt_num, load_fonts, median, + nice_ticks, stroke_poly, stroke_rect, style_for, text_at, text_center, text_left_mid, + text_right, text_vert, text_width, variant_of, variants_ordered, +}; + +/// Backup ops in display order; wal-receive is opt-in via --ops +const DEFAULT_OPS: [&str; 5] = [ + "backup-send", + "backup-delta", + "backup-delta-sidecar", + "backup-delta-summaries", + "backup-fetch", +]; + +const PANEL_H: u32 = 300; + +#[derive(Parser)] +#[command(about = "grouped-bar comparison of per-op backup metrics across tools")] +struct Args { + /// run directory, repeatable + #[arg(long = "run", required = true)] + runs: Vec, + /// label matching --run, repeatable + #[arg(long = "label", required = true)] + labels: Vec, + /// output directory + #[arg(long)] + out: String, + /// comma-separated op order/filter + #[arg(long)] + ops: Option, + /// timestamp tag for output filenames (default: now, UTC) + #[arg(long)] + stamp: Option, +} + +#[derive(Clone, Copy)] +enum Unit { + Gb, + Sec, + Pct, + Mb, +} + +fn fmt_val(v: f64, unit: Unit) -> String { + match unit { + Unit::Gb if v < 10.0 => format!("{v:.2}"), + Unit::Gb if v < 100.0 => format!("{v:.1}"), + Unit::Sec if v < 10.0 => format!("{v:.1}"), + Unit::Pct | Unit::Mb | Unit::Gb | Unit::Sec => format!("{v:.0}"), + } +} + +// -------------------------------------------------------------------------- +// Ingest +// -------------------------------------------------------------------------- +fn read_kv(path: &Path) -> HashMap { + let mut m = HashMap::new(); + if let Ok(text) = std::fs::read_to_string(path) { + for line in text.lines() { + if line.trim_start().starts_with('#') { + continue; + } + if let Some((k, v)) = line.split_once('=') { + m.insert(k.trim().to_string(), v.trim().to_string()); + } + } + } + m +} + +/// Aggregate CSV columns, skipping blanks +fn col_agg(dir: &Path, file: &str, cols: &[&str], reduce: impl Fn(f64, f64) -> f64) -> Option { + let mut rdr = csv::ReaderBuilder::new() + .flexible(true) + .from_path(dir.join(file)) + .ok()?; + let headers = rdr.headers().ok()?.clone(); + let idx: Vec = cols + .iter() + .filter_map(|c| headers.iter().position(|h| h == *c)) + .collect(); + if idx.is_empty() { + return None; + } + let mut acc: Option = None; + for rec in rdr.records().flatten() { + for &i in &idx { + if let Some(v) = rec.get(i).and_then(|s| s.trim().parse::().ok()) { + acc = Some(acc.map_or(v, |a| reduce(a, v))); + } + } + } + acc +} + +struct Cell { + op: String, + variant: String, + size_gb: Option, + dur_s: Option, + peak_cpu: Option, + mean_cpu: Option, + peak_rss_mb: Option, + peak_vm_mb: Option, +} + +fn load_cell(dir: &str, label: &str) -> Option { + let d = Path::new(dir); + if d.join("INVALID").exists() { + eprintln!("warning: {dir} marked INVALID, skipping"); + return None; + } + let m = read_kv(&d.join("op_metrics.txt")); + let op = m.get("op").cloned(); + let Some(op) = op else { + eprintln!("warning: {dir} has no op_metrics.txt op=, skipping"); + return None; + }; + let f = |k: &str| m.get(k).and_then(|s| s.parse::().ok()); + let peak_cpu = col_agg(d, "cpu.csv", &["pct_cpu"], f64::max); + let mean = { + let mut rdr = csv::ReaderBuilder::new() + .flexible(true) + .from_path(d.join("cpu.csv")) + .ok(); + rdr.as_mut().and_then(|r| { + let hi = r.headers().ok()?.iter().position(|h| h == "pct_cpu")?; + let (mut sum, mut n) = (0.0, 0u32); + for rec in r.records().flatten() { + if let Some(v) = rec.get(hi).and_then(|s| s.trim().parse::().ok()) { + sum += v; + n += 1; + } + } + (n > 0).then(|| sum / n as f64) + }) + }; + Some(Cell { + op, + variant: variant_of(label), + size_gb: f("bytes_processed").map(|b| b / 1e9), + dur_s: f("elapsed_s"), + peak_cpu, + mean_cpu: mean, + // vmhwm is peak resident high-water; fall back to vmrss + peak_rss_mb: col_agg(d, "mem.csv", &["vmhwm_kb", "vmrss_kb"], f64::max) + .map(|kb| kb / 1024.0), + // vmpeak is peak virtual address reservation (Go runtime inflates this) + peak_vm_mb: col_agg(d, "mem.csv", &["vmpeak_kb"], f64::max).map(|kb| kb / 1024.0), + }) +} + +/// Median metric for (op, variant) +fn agg(cells: &[&Cell], get: impl Fn(&Cell) -> Option) -> Option { + let mut vs: Vec = cells.iter().filter_map(|c| get(c)).collect(); + if vs.is_empty() { + None + } else { + Some(median(&mut vs)) + } +} + +// -------------------------------------------------------------------------- +// Bar rendering +// -------------------------------------------------------------------------- +#[derive(Clone, Copy)] +enum Scale { + Linear, + Log, +} + +struct BarPanel { + title: String, + ylabel: String, + unit: Unit, + scale: Scale, + /// (op, variant) value + vals: HashMap<(String, String), f64>, +} + +/// Log ticks inside [lo, hi] +fn log_ticks(lo: f64, hi: f64) -> Vec { + let mut out = Vec::new(); + let k0 = lo.log10().floor() as i32; + let k1 = hi.log10().ceil() as i32; + for k in k0..=k1 { + for m in [1.0, 2.0, 5.0] { + let v = m * 10f64.powi(k); + if v >= lo * 0.999 && v <= hi * 1.001 { + out.push(v); + } + } + } + if out.is_empty() { + out.push(lo); + } + out +} + +fn draw_bar_panel( + pm: &mut Pixmap, + panel: &BarPanel, + py0: u32, + ops: &[String], + variants: &[String], + style_map: &HashMap, + fonts: &Fonts, +) { + let maxv = panel.vals.values().copied().fold(0.0_f64, f64::max); + let (lo, hi) = match panel.scale { + Scale::Linear => (0.0, if maxv > 0.0 { maxv * 1.08 } else { 1.0 }), + Scale::Log => { + let minv = panel + .vals + .values() + .copied() + .filter(|v| *v > 0.0) + .fold(f64::INFINITY, f64::min); + let minv = if minv.is_finite() { minv } else { 1.0 }; + ( + 10f64.powi(minv.log10().floor() as i32), + 10f64.powi((maxv.max(minv) * 1.0001).log10().ceil() as i32), + ) + } + }; + + let left = 84.0_f32; + let right = W as f32 - 16.0; + let top = py0 as f32 + 38.0; + let bottom = py0 as f32 + PANEL_H as f32 - 40.0; + + text_at( + pm, + &fonts.bold, + left, + py0 as f32 + 12.0, + 16.0, + &panel.title, + FG, + ); + + // y to pixel, clamped to baseline + let sy = |v: f64| -> f32 { + match panel.scale { + Scale::Linear => { + bottom - ((v - lo) / (hi - lo)).clamp(0.0, 1.0) as f32 * (bottom - top) + } + Scale::Log => { + let v = v.max(lo); + let f = (v.log10() - lo.log10()) / (hi.log10() - lo.log10()); + bottom - f.clamp(0.0, 1.0) as f32 * (bottom - top) + } + } + }; + + let ticks = match panel.scale { + Scale::Linear => nice_ticks(0.0, hi, 5), + Scale::Log => log_ticks(lo, hi), + }; + for t in &ticks { + let yy = sy(*t); + stroke_poly(pm, &[(left, yy), (right, yy)], SEL, 255, 1.0, None); + text_right( + pm, + &fonts.regular, + left - 8.0, + yy, + 13.0, + &fmt_num(*t), + MUTED, + ); + } + stroke_poly( + pm, + &[(left, top), (left, bottom), (right, bottom)], + SEL, + 255, + 1.5, + None, + ); + text_vert( + pm, + &fonts.regular, + 16.0, + (top + bottom) / 2.0, + 13.0, + &panel.ylabel, + MUTED, + ); + + let n_groups = ops.len().max(1); + let group_w = (right - left) / n_groups as f32; + let n_series = variants.len().max(1); + let inner = group_w * 0.78; + let slot = inner / n_series as f32; + let barw = slot * 0.86; + + for (gi, op) in ops.iter().enumerate() { + let gx = left + group_w * gi as f32; + let xc = gx + group_w / 2.0; + // Drop backup- prefix + let short = op.strip_prefix("backup-").unwrap_or(op); + text_center(pm, &fonts.regular, xc, bottom + 8.0, 13.0, short, MUTED); + + for (si, variant) in variants.iter().enumerate() { + let Some(&v) = panel.vals.get(&(op.clone(), variant.clone())) else { + continue; + }; + if v <= 0.0 { + continue; + } + let bx = gx + (group_w - inner) / 2.0 + slot * si as f32 + (slot - barw) / 2.0; + let by = sy(v); + let color = style_map[variant].color; + fill_rect(pm, bx, by, barw, bottom - by, color, 235); + stroke_rect(pm, bx, by, barw, bottom - by, BG, 1.0); + text_center( + pm, + &fonts.regular, + bx + barw / 2.0, + by - 14.0, + 11.0, + &fmt_val(v, panel.unit), + FG, + ); + } + } +} + +fn draw_hlegend( + pm: &mut Pixmap, + variants: &[String], + style_map: &HashMap, + fonts: &Fonts, +) { + let (sw, gap, fs) = (16.0_f32, 6.0_f32, 13.0_f32); + // Place legend from right edge + let mut x = W as f32 - 16.0; + for variant in variants.iter().rev() { + let label_w = text_width(&fonts.regular, fs, variant); + x -= label_w; + text_left_mid(pm, &fonts.regular, x, 24.0, fs, variant, FG); + x -= gap + sw; + fill_rect( + pm, + x, + 24.0 - sw / 2.0, + sw, + sw, + style_map[variant].color, + 235, + ); + stroke_rect(pm, x, 24.0 - sw / 2.0, sw, sw, SEL, 1.0); + x -= 16.0; + } +} + +fn main() -> Result<()> { + let args = Args::parse(); + if args.runs.len() != args.labels.len() { + anyhow::bail!("number of --run and --label must match"); + } + let fonts = load_fonts()?; + let stamp = args + .stamp + .clone() + .unwrap_or_else(|| chrono::Utc::now().format("%Y%m%dT%H%M%SZ").to_string()); + let out = Path::new(&args.out); + std::fs::create_dir_all(out).with_context(|| format!("mkdir {out:?}"))?; + + let cells: Vec = args + .runs + .iter() + .zip(&args.labels) + .filter_map(|(d, l)| load_cell(d, l)) + .collect(); + if cells.is_empty() { + anyhow::bail!("no loadable runs"); + } + + // Ops present, requested order first + let order: Vec = match &args.ops { + Some(s) => s.split(',').map(|o| o.trim().to_string()).collect(), + None => DEFAULT_OPS.iter().map(|s| s.to_string()).collect(), + }; + let mut ops: Vec = order + .iter() + .filter(|o| cells.iter().any(|c| &c.op == *o)) + .cloned() + .collect(); + // Append extra ops, stable + deduped + for c in &cells { + if !ops.contains(&c.op) { + ops.push(c.op.clone()); + } + } + + let labels: Vec = cells.iter().map(|c| c.variant.clone()).collect(); + let variants = variants_ordered(&labels); + let style_map: HashMap = variants + .iter() + .enumerate() + .map(|(i, v)| (v.clone(), style_for(v, i))) + .collect(); + + // Build aggregated metric map + let build = |get: &dyn Fn(&Cell) -> Option| -> HashMap<(String, String), f64> { + let mut out = HashMap::new(); + for op in &ops { + for variant in &variants { + let group: Vec<&Cell> = cells + .iter() + .filter(|c| &c.op == op && &c.variant == variant) + .collect(); + if let Some(v) = agg(&group, get) { + out.insert((op.clone(), variant.clone()), v); + } + } + } + out + }; + + let panels = [ + BarPanel { + title: "backup size — stored bytes (delta = increment, fetch = restored)".into(), + ylabel: "size (GB)".into(), + unit: Unit::Gb, + scale: Scale::Linear, + vals: build(&|c| c.size_gb), + }, + BarPanel { + title: "duration — wall-clock of the op (log scale)".into(), + ylabel: "elapsed (s)".into(), + unit: Unit::Sec, + scale: Scale::Log, + vals: build(&|c| c.dur_s), + }, + BarPanel { + title: "CPU during op — peak sampled utilization (>100% = multi-core)".into(), + ylabel: "peak CPU (%)".into(), + unit: Unit::Pct, + scale: Scale::Linear, + vals: build(&|c| c.peak_cpu), + }, + BarPanel { + title: "memory during op — peak resident set (VmHWM)".into(), + ylabel: "peak RSS (MB)".into(), + unit: Unit::Mb, + scale: Scale::Linear, + vals: build(&|c| c.peak_rss_mb), + }, + BarPanel { + title: "virtual memory during op — peak address reservation (VmPeak)".into(), + ylabel: "peak VM (MB)".into(), + unit: Unit::Mb, + scale: Scale::Linear, + vals: build(&|c| c.peak_vm_mb), + }, + ]; + + let height = HEADER_H + panels.len() as u32 * PANEL_H; + let mut pm = Pixmap::new(W, height).context("alloc pixmap")?; + pm.fill(Color::from_rgba8(BG.0, BG.1, BG.2, 255)); + text_at( + &mut pm, + &fonts.bold, + 16.0, + 8.0, + 18.0, + "backup comparison: walrus vs wal-g vs pgbackrest", + FG, + ); + text_at(&mut pm, &fonts.regular, 16.0, 30.0, 12.0, &stamp, MUTED); + draw_hlegend(&mut pm, &variants, &style_map, &fonts); + for (i, panel) in panels.iter().enumerate() { + draw_bar_panel( + &mut pm, + panel, + HEADER_H + i as u32 * PANEL_H, + &ops, + &variants, + &style_map, + &fonts, + ); + } + let png = out.join("backup_compare.png"); + pm.save_png(&png) + .map_err(|e| anyhow::anyhow!("save {png:?}: {e}"))?; + + // --- markdown table ----------------------------------------------------- + let mut md = String::from("# Backup comparison\n"); + for op in &ops { + let rows: Vec<&Cell> = cells.iter().filter(|c| &c.op == op).collect(); + if rows.is_empty() { + continue; + } + md.push_str(&format!("\n### {op}\n\n")); + md.push_str( + "| variant | size_GB | elapsed_s | peak_CPU_% | mean_CPU_% | peak_RSS_MB | peak_VM_MB |\n", + ); + md.push_str("|---|--:|--:|--:|--:|--:|--:|\n"); + for variant in &variants { + let group: Vec<&Cell> = rows + .iter() + .copied() + .filter(|c| &c.variant == variant) + .collect(); + if group.is_empty() { + continue; + } + let cell = |get: &dyn Fn(&Cell) -> Option, p: usize| { + agg(&group, get).map_or("—".to_string(), |v| format!("{v:.p$}")) + }; + md.push_str(&format!( + "| {} | {} | {} | {} | {} | {} | {} |\n", + variant, + cell(&|c| c.size_gb, 2), + cell(&|c| c.dur_s, 1), + cell(&|c| c.peak_cpu, 0), + cell(&|c| c.mean_cpu, 0), + cell(&|c| c.peak_rss_mb, 0), + cell(&|c| c.peak_vm_mb, 0), + )); + } + } + std::fs::write(out.join("ops_summary.md"), &md)?; + + // --- csv export --------------------------------------------------------- + let csv_path = out.join(format!("ops_compare_{stamp}.csv")); + let mut w = csv::Writer::from_path(&csv_path)?; + w.write_record([ + "op", + "variant", + "size_gb", + "elapsed_s", + "peak_cpu_pct", + "mean_cpu_pct", + "peak_rss_mb", + "peak_vm_mb", + ])?; + let num = |v: Option| v.map_or(String::new(), |x| fmt_num((x * 1000.0).round() / 1000.0)); + for op in &ops { + for variant in &variants { + let group: Vec<&Cell> = cells + .iter() + .filter(|c| &c.op == op && &c.variant == variant) + .collect(); + if group.is_empty() { + continue; + } + w.write_record([ + op.clone(), + variant.clone(), + num(agg(&group, |c| c.size_gb)), + num(agg(&group, |c| c.dur_s)), + num(agg(&group, |c| c.peak_cpu)), + num(agg(&group, |c| c.mean_cpu)), + num(agg(&group, |c| c.peak_rss_mb)), + num(agg(&group, |c| c.peak_vm_mb)), + ])?; + } + } + w.flush()?; + + println!( + "wrote {} + ops_summary.md + {}", + png.display(), + csv_path.display() + ); + Ok(()) +} diff --git a/bench/tools/src/bin/sampler.rs b/bench/tools/src/bin/sampler.rs index 0b74ae9..9fe64c2 100644 --- a/bench/tools/src/bin/sampler.rs +++ b/bench/tools/src/bin/sampler.rs @@ -1,17 +1,13 @@ -//! bench-sampler — 1 Hz on-SUT resource sampler +//! bench-sampler, 1 Hz on-SUT resource sampler //! -//! Writes one CSV per metric family, sharing a float-epoch `ts`. Schemas are -//! fixed by the bench contract (consumed by bench-analyze and plot.py alike): +//! Writes one CSV per metric family, sharing `ts`: //! mem.csv: ts,vmpeak_kb,vmsize_kb,vmhwm_kb,vmrss_kb,rssanon_kb,cg_current_bytes,cg_peak_bytes //! cpu.csv: ts,pct_usr,pct_sys,pct_cpu //! wal.csv: ts,wal_bytes //! archive.csv: ts,archived_count,failed_count,ready_backlog,last_archived_age_s //! net.csv: ts,tx_bytes,rx_bytes //! -//! Memory/CPU come from /proc, network from /sys, PG metrics from a long-lived -//! `psql` reused across ticks (result framing via a sentinel row, respawned if -//! it dies). Daemonless archivers (pgbackrest) are followed by --proc-match, -//! which rescans /proc each tick and aggregates over the matching process tree. +//! Reads /proc, /sys, and PostgreSQL via persistent psql //! //! Self-test (no PostgreSQL): //! sleep 30 & bench-sampler --pid $! --iface lo --no-pg \ @@ -31,7 +27,7 @@ use clap::Parser; const SENTINEL: &str = "__SAMPLER_EOT__"; -// Per-tick PG queries; each yields exactly one tuple line, framed by SENTINEL. +// Per-tick PG queries, framed by SENTINEL const PG_QUERIES: [&str; 2] = [ "SELECT pg_wal_lsn_diff(pg_current_wal_lsn(),'0/0');", "SELECT archived_count, failed_count, \ @@ -45,24 +41,22 @@ struct Args { /// PID to sample #[arg(long)] pid: Option, - /// systemd unit; MainPID resolved via systemctl show (auto cgroup too) + /// systemd unit #[arg(long)] unit: Option, - /// do not exit if no PID can be resolved (mem/cpu columns left blank) + /// allow blank mem/cpu when no PID resolves #[arg(long = "no-pid-required")] no_pid_required: bool, - /// aggregate mem/cpu over ALL processes whose comm equals this name, - /// rescanned every tick (daemonless archivers). Excludes --pid/--unit. + /// aggregate mem/cpu over processes with matching comm #[arg(long = "proc-match")] proc_match: Option, - /// archiver shorthand: walg|walrus -> that unit's MainPID, pgbackrest -> - /// proc-match (daemonless). Fills --unit/--proc-match when neither is given. + /// archiver shorthand #[arg(long, value_parser = ["walg", "walrus", "pgbackrest"])] daemon: Option, - /// cgroup v2 dir (default: auto from --unit) + /// cgroup v2 dir #[arg(long)] cgroup: Option, - /// network iface (default: auto default-route iface) + /// network iface #[arg(long)] iface: Option, /// PGDATA path @@ -74,16 +68,16 @@ struct Args { /// sample interval seconds #[arg(long, default_value_t = 1.0)] interval: f64, - /// number of ticks to take (default: until SIGTERM) + /// number of ticks #[arg(long)] duration: Option, - /// psql conninfo (default: local socket, db walbench) + /// psql conninfo #[arg( long, default_value = "host=/var/run/postgresql user=postgres dbname=walbench" )] pg: String, - /// skip wal/archive queries (CSVs get headers only); PG-free self-test + /// skip WAL/archive queries #[arg(long = "no-pg")] no_pg: bool, } @@ -93,13 +87,10 @@ fn clk_tck() -> f64 { if v > 0 { v as f64 } else { 100.0 } } -// Diagnostic events go to stderr (bare). The orchestrator redirects the -// sampler's stderr to sampler.log in the result dir (run.sh/run_op.sh), so -// events land there co-located with the CSVs; standalone runs print to the -// terminal. Pipe through awk for timestamps if wanted; no own log file. +// Diagnostic events go to stderr; drivers redirect it to sampler.log // -------------------------------------------------------------------------- -// CSV sink: header on open, one comma-joined row per tick, flushed each write. +// CSV sink // -------------------------------------------------------------------------- struct CsvSink { fh: BufWriter, @@ -150,9 +141,7 @@ fn resolve_cgroup_path(unit: &str) -> Option { Path::new(&path).is_dir().then_some(path) } -/// Map an archiver token to a sampling target (unit, proc_match): walg/walrus -/// follow their systemd unit's MainPID; pgbackrest is daemonless (PG forks -/// archive-push per segment) so it is followed by proc-match. +/// Map archiver token to sampling target fn daemon_target(daemon: &str) -> (Option, Option) { match daemon { "walg" => (Some("wal-g.service".into()), None), @@ -181,7 +170,7 @@ fn list_pids_by_comm(name: &str) -> Vec { } fn detect_default_iface() -> Option { - // /proc/net/route: Iface col 0, Destination col 1 (00000000 = default). + // /proc/net/route uses 00000000 for default destination let data = fs::read_to_string("/proc/net/route").ok()?; for line in data.lines().skip(1) { let f: Vec<&str> = line.split_whitespace().collect(); @@ -195,7 +184,7 @@ fn detect_default_iface() -> Option { // -------------------------------------------------------------------------- // /proc readers // -------------------------------------------------------------------------- -/// From /proc//status (kB) +/// /proc//status memory fields, kB #[derive(Default, Clone, Copy)] struct MemStats { vmpeak: Option, @@ -225,7 +214,7 @@ impl MemStats { }) } - /// Field-wise sum, missing treated as 0; result always present. + /// Field-wise sum, missing treated as 0 fn add(&mut self, o: MemStats) { self.vmpeak = Some(self.vmpeak.unwrap_or(0) + o.vmpeak.unwrap_or(0)); self.vmsize = Some(self.vmsize.unwrap_or(0) + o.vmsize.unwrap_or(0)); @@ -254,13 +243,12 @@ fn read_proc_status(pid: i32) -> MemStats { out } -/// (utime, stime) in clock ticks from /proc//stat. comm (field 2) may hold -/// spaces/parens, so split after the final ')'. None if the process is gone. +/// utime/stime ticks from /proc//stat fn read_proc_cpu_jiffies(pid: i32) -> Option<(u64, u64)> { let data = fs::read_to_string(format!("/proc/{pid}/stat")).ok()?; let rparen = data.rfind(')')?; let rest: Vec<&str> = data[rparen + 1..].split_whitespace().collect(); - // rest[0]=state; utime is overall field 14 -> index 11, stime 15 -> 12. + // utime field 14 -> index 11, stime field 15 -> index 12 if rest.len() < 13 { return None; } @@ -308,7 +296,7 @@ impl PsqlConn { } fn spawn(&mut self) { - // -A unaligned, -t tuples-only, -q quiet, -X no psqlrc, -F '|' field sep. + // -A unaligned, -t tuples-only, -q quiet, -X no psqlrc let mut child = match Command::new("psql") .args([ "-Atq", @@ -442,7 +430,7 @@ fn parse_pg(lines: &[String]) -> PgRow { out.archived_count = nonempty(parts[0].trim()); out.failed_count = nonempty(parts[1].trim()); let age = parts[2].trim(); - // -1 sentinel => never archived yet => blank. + // -1 sentinel means never archived out.last_archived_age_s = (age != "-1").then(|| nonempty(age)).flatten(); } } @@ -480,8 +468,7 @@ impl Sampler { let outdir = Path::new(&args.outdir); fs::create_dir_all(outdir).with_context(|| format!("mkdir {outdir:?}"))?; - // --daemon is a bench shorthand: fill unit/proc_match from the token - // unless an explicit flag already set one. + // --daemon fills unit/proc_match unless explicit flags did let (mut unit, mut proc_match) = (args.unit.clone(), args.proc_match.clone()); if let Some(d) = &args.daemon { let (u, pm) = daemon_target(d); @@ -587,16 +574,14 @@ impl Sampler { fn sample_mem(&mut self, ts: f64) { let vals = if let Some(pm) = &self.proc_match { - // Sum each metric over the live tree. Per-proc lifetime highs - // (VmPeak/VmHWM) summed over the live set are a lower bound on the - // tree peak; run-level peak is max-over-time downstream. + // Sum live process tree; run-level peak is max-over-time downstream let mut agg = MemStats::default(); let mut found = false; for pid in list_pids_by_comm(pm) { agg.add(read_proc_status(pid)); found = true; } - // found => summed tree; else genuine 0 (async drained + exited). + // No matches means async worker drained + exited if found { agg } else { MemStats::ZERO } } else if let Some(pid) = self.pid { read_proc_status(pid) @@ -635,8 +620,7 @@ impl Sampler { if elapsed > 0.0 { let du = cur.0.saturating_sub(prev.0) as f64; let ds = cur.1.saturating_sub(prev.1) as f64; - // ticks -> CPU-seconds / wall-seconds * 100. May exceed 100 on - // multi-threaded daemons. + // CPU percent may exceed 100 on multi-threaded daemons let u = 100.0 * (du / self.clk_tck) / elapsed; let s = 100.0 * (ds / self.clk_tck) / elapsed; usr = format!("{:.2}", u.max(0.0)); @@ -652,9 +636,7 @@ impl Sampler { } fn sample_cpu_proctree(&mut self, ts: f64) { - // Per-PID cumulative ticks diffed tick-over-tick; a freshly forked PID - // contributes its whole counter (~one interval), a vanished one drops - // out. Only combined pct_cpu is meaningful across a churning set. + // Diff per-PID ticks across churning process set let pm = self.proc_match.clone().unwrap(); let mut cur_map: HashMap = HashMap::new(); for pid in list_pids_by_comm(&pm) { @@ -763,9 +745,9 @@ fn run(args: &Args, sampler: &mut Sampler, stop: &Arc) { next_at += interval; let now = Instant::now(); if next_at <= now { - next_at = now; // fell behind; resync, no burst catch-up + next_at = now; // resync, no burst catch-up } else { - // Interruptible sleep so SIGTERM is honored promptly. + // Keep SIGTERM responsive while !stop.load(Ordering::Relaxed) && Instant::now() < next_at { let remaining = next_at.saturating_duration_since(Instant::now()); std::thread::sleep(remaining.min(Duration::from_millis(100))); @@ -801,7 +783,7 @@ mod tests { #[test] fn parse_pg_never_archived() { - // -1 age sentinel collapses to blank. + // -1 age sentinel becomes blank let r = parse_pg(&["10".into(), "0|0|-1".into()]); assert_eq!(r.last_archived_age_s, None); } diff --git a/bench/tools/src/lib.rs b/bench/tools/src/lib.rs new file mode 100644 index 0000000..65715b9 --- /dev/null +++ b/bench/tools/src/lib.rs @@ -0,0 +1,3 @@ +//! Shared rendering for bench charts + +pub mod viz; diff --git a/bench/tools/src/viz.rs b/bench/tools/src/viz.rs new file mode 100644 index 0000000..9f54ff0 --- /dev/null +++ b/bench/tools/src/viz.rs @@ -0,0 +1,446 @@ +//! Shared chart palette, geometry, raster, and text helpers + +use ab_glyph::{Font, FontRef, PxScale, ScaleFont, point}; +use anyhow::Result; +use tiny_skia::{ + FillRule, LineCap, LineJoin, Paint, Path as SkPath, PathBuilder, Pixmap, Rect, Stroke, + StrokeDash, Transform, +}; + +// ── palette ──────────────────────────────────────────────────────────────── +#[derive(Clone, Copy)] +pub struct Rgb(pub u8, pub u8, pub u8); + +pub const BG: Rgb = Rgb(0x29, 0x25, 0x22); +pub const FLOAT: Rgb = Rgb(0x34, 0x30, 0x2C); +pub const SEL: Rgb = Rgb(0x40, 0x3A, 0x36); +pub const MUTED: Rgb = Rgb(0xC1, 0xA7, 0x8E); +pub const FG: Rgb = Rgb(0xEC, 0xE1, 0xD7); + +/// Per-tool color, falling back by index +pub fn variant_color(variant: &str) -> Option { + match variant { + "walrus" | "walrus-serial" => Some(Rgb(0xFA, 0xFF, 0x69)), + "walg" => Some(Rgb(0xFC, 0x3F, 0x1D)), + "pgbackrest" => Some(Rgb(0x27, 0x68, 0x9D)), + _ => None, + } +} + +pub const FALLBACK: [Rgb; 6] = [ + Rgb(0xA3, 0xA9, 0xCE), + Rgb(0x85, 0xB6, 0x95), + Rgb(0xCF, 0x9B, 0xC2), + Rgb(0x89, 0xB3, 0xB6), + Rgb(0xE4, 0x9B, 0x5D), + Rgb(0xB3, 0x80, 0xB0), +]; + +// ── shared canvas geometry ─────────────────────────────────────────────────── +pub const W: u32 = 1180; +pub const HEADER_H: u32 = 48; + +// ── per-variant style ──────────────────────────────────────────────────────── +#[derive(Clone)] +pub struct Style { + pub color: Rgb, + pub dashed: bool, + pub z: i32, +} + +pub fn style_for(variant: &str, idx: usize) -> Style { + Style { + color: variant_color(variant).unwrap_or(FALLBACK[idx % FALLBACK.len()]), + dashed: variant.ends_with("-serial"), + z: if variant.starts_with("walrus") { 10 } else { 4 }, + } +} + +/// Strip trailing `-b` replica suffix +pub fn variant_of(label: &str) -> String { + if let Some(idx) = label.rfind("-b") { + let suffix = &label[idx + 2..]; + if !suffix.is_empty() && suffix.bytes().all(|b| b.is_ascii_digit()) { + return label[..idx].to_string(); + } + } + label.to_string() +} + +pub fn box_of(label: &str) -> String { + if let Some(idx) = label.rfind("-b") { + let suffix = &label[idx + 2..]; + if !suffix.is_empty() && suffix.bytes().all(|b| b.is_ascii_digit()) { + return suffix.to_string(); + } + } + "0".to_string() +} + +/// Distinct variants, walrus first +pub fn variants_ordered(labels: &[String]) -> Vec { + let mut vs: Vec = labels.iter().map(|l| variant_of(l)).collect(); + vs.sort(); + vs.dedup(); + vs.sort_by_key(|v| (!v.starts_with("walrus"), v.clone())); + vs +} + +pub fn median(xs: &mut [f64]) -> f64 { + xs.sort_by(f64::total_cmp); + let n = xs.len(); + if n == 0 { + 0.0 + } else if n % 2 == 1 { + xs[n / 2] + } else { + (xs[n / 2 - 1] + xs[n / 2]) / 2.0 + } +} + +// ── fonts ──────────────────────────────────────────────────────────────────── +pub struct Fonts { + pub regular: FontRef<'static>, + pub bold: FontRef<'static>, +} + +pub fn load_fonts() -> Result { + let load = |b: &'static [u8]| { + FontRef::try_from_slice(b).map_err(|_| anyhow::anyhow!("embedded font parse failed")) + }; + Ok(Fonts { + regular: load(dejavu::sans::regular())?, + bold: load(dejavu::sans::bold())?, + }) +} + +// ── shapes ───────────────────────────────────────────────────────────────── +pub fn paint_rgb(c: Rgb, a: u8) -> Paint<'static> { + let mut p = Paint::default(); + p.set_color_rgba8(c.0, c.1, c.2, a); + p.anti_alias = true; + p +} + +fn polyline(pts: &[(f32, f32)]) -> Option { + let mut pb = PathBuilder::new(); + let (x0, y0) = *pts.first()?; + pb.move_to(x0, y0); + for &(x, y) in &pts[1..] { + pb.line_to(x, y); + } + pb.finish() +} + +pub fn stroke_poly( + pm: &mut Pixmap, + pts: &[(f32, f32)], + c: Rgb, + a: u8, + width: f32, + dash: Option<[f32; 2]>, +) { + let Some(path) = polyline(pts) else { return }; + let mut stroke = Stroke { + width, + line_cap: LineCap::Round, + line_join: LineJoin::Round, + ..Default::default() + }; + if let Some([on, off]) = dash { + stroke.dash = StrokeDash::new(vec![on, off], 0.0); + } + pm.stroke_path( + &path, + &paint_rgb(c, a), + &stroke, + Transform::identity(), + None, + ); +} + +pub fn fill_poly(pm: &mut Pixmap, pts: &[(f32, f32)], c: Rgb, a: u8) { + let mut pb = PathBuilder::new(); + let Some(&(x0, y0)) = pts.first() else { return }; + pb.move_to(x0, y0); + for &(x, y) in &pts[1..] { + pb.line_to(x, y); + } + pb.close(); + let Some(path) = pb.finish() else { return }; + pm.fill_path( + &path, + &paint_rgb(c, a), + FillRule::Winding, + Transform::identity(), + None, + ); +} + +pub fn fill_rect(pm: &mut Pixmap, x: f32, y: f32, w: f32, h: f32, c: Rgb, a: u8) { + if let Some(r) = Rect::from_xywh(x, y, w, h) { + pm.fill_rect(r, &paint_rgb(c, a), Transform::identity(), None); + } +} + +pub fn stroke_rect(pm: &mut Pixmap, x: f32, y: f32, w: f32, h: f32, c: Rgb, width: f32) { + stroke_poly( + pm, + &[(x, y), (x + w, y), (x + w, y + h), (x, y + h), (x, y)], + c, + 255, + width, + None, + ); +} + +/// Round axis ticks spanning [lo, hi] +pub fn nice_ticks(lo: f64, hi: f64, target: usize) -> Vec { + if hi <= lo || target == 0 { + return vec![lo]; + } + let raw = (hi - lo) / target as f64; + let mag = 10f64.powf(raw.log10().floor()); + let norm = raw / mag; + let step = mag + * if norm < 1.5 { + 1.0 + } else if norm < 3.0 { + 2.0 + } else if norm < 7.0 { + 5.0 + } else { + 10.0 + }; + let mut t = (lo / step).ceil() * step; + let mut out = Vec::new(); + while t <= hi + step * 1e-9 { + out.push(t); + t += step; + } + out +} + +// ── text ───────────────────────────────────────────────────────────────────── +pub fn text_width(font: &FontRef<'static>, px: f32, text: &str) -> f32 { + let sf = font.as_scaled(PxScale::from(px)); + let mut w = 0.0; + let mut prev = None; + for ch in text.chars() { + let g = font.glyph_id(ch); + if let Some(p) = prev { + w += sf.kern(p, g); + } + w += sf.h_advance(g); + prev = Some(g); + } + w +} + +fn text_pixmap(font: &FontRef<'static>, px: f32, text: &str, c: Rgb) -> Option { + let sf = font.as_scaled(PxScale::from(px)); + let w = (text_width(font, px, text).ceil() as u32 + 2).max(1); + let h = ((sf.ascent() - sf.descent()).ceil() as u32 + 2).max(1); + let mut pm = Pixmap::new(w, h)?; + let baseline = sf.ascent() + 1.0; + let data = pm.data_mut(); + let mut caret = 1.0f32; + let mut prev = None; + for ch in text.chars() { + let gid = font.glyph_id(ch); + if let Some(p) = prev { + caret += sf.kern(p, gid); + } + let glyph = gid.with_scale_and_position(PxScale::from(px), point(caret, baseline)); + caret += sf.h_advance(gid); + prev = Some(gid); + let Some(og) = font.outline_glyph(glyph) else { + continue; + }; + let bb = og.px_bounds(); + og.draw(|gx, gy, cov| { + let x = bb.min.x as i32 + gx as i32; + let y = bb.min.y as i32 + gy as i32; + if x < 0 || y < 0 || x as u32 >= w || y as u32 >= h { + return; + } + let a = (cov.clamp(0.0, 1.0) * 255.0).round() as u8; + if a == 0 { + return; + } + let i = ((y as u32 * w + x as u32) * 4) as usize; + // Keep stronger coverage when glyph boxes overlap + if a >= data[i + 3] { + let pre = |v: u8| ((v as u16 * a as u16) / 255) as u8; + data[i] = pre(c.0); + data[i + 1] = pre(c.1); + data[i + 2] = pre(c.2); + data[i + 3] = a; + } + }); + } + Some(pm) +} + +/// Blit premultiplied src onto opaque dst +fn blit(dst: &mut Pixmap, src: &Pixmap, dx: i32, dy: i32, rotate_ccw: bool) { + let (dw, dh) = (dst.width(), dst.height()); + let (sw, sh) = (src.width(), src.height()); + let s = src.data(); + let d = dst.data_mut(); + for sy in 0..sh { + for sx in 0..sw { + let si = ((sy * sw + sx) * 4) as usize; + let a = s[si + 3]; + if a == 0 { + continue; + } + let (rx, ry) = if rotate_ccw { + (sy as i32, (sw - 1 - sx) as i32) + } else { + (sx as i32, sy as i32) + }; + let (px_, py_) = (dx + rx, dy + ry); + if px_ < 0 || py_ < 0 || px_ as u32 >= dw || py_ as u32 >= dh { + continue; + } + let di = ((py_ as u32 * dw + px_ as u32) * 4) as usize; + let inv = (255 - a) as u16; + for k in 0..3 { + d[di + k] = (s[si + k] as u16 + d[di + k] as u16 * inv / 255) as u8; + } + d[di + 3] = 255; + } + } +} + +pub fn text_at( + pm: &mut Pixmap, + font: &FontRef<'static>, + x: f32, + top: f32, + px: f32, + text: &str, + c: Rgb, +) { + if let Some(t) = text_pixmap(font, px, text, c) { + blit(pm, &t, x.round() as i32, top.round() as i32, false); + } +} + +pub fn text_center( + pm: &mut Pixmap, + font: &FontRef<'static>, + cx: f32, + top: f32, + px: f32, + text: &str, + c: Rgb, +) { + if let Some(t) = text_pixmap(font, px, text, c) { + blit( + pm, + &t, + (cx - t.width() as f32 / 2.0).round() as i32, + top.round() as i32, + false, + ); + } +} + +pub fn text_right( + pm: &mut Pixmap, + font: &FontRef<'static>, + right: f32, + cy: f32, + px: f32, + text: &str, + c: Rgb, +) { + if let Some(t) = text_pixmap(font, px, text, c) { + let x = (right - t.width() as f32).round() as i32; + let y = (cy - t.height() as f32 / 2.0).round() as i32; + blit(pm, &t, x, y, false); + } +} + +pub fn text_left_mid( + pm: &mut Pixmap, + font: &FontRef<'static>, + x: f32, + cy: f32, + px: f32, + text: &str, + c: Rgb, +) { + if let Some(t) = text_pixmap(font, px, text, c) { + blit( + pm, + &t, + x.round() as i32, + (cy - t.height() as f32 / 2.0).round() as i32, + false, + ); + } +} + +pub fn text_vert( + pm: &mut Pixmap, + font: &FontRef<'static>, + left: f32, + cy: f32, + px: f32, + text: &str, + c: Rgb, +) { + if let Some(t) = text_pixmap(font, px, text, c) { + // Rotated box is t.height() wide x t.width() tall + let top = (cy - t.width() as f32 / 2.0).round() as i32; + blit(pm, &t, left.round() as i32, top, true); + } +} + +// ── number formatting ───────────────────────────────────────────────────────── +pub fn fmt_num(v: f64) -> String { + if v.is_finite() && v.fract() == 0.0 && v.abs() < 1e15 { + format!("{}", v as i64) + } else { + // Trim trailing zeros from six-place rendering + let s = format!("{v:.6}"); + let s = s.trim_end_matches('0').trim_end_matches('.'); + s.to_string() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn variant_and_box() { + assert_eq!(variant_of("walrus-serial-b0"), "walrus-serial"); + assert_eq!(box_of("walrus-serial-b0"), "0"); + assert_eq!(variant_of("walrus-r1"), "walrus-r1"); // not replica suffix + assert_eq!(box_of("walrus"), "0"); + assert_eq!(variant_of("pgbackrest-b12"), "pgbackrest"); + assert_eq!(box_of("pgbackrest-b12"), "12"); + } + + #[test] + fn median_odd_even() { + assert_eq!(median(&mut [3.0, 1.0, 2.0]), 2.0); + assert_eq!(median(&mut [4.0, 1.0, 3.0, 2.0]), 2.5); + } + + #[test] + fn fmt_num_int_vs_float() { + assert_eq!(fmt_num(42.0), "42"); + assert_eq!(fmt_num(2.5), "2.5"); + } + + #[test] + fn variant_order_walrus_first() { + let v = variants_ordered(&["walg-b0".into(), "walrus-b0".into(), "pgbackrest-b0".into()]); + assert_eq!(v[0], "walrus"); + } +} diff --git a/ci/cross_tool_encryption.sh b/ci/cross_tool_encryption.sh index 4c12a0a..fcfc327 100755 --- a/ci/cross_tool_encryption.sh +++ b/ci/cross_tool_encryption.sh @@ -11,11 +11,11 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" # Exported so the archive_command + restore_command subprocesses inherit it too export WALG_LIBSODIUM_KEY='walrus_test_libsodium_key_32byte' -cross_roundtrip "$WALRUS_BIN" "$WALG_BIN" +cross_roundtrip "$WALRUS_BIN" "$WALG_BIN" "$PGDATA" echo "cross_tool_encryption forward OK" bucket_reset -cross_roundtrip "$WALG_BIN" "$WALRUS_BIN" +cross_roundtrip "$WALG_BIN" "$WALRUS_BIN" "$PGDATA" echo "cross_tool_encryption reverse OK" echo "cross_tool_encryption OK" diff --git a/ci/cross_tool_forward.sh b/ci/cross_tool_forward.sh index 64bc168..a2e4926 100755 --- a/ci/cross_tool_forward.sh +++ b/ci/cross_tool_forward.sh @@ -17,7 +17,7 @@ pgbench -p "$PGPORT" -h "$PGHOST" -i -s 1 postgres psql -p "$PGPORT" -h "$PGHOST" -c "CHECKPOINT" postgres pg_dumpall -p "$PGPORT" -h "$PGHOST" -f "$WORKROOT/dump1.sql" -walrus backup-push +walrus backup-push "$PGDATA" psql -p "$PGPORT" -h "$PGHOST" -c "SELECT pg_switch_wal()" postgres sleep 3 diff --git a/ci/cross_tool_lzma.sh b/ci/cross_tool_lzma.sh index d413b9a..807b84a 100755 --- a/ci/cross_tool_lzma.sh +++ b/ci/cross_tool_lzma.sh @@ -10,11 +10,11 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" # lib.sh respects a pre-set method; the archive_command inlines it too export WALG_COMPRESSION_METHOD=lzma -cross_roundtrip "$WALRUS_BIN" "$WALG_BIN" +cross_roundtrip "$WALRUS_BIN" "$WALG_BIN" "$PGDATA" echo "cross_tool_lzma forward OK" bucket_reset -cross_roundtrip "$WALG_BIN" "$WALRUS_BIN" +cross_roundtrip "$WALG_BIN" "$WALRUS_BIN" "$PGDATA" echo "cross_tool_lzma reverse OK" echo "cross_tool_lzma OK" diff --git a/ci/cross_tool_retention.sh b/ci/cross_tool_retention.sh index 9cb16fd..710fdf0 100755 --- a/ci/cross_tool_retention.sh +++ b/ci/cross_tool_retention.sh @@ -22,7 +22,7 @@ push_three() { local _i for _i in 1 2 3; do psql -p "$PGPORT" -h "$PGHOST" -c "CHECKPOINT" postgres - if [ "$tool" = "$WALRUS_BIN" ]; then walrus backup-push; else walg backup-push "$PGDATA"; fi + if [ "$tool" = "$WALRUS_BIN" ]; then walrus backup-push "$PGDATA"; else walg backup-push "$PGDATA"; fi psql -p "$PGPORT" -h "$PGHOST" -c "SELECT pg_switch_wal()" postgres sleep 1 done diff --git a/ci/cross_tool_stream.sh b/ci/cross_tool_stream.sh new file mode 100755 index 0000000..a6b244a --- /dev/null +++ b/ci/cross_tool_stream.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Cross-tool streaming BASE_BACKUP interop. Both tools take a full backup over a +# replication connection (no PGDATA arg → wal-g's remote/streaming path, walrus's +# BASE_BACKUP path), the other restores and replays, dumps compared. Asserts the +# streamed tar layout (part_NNN.tar + pg_control.tar tee) is mutually readable. +# Full backups only: delta is unavailable in either tool's streaming mode. +# Forward: walrus streams, wal-g restores. Reverse: wal-g streams, walrus restores. +set -euxo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +. "$SCRIPT_DIR/lib.sh" + +# No source arg → cross_roundtrip drives both tools via streaming BASE_BACKUP +cross_roundtrip "$WALRUS_BIN" "$WALG_BIN" +echo "cross_tool_stream forward OK" + +bucket_reset +cross_roundtrip "$WALG_BIN" "$WALRUS_BIN" +echo "cross_tool_stream reverse OK" + +echo "cross_tool_stream OK" diff --git a/ci/delta_sidecar.sh b/ci/delta_sidecar.sh new file mode 100755 index 0000000..e7d0dd4 --- /dev/null +++ b/ci/delta_sidecar.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +# Delta backup via WAL sidecars (WALG_USE_WAL_DELTA). The archiver records a +# `_delta` sidecar per complete 16-segment group; backup-push's delta map +# then folds the whole group instead of re-parsing its raw WAL. Group 0 never +# finalizes (no preceding segment seeds its boundary head, and segment 0 is never +# written), so the first foldable group is 16 — the run must cross ~32 segments +# for `build_delta_map_from_sidecars` to fold rather than fall back to a raw walk. +# +# walrus-only: parent full, spread real heap changes across a full group, take a +# 1-step delta whose map folds the sidecar, then restore the chain + replay. +set -euxo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +. "$SCRIPT_DIR/lib.sh" + +# The archive_command subprocess inherits this from the postmaster's env, so +# wal-push records sidecars (pg_archive_on doesn't inline it). +export WALG_USE_WAL_DELTA=1 + +pg_initdb +pg_archive_on "$WALRUS_BIN" +pg_start + +psql -p "$PGPORT" -h "$PGHOST" -c \ + "CREATE TABLE t (id int primary key, v text) WITH (autovacuum_enabled=false)" postgres +psql -p "$PGPORT" -h "$PGHOST" -c \ + "INSERT INTO t SELECT g, repeat('a', 200) FROM generate_series(1, 2000) g" postgres +psql -p "$PGPORT" -h "$PGHOST" -c "CHECKPOINT" postgres + +# parent full (delta detection off) +export WALG_DELTA_MAX_STEPS=0 +walrus backup-push "$PGDATA" + +# Spread real heap changes across ~40 segments so the complete group 16 +# (segments 16-31) carries changes only its sidecar fold can recover. Each +# pg_switch_wal closes a segment, so the archiver records every one and finalizes +# the group's sidecar on its last segment. +for i in $(seq 1 40); do + psql -p "$PGPORT" -h "$PGHOST" -qc \ + "UPDATE t SET v = repeat(chr(65 + ($i % 26)), 200) WHERE id % 8 = ($i % 8)" postgres + psql -p "$PGPORT" -h "$PGHOST" -qtAc "SELECT pg_switch_wal()" postgres >/dev/null +done +psql -p "$PGPORT" -h "$PGHOST" -c "CHECKPOINT" postgres +pg_dumpall -p "$PGPORT" -h "$PGHOST" -f "$WORKROOT/dump1.sql" + +# Group 16's sidecar uploads only once its last segment (...001F = 31) is +# recorded. Poll for it: its presence proves recording finalized a complete +# group, and is the precondition for the fold below. +sidecar= +for _ in $(seq 1 120); do + sidecar=$(find "$WALG_FILE_PREFIX" -name '000000010000000000000010_delta*' -print -quit) + [ -n "$sidecar" ] && break + sleep 1 +done +[ -n "$sidecar" ] || { + echo "FAIL: group-16 delta sidecar never finalized" + find "$WALG_FILE_PREFIX" -name '*_delta*' || true + exit 1 +} +echo "sidecar: $sidecar" + +# 1-step delta off the parent; its map must fold the sidecar, not raw-walk it +export WALG_DELTA_MAX_STEPS=1 +walrus backup-push "$PGDATA" 2>"$WORKROOT/delta.log" +cat "$WORKROOT/delta.log" +unset WALG_DELTA_MAX_STEPS + +grep -q "delta map:" "$WORKROOT/delta.log" || { echo "FAIL: no delta map built"; exit 1; } +if grep -q "delta sidecars unusable" "$WORKROOT/delta.log"; then + echo "FAIL: fell back to full raw-WAL walk instead of folding sidecars" + exit 1 +fi +if grep -q "000000010000000000000010_delta absent" "$WORKROOT/delta.log"; then + echo "FAIL: group-16 sidecar present but raw-walked instead of folded" + exit 1 +fi + +walrus backup-list | tee "$WORKROOT/list.txt" +grep -E '_D_' "$WORKROOT/list.txt" || { echo "FAIL: no delta backup written"; exit 1; } + +# Restore the chain (parent full + folded-delta increment) and replay. Changes +# in segments 16-31 are reconstructable only through the folded sidecar, so a +# matching dump proves the fold recovered them. +pg_drop +mkdir -p "$PGDATA" +chmod 700 "$PGDATA" +walrus backup-fetch "$PGDATA" LATEST + +pg_recovery_conf "$WALRUS_BIN wal-fetch %f %p" +pg_start +for _ in $(seq 1 60); do + if psql -p "$PGPORT" -h "$PGHOST" -tAc 'SELECT pg_is_in_recovery()' postgres 2>/dev/null | grep -qx f; then + break + fi + sleep 1 +done + +pg_dumpall -p "$PGPORT" -h "$PGHOST" -f "$WORKROOT/dump2.sql" +diff -I '^\\\(restrict\|unrestrict\) ' "$WORKROOT/dump1.sql" "$WORKROOT/dump2.sql" +pg_drop + +echo "delta_sidecar OK" diff --git a/ci/lib.sh b/ci/lib.sh index 411d387..ad3f401 100755 --- a/ci/lib.sh +++ b/ci/lib.sh @@ -113,11 +113,14 @@ pg_replication_on() { # wal_keep_size retains recent segments so START_REPLICATION from the # current segment boundary can't race a checkpoint that recycles it # ("requested WAL segment ... has already been removed") on an otherwise - # idle cluster. + # idle cluster. Sized large enough that a delta push's WAL-walk reaches its + # parent's start segment: the suite runs tests in parallel, so concurrent WAL + # widens the parent->delta gap (observed past 160MB), and archive_mode is off + # here, so a segment recycled before the delta reads it is gone for good. cat >>"$PGDATA/postgresql.conf" </dev/null @@ -241,9 +250,9 @@ cross_delta_roundtrip() { sleep 3 pg_dumpall -p "$PGPORT" -h "$PGHOST" -f "$WORKROOT/dump1.sql" - # 1-step delta off the parent + # 1-step delta off the parent. Deltas read local PGDATA (fs source) export WALG_DELTA_MAX_STEPS=1 - if [ "$writer" = "$WALRUS_BIN" ]; then walrus backup-push; else walg backup-push "$PGDATA"; fi + if [ "$writer" = "$WALRUS_BIN" ]; then walrus backup-push "$PGDATA"; else walg backup-push "$PGDATA"; fi psql -p "$PGPORT" -h "$PGHOST" -c "SELECT pg_switch_wal()" postgres sleep 3 unset WALG_DELTA_MAX_STEPS diff --git a/src/config/mod.rs b/src/config/mod.rs index bbcd23a..85fc965 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -968,4 +968,37 @@ mod tests { assert!(DeltaSettings::from_env().is_err()); } } + + #[tokio::test] + async fn throttle_wraps_reader_when_rate_limited() { + use tokio::io::AsyncReadExt; + let payload = vec![7u8; 64]; + let mk = || -> crate::compression::AsyncReader { + Box::pin(std::io::Cursor::new(payload.clone())) + }; + // non-zero limits take the RateLimited-wrapping arm; bytes pass through + let s = Settings { + network_rate_limit: 1 << 20, + disk_rate_limit: 1 << 20, + ..Default::default() + }; + for mut r in [s.throttle_network(mk()), s.throttle_disk(mk())] { + let mut got = Vec::new(); + r.read_to_end(&mut got).await.unwrap(); + assert_eq!(got, payload); + } + } + + #[test] + fn s3_credentials_incomplete_static_is_error() { + // access key without its secret is a hard error, never a silent IMDS + // fallback + let _g = EnvGuard::new(&[ + ("AWS_ACCESS_KEY_ID", Some("AKIAEXAMPLE")), + ("AWS_ACCESS_KEY", None), + ("AWS_SECRET_ACCESS_KEY", None), + ("AWS_SECRET_KEY", None), + ]); + assert!(s3_credentials(None).is_err()); + } } diff --git a/src/crypto/libsodium.rs b/src/crypto/libsodium.rs index 63ef42a..980fd50 100644 --- a/src/crypto/libsodium.rs +++ b/src/crypto/libsodium.rs @@ -582,4 +582,131 @@ mod tests { assert!(t.apply(&b64).is_ok()); assert!(t.apply("aGVsbG8=").is_err()); } + + #[test] + fn crypter_reports_its_name() { + assert_eq!(LibsodiumCrypter::new(key()).name(), "libsodium"); + } + + #[test] + fn key_transform_from_name_maps_aliases_and_rejects_unknown() { + assert_eq!(KeyTransform::from_name("").unwrap(), KeyTransform::None); + assert_eq!(KeyTransform::from_name("none").unwrap(), KeyTransform::None); + // case-insensitive + assert_eq!(KeyTransform::from_name("HEX").unwrap(), KeyTransform::Hex); + assert_eq!( + KeyTransform::from_name("Base64").unwrap(), + KeyTransform::Base64 + ); + assert!(KeyTransform::from_name("rot13").is_err()); + } + + #[test] + fn from_path_reads_trims_and_validates_key_file() { + let dir = tempfile::tempdir().unwrap(); + let key_file = dir.path().join("key"); + // surrounding whitespace is trimmed before the none-transform padding + std::fs::write(&key_file, b" 0123456789012345678901234567 \n").unwrap(); + let c = + LibsodiumCrypter::from_path(key_file.to_str().unwrap(), KeyTransform::None).unwrap(); + assert_eq!(&c.key[..28], b"0123456789012345678901234567"); + + // missing file surfaces a read error + assert!( + LibsodiumCrypter::from_path( + dir.path().join("absent").to_str().unwrap(), + KeyTransform::None + ) + .is_err() + ); + + // non-UTF-8 contents rejected before transform + let bad = dir.path().join("bin"); + std::fs::write(&bad, [0xff, 0xfe, 0x00]).unwrap(); + assert!(LibsodiumCrypter::from_path(bad.to_str().unwrap(), KeyTransform::None).is_err()); + } + + /// Yields `data` verbatim, then errors on the next poll. Drives the + /// inner-read error and EOF branches of the secretstream readers + struct FailAfter { + data: Vec, + pos: usize, + } + + impl AsyncRead for FailAfter { + fn poll_read( + mut self: Pin<&mut Self>, + _cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + if self.pos < self.data.len() { + let n = buf.remaining().min(self.data.len() - self.pos); + let start = self.pos; + buf.put_slice(&self.data[start..start + n]); + self.pos += n; + return Poll::Ready(Ok(())); + } + Poll::Ready(Err(std::io::Error::other("boom"))) + } + } + + #[tokio::test] + async fn encrypt_propagates_inner_read_error() { + let c = LibsodiumCrypter::new(key()); + let mut enc = c.encrypt_reader(Box::pin(FailAfter { + data: Vec::new(), + pos: 0, + })); + let mut out = Vec::new(); + // header drains first, then the inner error surfaces + assert!(enc.read_to_end(&mut out).await.is_err()); + } + + #[tokio::test] + async fn decrypt_errors_on_eof_before_header() { + let c = LibsodiumCrypter::new(key()); + let mut dec = c.decrypt_reader(Box::pin(Cursor::new(Vec::new()))); + let mut out = Vec::new(); + assert!(dec.read_to_end(&mut out).await.is_err()); + } + + #[tokio::test] + async fn decrypt_propagates_inner_error_after_header() { + let c = LibsodiumCrypter::new(key()); + // 24 arbitrary header bytes init the pull stream; the next read errors + let mut dec = c.decrypt_reader(Box::pin(FailAfter { + data: vec![0u8; HEADER_BYTES], + pos: 0, + })); + let mut out = Vec::new(); + assert!(dec.read_to_end(&mut out).await.is_err()); + } + + #[tokio::test] + async fn decrypt_errors_on_header_only_stream() { + // a valid header followed by no chunk at all: ends without a FINAL tag + let plain: &[u8] = &[]; + let c = LibsodiumCrypter::new(key()); + let mut enc = c.encrypt_reader(Box::pin(Cursor::new(plain.to_vec()))); + let mut ct = Vec::new(); + enc.read_to_end(&mut ct).await.unwrap(); + let header_only = ct[..HEADER_BYTES].to_vec(); + let mut dec = c.decrypt_reader(Box::pin(Cursor::new(header_only))); + let mut out = Vec::new(); + assert!(dec.read_to_end(&mut out).await.is_err()); + } + + #[tokio::test] + async fn decrypt_errors_on_sub_abytes_tail() { + // header + a tail shorter than the per-chunk overhead can't authenticate + let c = LibsodiumCrypter::new(key()); + let mut enc = c.encrypt_reader(Box::pin(Cursor::new(Vec::new()))); + let mut ct = Vec::new(); + enc.read_to_end(&mut ct).await.unwrap(); + let mut short = ct[..HEADER_BYTES].to_vec(); + short.extend_from_slice(&ct[HEADER_BYTES..HEADER_BYTES + 4]); // 4 < ABYTES + let mut dec = c.decrypt_reader(Box::pin(Cursor::new(short))); + let mut out = Vec::new(); + assert!(dec.read_to_end(&mut out).await.is_err()); + } } diff --git a/src/daemon/mod.rs b/src/daemon/mod.rs index ee886db..1e0c0b2 100644 --- a/src/daemon/mod.rs +++ b/src/daemon/mod.rs @@ -245,4 +245,67 @@ mod tests { let p = resolve_pgdata_path("000000010000000000000001", None, true); assert_eq!(p, Path::new("000000010000000000000001")); } + + fn test_daemon(store: &Path, push_timeout: Duration) -> Arc { + let settings = Settings { + storage: crate::config::StorageSettings::Fs { + path: store.to_string_lossy().into(), + }, + compression: crate::compression::Method::None, + ..Default::default() + }; + let storage: DynStorage = Arc::new(crate::storage::fs::FsStorage::new(store).unwrap()); + Arc::new(Daemon { + uploader: Arc::new(Uploader::new(Arc::new(settings), storage)), + pgdata: None, + push_timeout, + }) + } + + #[tokio::test] + async fn dispatch_rejects_unsupported_message_type() { + let dir = tempfile::tempdir().unwrap(); + let d = test_daemon(&dir.path().join("store"), DEFAULT_PUSH_TIMEOUT); + // a response-only type is never a valid request + assert!(dispatch(MessageType::Ok, Vec::new(), &d).await.is_err()); + } + + #[tokio::test] + async fn wal_fetch_requires_two_args() { + let dir = tempfile::tempdir().unwrap(); + let d = test_daemon(&dir.path().join("store"), DEFAULT_PUSH_TIMEOUT); + let body = protocol::encode_args(&["only-one-arg"]).unwrap(); + assert!(dispatch(MessageType::WalFetch, body, &d).await.is_err()); + } + + #[tokio::test] + async fn wal_fetch_missing_archive_maps_to_non_existence() { + let dir = tempfile::tempdir().unwrap(); + let d = test_daemon(&dir.path().join("store"), DEFAULT_PUSH_TIMEOUT); + let dst = dir.path().join("dst-seg"); + let body = + protocol::encode_args(&["000000010000000000000099", dst.to_str().unwrap()]).unwrap(); + let resp = dispatch(MessageType::WalFetch, body, &d).await.unwrap(); + assert_eq!(resp, MessageType::ArchiveNonExistence); + } + + #[tokio::test] + async fn wal_push_with_zero_timeout_skips_deadline() { + let dir = tempfile::tempdir().unwrap(); + let store = dir.path().join("store"); + // zero timeout takes the no-deadline branch + let d = test_daemon(&store, Duration::ZERO); + let seg = "000000010000000000000001"; + let seg_path = dir.path().join(seg); + std::fs::write(&seg_path, vec![0u8; 32]).unwrap(); + let resp = dispatch( + MessageType::WalPush, + seg_path.to_str().unwrap().as_bytes().to_vec(), + &d, + ) + .await + .unwrap(); + assert_eq!(resp, MessageType::Ok); + assert!(store.join(crate::pg::WAL_FOLDER).join(seg).exists()); + } } diff --git a/src/daemon/protocol.rs b/src/daemon/protocol.rs index 2e41560..710bf8a 100644 --- a/src/daemon/protocol.rs +++ b/src/daemon/protocol.rs @@ -156,4 +156,68 @@ mod tests { let args = parse_args(&body).unwrap(); assert_eq!(args, vec!["000000010000000000000001", "/dst/path"]); } + + #[test] + fn from_byte_maps_all_types_and_rejects_unknown() { + assert_eq!(MessageType::from_byte(b'C'), Some(MessageType::Check)); + assert_eq!(MessageType::from_byte(b'O'), Some(MessageType::Ok)); + assert_eq!(MessageType::from_byte(b'E'), Some(MessageType::Error)); + assert_eq!( + MessageType::from_byte(b'N'), + Some(MessageType::ArchiveNonExistence) + ); + assert_eq!(MessageType::from_byte(b'F'), Some(MessageType::WalPush)); + assert_eq!(MessageType::from_byte(b'f'), Some(MessageType::WalFetch)); + assert_eq!(MessageType::from_byte(b'?'), None); + } + + #[tokio::test] + async fn read_message_rejects_length_below_header() { + // total_len=2 is < the 3-byte header, so body math would underflow + let bytes = vec![b'O', 0x00, 0x02]; + let mut cur = Cursor::new(bytes); + assert!(read_message(&mut cur).await.is_err()); + } + + #[tokio::test] + async fn read_message_rejects_unknown_type_byte() { + let bytes = vec![b'?', 0x00, 0x03]; + let mut cur = Cursor::new(bytes); + assert!(read_message(&mut cur).await.is_err()); + } + + #[tokio::test] + async fn write_message_rejects_oversized_body() { + // one arg pushes total past u16::MAX + let big = "x".repeat(u16::MAX as usize); + let mut buf: Vec = Vec::new(); + assert!( + write_message(&mut buf, MessageType::WalPush, &[&big]) + .await + .is_err() + ); + } + + #[test] + fn encode_args_rejects_too_many_and_too_long() { + let many: Vec<&str> = vec!["x"; 256]; + assert!(encode_args(&many).is_err()); + let long = "y".repeat(u16::MAX as usize + 1); + assert!(encode_args(&[&long]).is_err()); + } + + #[test] + fn parse_args_empty_body_is_no_args() { + assert!(parse_args(&[]).unwrap().is_empty()); + } + + #[test] + fn parse_args_rejects_truncation_and_trailing() { + // count=2 but no length bytes follow + assert!(parse_args(&[2]).is_err()); + // declares a 5-byte arg with only 1 byte present + assert!(parse_args(&[1, 0, 5, b'a']).is_err()); + // a valid 1-byte arg followed by a stray trailing byte + assert!(parse_args(&[1, 0, 1, b'a', b'x']).is_err()); + } } diff --git a/src/daemon/uploader.rs b/src/daemon/uploader.rs index 315863e..0d2ae24 100644 --- a/src/daemon/uploader.rs +++ b/src/daemon/uploader.rs @@ -395,6 +395,83 @@ mod tests { assert!(!archived(dir, &segs[0])); } + #[tokio::test] + async fn non_segment_file_pushed_straight_through() { + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + let up = uploader(dir, 2); + // a .history file isn't a segment: bypasses look-ahead/dedup + let hist = dir.join("00000002.history"); + std::fs::write(&hist, b"1\t0/3000000\t\n").unwrap(); + up.wal_push(&hist).await.unwrap(); + let archived = dir + .join("store") + .join(crate::pg::WAL_FOLDER) + .join("00000002.history"); + assert_eq!(std::fs::read(&archived).unwrap(), b"1\t0/3000000\t\n"); + } + + #[tokio::test] + async fn foreground_joins_inflight_upload() { + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + let segs = seed_segments(dir, 1); + let up = uploader(dir, 1); // no look-ahead + // Pre-register the segment as in-flight (no driver); the foreground + // wal_push must join this shared future rather than start a duplicate + { + let fut = up.make_upload(dir.join(segs[0].format())); + up.state.lock().unwrap().inflight.insert(segs[0], fut); + } + up.wal_push(&dir.join(segs[0].format())).await.unwrap(); + assert!(archived(dir, &segs[0])); + } + + #[tokio::test] + async fn scan_ready_missing_dir_and_non_ready_entries() { + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + let after = SegmentName { + timeline: 1, + log_id: 0, + seg_no: 0, + }; + // no archive_status/ dir yet -> empty + assert!(scan_ready(dir, after).await.is_empty()); + // a non-.ready entry is ignored + let status = dir.join("archive_status"); + std::fs::create_dir_all(&status).unwrap(); + std::fs::write(status.join("000000010000000000000001.done"), b"").unwrap(); + std::fs::write(status.join("000000010000000000000002.ready"), b"").unwrap(); + let got = scan_ready(dir, after).await; + assert_eq!(got.len(), 1, "only the .ready marker is a candidate"); + assert_eq!(got[0].seg_no, 2); + } + + #[test] + fn mark_done_evicts_oldest_past_cap() { + let mut st = State { + inflight: HashMap::new(), + done: HashSet::new(), + done_order: VecDeque::new(), + }; + let seg = |n| SegmentName { + timeline: 1, + log_id: 0, + seg_no: n, + }; + st.mark_done(seg(1), 2); + st.mark_done(seg(2), 2); + st.mark_done(seg(3), 2); // evicts seg 1 + assert!(!st.done.contains(&seg(1)), "oldest evicted at cap"); + assert!(st.done.contains(&seg(2))); + assert!(st.done.contains(&seg(3))); + assert_eq!(st.done_order.len(), 2); + // re-marking an existing entry is a no-op (no spurious growth) + st.mark_done(seg(3), 2); + assert_eq!(st.done_order.len(), 2); + } + #[tokio::test] async fn lookahead_capped_by_cumulative_inflight() { let tmp = tempfile::tempdir().unwrap(); diff --git a/src/main.rs b/src/main.rs index bd7e004..e2a10d9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -53,4 +53,4 @@ fn cap_malloc_arenas(n: usize) { } #[cfg(not(all(target_os = "linux", target_env = "gnu")))] -fn cap_malloc_arenas() {} +fn cap_malloc_arenas(_: usize) {} diff --git a/src/pg/backup/copy.rs b/src/pg/backup/copy.rs index 8b42285..faacd45 100644 --- a/src/pg/backup/copy.rs +++ b/src/pg/backup/copy.rs @@ -191,6 +191,8 @@ async fn collect_wal_keys( #[cfg(test)] mod tests { + use std::num::NonZeroU64; + use super::*; use crate::pg::backup::{ BackupSentinelDto, BackupSentinelDtoV2, format_backup_name, sentinel_key, @@ -215,8 +217,8 @@ mod tests { async fn seed_backup(store: &DynStorage, name: &str, start_lsn: u64, finish_lsn: u64) { let v2 = BackupSentinelDtoV2 { sentinel: BackupSentinelDto { - backup_start_lsn: Some(start_lsn), - backup_finish_lsn: Some(finish_lsn), + backup_start_lsn: NonZeroU64::new(start_lsn), + backup_finish_lsn: NonZeroU64::new(finish_lsn), pg_version: 160003, ..Default::default() }, diff --git a/src/pg/backup/delete.rs b/src/pg/backup/delete.rs index db13f00..f34d386 100644 --- a/src/pg/backup/delete.rs +++ b/src/pg/backup/delete.rs @@ -108,8 +108,8 @@ async fn fetch_record(storage: &DynStorage, name: &str) -> Result name: name.to_string(), timeline, start_seg_no, - start_lsn: v2.sentinel.backup_start_lsn.unwrap_or(0), - finish_lsn: v2.sentinel.backup_finish_lsn.unwrap_or(0), + start_lsn: v2.sentinel.backup_start_lsn.map_or(0, |l| l.get()), + finish_lsn: v2.sentinel.backup_finish_lsn.map_or(0, |l| l.get()), start_time: v2.start_time, is_permanent: v2.is_permanent, increment_from: v2.sentinel.increment_from.clone(), diff --git a/src/pg/backup/delta.rs b/src/pg/backup/delta.rs index 2240a10..4763df5 100644 --- a/src/pg/backup/delta.rs +++ b/src/pg/backup/delta.rs @@ -22,9 +22,11 @@ //! RelFileNode. Block numbers in the delta map are global (segment id × //! `BLOCKS_IN_REL_FILE` + intra-segment offset) -use std::collections::{BTreeMap, BTreeSet, HashSet}; +use std::collections::{BTreeMap, HashSet}; use std::io::{self, Read}; +use std::path::{Path, PathBuf}; use std::sync::Arc; +use std::time::Duration; use anyhow::{Context, Result, anyhow}; use roaring::RoaringBitmap; @@ -32,16 +34,20 @@ use thiserror::Error; use tokio_util::io::SyncIoBridge; use crate::compression; +use crate::concurrency::BoundedTasks; use crate::pg::backup::fetch::fetch_sentinel; use crate::pg::backup::wal_delta::{ WAL_FILES_IN_DELTA, delta_group_name, delta_group_no, delta_storage_key, seg_name_from_global, }; -use crate::pg::backup::{BackupSentinelDtoV2, increment, name_from_sentinel_key}; +use crate::pg::backup::{BackupSentinelDtoV2, format_pg_lsn, increment, name_from_sentinel_key}; use crate::pg::wal::segment::{SegmentName, wal_segment_size}; use crate::pg::walparser::{ - BlockLocation, ParsePageError, RelFileNode, WalParser, extract_locations_from_wal_file, + BlockLocation, ParsePageError, RelFileNode, SegmentBoundary, WalParser, + extract_block_locations, extract_locations_from_wal_file, parse_record_from_bytes, + walk_segment_locations, }; -use crate::storage::DynStorage; +use crate::retry::{RetryPolicy, with_retry}; +use crate::storage::{DynStorage, StorageError}; pub const PG_PAGE_SIZE: u64 = 8192; /// PG's per-file size cap before splitting into `.` segments @@ -100,11 +106,19 @@ impl PagedFileDeltaMap { } } - /// Return the bitmap of changed blocks for a paged-file path. Returns + /// Per-rel set union of another map. Disjoint LSN sub-ranges of one delta + /// (eg summaries + a raw-walked gap) compose by changed-block union + pub fn merge(&mut self, other: PagedFileDeltaMap) { + for (rel, blocks) in other.by_rel { + *self.by_rel.entry(rel).or_default() |= blocks; + } + } + + /// Return the changed blocks for a paged-file path, ascending. Returns /// `None` if the rel isn't in the map (file unchanged). /// Blocks are returned in *segment-relative* offsets (0..BLOCKS_IN_REL_FILE) /// for the segment id derived from the trailing `.` of `path` - pub fn blocks_for(&self, path: &str) -> Result>, DeltaError> { + pub fn blocks_for(&self, path: &str) -> Result>, DeltaError> { let rel = get_rel_file_node_from(path)?; let Some(blocks) = self.by_rel.get(&rel) else { return Ok(None); @@ -112,7 +126,7 @@ impl PagedFileDeltaMap { let seg_id = get_rel_file_id_from(path)?; let lo = seg_id as u32 * BLOCKS_IN_REL_FILE; let hi = lo.saturating_add(BLOCKS_IN_REL_FILE); - let shifted: BTreeSet = blocks.range(lo..hi).map(|b| b - lo).collect(); + let shifted: Vec = blocks.range(lo..hi).map(|b| b - lo).collect(); Ok(Some(shifted)) } @@ -342,9 +356,14 @@ pub async fn configure_delta_parent( start_lsn: effective_v2 .sentinel .backup_start_lsn - .ok_or_else(|| anyhow!("parent BackupStartLSN missing after revalidation"))?, + .ok_or_else(|| anyhow!("parent BackupStartLSN missing after revalidation"))? + .get(), timeline, - finish_lsn: effective_v2.sentinel.backup_finish_lsn.unwrap_or(start_lsn), + finish_lsn: effective_v2 + .sentinel + .backup_finish_lsn + .unwrap_or(start_lsn) + .get(), increment_full_name: effective_v2 .sentinel .increment_full_name @@ -465,9 +484,15 @@ async fn find_by_user_data( /// groups (O(touched relations)) and parses only the trailing partial group's /// raw WAL. Mirrors wal-g `getDeltaMap`. /// -/// Falls back to a full raw-WAL walk if any sidecar is missing — wal-g hard -/// errors here, but the fallback keeps buckets archived without -/// `WALG_USE_WAL_DELTA` working unchanged +/// A missing sidecar for a complete group is raw-walked in place (one group's +/// reparse); only an unreadable sidecar or raw-walk failure falls back to a full +/// raw-WAL walk of the range — wal-g hard errors here, but the fallback keeps +/// buckets archived without `WALG_USE_WAL_DELTA` working unchanged +/// +/// `wal_dir` is the local `pg_wal` when the push reads a local data dir; raw +/// segments are served from there (uncompressed, no S3 round-trip), falling +/// back to the archive only for segments PG has already recycled. `None` for a +/// remote replication source, which has no local WAL pub async fn build_delta_map_from_wal( settings: &crate::config::Settings, storage: &DynStorage, @@ -475,6 +500,7 @@ pub async fn build_delta_map_from_wal( start_lsn: u64, end_lsn: u64, compression: compression::Method, + wal_dir: Option<&Path>, ) -> Result { if end_lsn <= start_lsn { return Ok(PagedFileDeltaMap::new()); @@ -486,6 +512,7 @@ pub async fn build_delta_map_from_wal( start_lsn, end_lsn, compression, + wal_dir, ) .await { @@ -493,7 +520,9 @@ pub async fn build_delta_map_from_wal( Err(e) => { tracing::warn!( target = "backup_push", - "delta sidecars unusable ({e:#}); re-parsing raw WAL [{start_lsn:X}, {end_lsn:X})", + "delta sidecars unusable ({e:#}); re-parsing raw WAL [{}, {})", + format_pg_lsn(start_lsn), + format_pg_lsn(end_lsn), ); build_delta_map_from_wal_full( settings, @@ -502,16 +531,33 @@ pub async fn build_delta_map_from_wal( start_lsn, end_lsn, compression, + wal_dir, ) .await } } } -/// Sidecar-driven build: delta files for whole groups + a raw-WAL walk of the -/// final partial group, seeded from the last sidecar's parser state so records -/// crossing the group boundary stitch correctly. Any missing sidecar errors so -/// the caller can fall back +/// Sidecar-driven build: a raw-WAL walk of the leading partial group, delta +/// files for the whole groups between, and a raw-WAL walk of the trailing +/// partial group. One `WalParser` threads across every group so a sidecar-less +/// group's raw walk stitches its leading boundary record from the prior group's +/// trailing head; a fold adopts the sidecar's own saved parser, authoritative +/// whichever path produced the previous group. +/// +/// A complete group whose sidecar is absent is raw-walked rather than erroring +/// the whole range. The archiver can't finalize a group whose preceding segment +/// it never recorded (no prev_head to seed) — the first complete group after a +/// recording start that lands on a group boundary — so that one object is +/// legitimately missing; walking it raw costs one group's reparse instead of a +/// full-range fallback. Other errors (corrupt/undecodable sidecar) still +/// propagate so the caller falls back to the full walk. +/// +/// `start_lsn` lands mid-group, so its group's sidecar would cover pre-`start_lsn` +/// segments the parent full never archived (the group never finalized, so no +/// object exists). First usable sidecar is the next group-aligned boundary; the +/// leading partial is walked raw, mirroring wal-g `getDeltaMap` which uses a +/// delta file only for a segment beginning a complete in-range group async fn build_delta_map_from_sidecars( settings: &crate::config::Settings, storage: &DynStorage, @@ -519,44 +565,90 @@ async fn build_delta_map_from_sidecars( start_lsn: u64, end_lsn: u64, compression: compression::Method, + wal_dir: Option<&Path>, ) -> Result { let seg_size = wal_segment_size(); let n = WAL_FILES_IN_DELTA; - let first_used_delta = delta_group_no(lsn_to_seg(start_lsn, seg_size)); + let start_seg = lsn_to_seg(start_lsn, seg_size); let first_not_used_delta = delta_group_no(lsn_to_seg(end_lsn, seg_size)); if first_not_used_delta < n { - anyhow::bail!("range [{start_lsn:X}, {end_lsn:X}) has no complete delta group ahead of it"); + anyhow::bail!( + "range [{}, {}) has no complete delta group ahead of it", + format_pg_lsn(start_lsn), + format_pg_lsn(end_lsn) + ); } let last_complete_group = first_not_used_delta - n; + // First group-aligned boundary at/after start_seg; leading partial walked raw + let lead_group = delta_group_no(start_seg); + let first_complete = if start_seg == lead_group { + lead_group + } else { + lead_group + n + }; + if first_complete > last_complete_group { + anyhow::bail!( + "range [{}, {}) spans no complete delta group", + format_pg_lsn(start_lsn), + format_pg_lsn(end_lsn) + ); + } + + let ctx = WalWalkCtx { + settings, + storage, + timeline, + seg_size, + compression, + wal_dir, + }; let mut delta = PagedFileDeltaMap::new(); - // Complete groups strictly before the last - let mut g = first_used_delta; - while g < last_complete_group { + // Threaded across every group: a sidecar-less group's raw walk stitches its + // leading boundary record from the prior group's trailing head; a fold + // replaces it with the sidecar's self-contained saved parser + let mut parser = WalParser::new(); + + // Leading partial group: raw WAL from start_seg to the first aligned group. + // Records attribute by start position, so this and the first sidecar partition + // cleanly + if start_seg < first_complete { + walk_segments_pipelined(&ctx, start_seg, first_complete - 1, &mut parser, &mut delta) + .await?; + } + + // Every complete group: fold its sidecar when present, else raw-walk the + // group (the archiver leaves no sidecar for a group it started recording on a + // boundary, with no preceding segment to seed prev_head) + let mut g = first_complete; + while g <= last_complete_group { let name = delta_group_name(timeline, g, seg_size); - delta = fold_sidecar_into_map(settings, storage, &name, compression, delta) + let key = delta_storage_key(&name, compression); + if storage + .exists(&key) .await - .with_context(|| format!("delta sidecar {name}"))? - .0; + .with_context(|| format!("stat {key}"))? + { + let (d, p) = fold_sidecar_into_map(settings, storage, &name, compression, delta) + .await + .with_context(|| format!("delta sidecar {name}"))?; + delta = d; + parser = p; + } else { + tracing::info!( + target = "backup_push", + "delta sidecar {name} absent; raw-walking group" + ); + walk_segments_pipelined(&ctx, g, g + n - 1, &mut parser, &mut delta).await?; + } g += n; } - // Last complete group: its locations + parser seed for the tail walk - let last_name = delta_group_name(timeline, last_complete_group, seg_size); - let (d, mut parser) = fold_sidecar_into_map(settings, storage, &last_name, compression, delta) - .await - .with_context(|| format!("delta sidecar {last_name}"))?; - delta = d; - // Trailing partial group: raw WAL from the group start up to end_lsn + // Trailing partial group: raw WAL from the group start up to end_lsn, seeded + // from the last complete group's trailing head let tail_first = first_not_used_delta; let tail_last = lsn_to_seg(end_lsn.saturating_sub(1), seg_size); - for s in tail_first..=tail_last { - let name = seg_name_from_global(timeline, s, seg_size).format(); - let locations = fetch_and_parse_segment(settings, storage, &name, compression, &mut parser) - .await - .with_context(|| format!("tail wal segment {name}"))?; - delta.add_locations(locations); - } + walk_segments_pipelined(&ctx, tail_first, tail_last, &mut parser, &mut delta).await?; Ok(delta) } @@ -585,7 +677,10 @@ async fn fold_sidecar_into_map( } /// Sync side of [`fold_sidecar_into_map`]: read tuples until the all-zero -/// terminator (or EOF), then the parser state +/// terminator, then the parser state. EOF before the terminator means a +/// truncated sidecar (interrupted upload/finalization) — error rather than +/// accept a partial map and a bogus empty parser, so the caller falls back to +/// a complete raw-WAL walk fn fold_sidecar_stream( decoded: compression::AsyncReader, mut map: PagedFileDeltaMap, @@ -596,7 +691,7 @@ fn fold_sidecar_stream( match r.read_exact(&mut buf) { Ok(()) => {} Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => { - return Ok((map, WalParser::new())); + anyhow::bail!("sidecar truncated: EOF before terminal tuple") } Err(e) => return Err(anyhow::Error::from(e).context("read sidecar tuple")), } @@ -615,7 +710,16 @@ fn fold_sidecar_stream( } /// Full raw-WAL walk of `[start_lsn, end_lsn)`: parse every segment. Fallback -/// when sidecars are absent; O(WAL volume). Bad segments are skipped (logged) +/// when sidecars are absent; O(WAL volume). A missing or corrupt segment errors: +/// the whole range is required WAL, so the caller takes a full backup rather than +/// recording a delta that silently omits the skipped segment's pages +/// +/// Each segment parses independently with a fresh parser, fanned across cores, +/// so the changed-block walk scales past the single-thread CPU bound the serial +/// walk hit. Records crossing a segment boundary are stitched from the saved +/// head/tail fragments. Degrades to the serial threaded walk when a record +/// spans more than one segment (longer than a segment), which the per-segment +/// model can't represent async fn build_delta_map_from_wal_full( settings: &crate::config::Settings, storage: &DynStorage, @@ -623,75 +727,317 @@ async fn build_delta_map_from_wal_full( start_lsn: u64, end_lsn: u64, compression: compression::Method, + wal_dir: Option<&Path>, ) -> Result { let seg_size = wal_segment_size(); - let mut delta = PagedFileDeltaMap::new(); - let mut parser = WalParser::new(); if end_lsn <= start_lsn { - return Ok(delta); + return Ok(PagedFileDeltaMap::new()); } let first_seg = lsn_to_seg(start_lsn, seg_size); let last_seg = lsn_to_seg(end_lsn.saturating_sub(1), seg_size); - for seg in first_seg..=last_seg { - let name = seg_name_from_global(timeline, seg, seg_size).format(); - let locations = - match fetch_and_parse_segment(settings, storage, &name, compression, &mut parser).await - { - Ok(l) => l, - Err(e) => { - tracing::warn!(target = "backup_push", "segment {name}: {e:#}; skipping"); - continue; - } - }; - delta.add_locations(locations); + let ctx = WalWalkCtx { + settings, + storage, + timeline, + seg_size, + compression, + wal_dir, + }; + if let Some(delta) = parse_segments_parallel(&ctx, first_seg, last_seg).await? { + return Ok(delta); } + tracing::warn!( + target = "backup_push", + "parallel reparse hit a record spanning >1 segment; serial walk [{first_seg}, {last_seg}]", + ); + let mut delta = PagedFileDeltaMap::new(); + let mut parser = WalParser::new(); + walk_segments_pipelined(&ctx, first_seg, last_seg, &mut parser, &mut delta).await?; Ok(delta) } +/// Shared, `'static` slice of [`WalWalkCtx`] for the spawned fetch+parse tasks +struct SegFetch { + settings: crate::config::Settings, + storage: DynStorage, + timeline: u32, + seg_size: u64, + compression: compression::Method, + wal_dir: Option, +} + +/// One segment's parse result, tagged by its offset from `first_seg` so the +/// completion handler can place fragments in segment order for boundary stitch +struct SegOut { + rel: usize, + result: Result<(Vec, SegmentBoundary)>, +} + +/// Parse `[first_seg, last_seg]` raw WAL concurrently: fetch + parse each +/// segment independently on a bounded fan-out, union the per-segment changed +/// blocks, then stitch the record crossing each segment boundary back together +/// from the saved head/tail fragments. +/// +/// Returns `Ok(None)` when the range holds a record spanning more than one +/// segment boundary — the per-segment model can't reconstruct it, so the caller +/// re-runs the threaded serial walk. A missing or corrupt segment is a hard +/// error: every segment in the range is required WAL, so skipping one would drop +/// its changed pages from the increment and silently restore stale parent data +async fn parse_segments_parallel( + ctx: &WalWalkCtx<'_>, + first_seg: u64, + last_seg: u64, +) -> Result> { + let count = (last_seg - first_seg + 1) as usize; + let fetch = Arc::new(SegFetch { + settings: ctx.settings.clone(), + storage: ctx.storage.clone(), + timeline: ctx.timeline, + seg_size: ctx.seg_size, + compression: ctx.compression, + wal_dir: ctx.wal_dir.map(Path::to_path_buf), + }); + + let mut delta = PagedFileDeltaMap::new(); + let mut fragments: Vec> = (0..count).map(|_| None).collect(); + + let concurrency = ctx.settings.download_concurrency + 1; + { + let delta = &mut delta; + let fragments = &mut fragments; + let timeline = ctx.timeline; + let seg_size = ctx.seg_size; + let mut tasks = BoundedTasks::new(concurrency, "wal-parse", move |out: SegOut| { + let rel = out.rel; + let (locs, boundary) = out.result.with_context(|| { + let name = seg_name_from_global(timeline, first_seg + rel as u64, seg_size); + format!("wal segment {}", name.format()) + })?; + delta.add_locations(locs); + fragments[rel] = Some(boundary); + Ok(()) + }); + for seg in first_seg..=last_seg { + let fetch = fetch.clone(); + let rel = (seg - first_seg) as usize; + tasks + .spawn(async move { + let result = fetch_and_walk_segment(&fetch, seg).await; + SegOut { rel, result } + }) + .await?; + } + tasks.join().await?; + } + + // A fragment ending mid-record but not at a record start means a record + // spans more than this segment + the next — pairwise stitching can't + // recover it. Bail to the threaded serial walk + if fragments + .iter() + .flatten() + .any(|f| !f.trailing_is_record_start && !f.trailing_head.is_empty()) + { + return Ok(None); + } + + // Stitch each boundary record: head of segment i + leading tail of i+1. + // Every fragment is present: a fetch/parse error aborts join() above, so a + // None here is an internal invariant break, not a recoverable gap + for rel in 0..count.saturating_sub(1) { + let (Some(head), Some(tail)) = (&fragments[rel], &fragments[rel + 1]) else { + anyhow::bail!("missing parsed WAL fragment at offset {rel} after successful walk"); + }; + if head.trailing_head.is_empty() { + continue; // record ended exactly at the boundary + } + let mut data = Vec::with_capacity(head.trailing_head.len() + tail.leading_tail.len()); + data.extend_from_slice(&head.trailing_head); + data.extend_from_slice(&tail.leading_tail); + match parse_record_from_bytes(&data, head.page_magic) { + Ok(rec) => delta.add_locations(extract_block_locations(std::slice::from_ref(&rec))), + Err(e) => { + let name = seg_name_from_global(ctx.timeline, first_seg + rel as u64, ctx.seg_size) + .format(); + tracing::warn!( + target = "backup_push", + "boundary record after segment {name} unparseable ({e}); serial walk", + ); + return Ok(None); + } + } + } + Ok(Some(delta)) +} + +/// Fetch one segment and parse it with a fresh per-segment parser on the +/// blocking pool, returning its in-segment block locations + boundary fragments +async fn fetch_and_walk_segment( + fetch: &SegFetch, + seg: u64, +) -> Result<(Vec, SegmentBoundary)> { + let name = seg_name_from_global(fetch.timeline, seg, fetch.seg_size).format(); + let buf = fetch_segment( + &fetch.settings, + &fetch.storage, + fetch.compression, + fetch.wal_dir.as_deref(), + &name, + ) + .await?; + tokio::task::spawn_blocking(move || { + let mut locs = Vec::new(); + let boundary = walk_segment_locations(&buf, |l| locs.push(l)) + .with_context(|| format!("parse segment {name}"))?; + Ok((locs, boundary)) + }) + .await + .context("join segment parse")? +} + fn lsn_to_seg(lsn: u64, seg_size: u64) -> u64 { lsn / seg_size } -async fn fetch_and_parse_segment( +/// Per-walk invariants shared across every segment of a raw-WAL walk +struct WalWalkCtx<'a> { + settings: &'a crate::config::Settings, + storage: &'a DynStorage, + timeline: u32, + seg_size: u64, + compression: compression::Method, + /// Local `pg_wal` to read raw segments from before falling back to the + /// archive; `None` when the push has no local data dir + wal_dir: Option<&'a Path>, +} + +/// Walk raw WAL segments `[first_seg, last_seg]` into `delta`, prefetching the +/// next segment's bytes while parsing the current one (download+decode overlaps +/// the CPU-bound parse). Parsing stays serial: WAL records span segment +/// boundaries, so `parser` state threads across iterations. A missing or corrupt +/// segment is a hard error: every segment in the range is required WAL, so +/// skipping one would silently drop its changed pages from the increment +async fn walk_segments_pipelined( + ctx: &WalWalkCtx<'_>, + first_seg: u64, + last_seg: u64, + parser: &mut WalParser, + delta: &mut PagedFileDeltaMap, +) -> Result<()> { + if first_seg > last_seg { + return Ok(()); + } + let name = |s: u64| seg_name_from_global(ctx.timeline, s, ctx.seg_size).format(); + + // Prime the pipeline, then carry each prefetch into the next iteration; the + // prefetch yields None past the last segment, ending the loop + let mut pending = Some( + fetch_segment( + ctx.settings, + ctx.storage, + ctx.compression, + ctx.wal_dir, + &name(first_seg), + ) + .await, + ); + let mut seg = first_seg; + while let Some(fetched) = pending.take() { + let cur = name(seg); + let buf = fetched.with_context(|| format!("wal segment {cur}"))?; + // Parse current segment on the blocking pool while the next prefetches. + // Parser is moved in and returned via the join handle so cross-segment + // record-stitching state survives + let parser_in = std::mem::take(parser); + let parse_handle = tokio::task::spawn_blocking(move || { + let mut parser_in = parser_in; + let res = extract_locations_from_wal_file(&mut parser_in, io::Cursor::new(buf)); + (parser_in, res) + }); + + let (joined, next) = tokio::join!(parse_handle, async { + if seg < last_seg { + Some( + fetch_segment( + ctx.settings, + ctx.storage, + ctx.compression, + ctx.wal_dir, + &name(seg + 1), + ) + .await, + ) + } else { + None + } + }); + + let (parser_out, locs) = joined.context("join segment walk")?; + *parser = parser_out; + delta.add_locations(locs.with_context(|| format!("parse segment {cur}"))?); + + pending = next; + seg += 1; + } + Ok(()) +} + +/// Read one WAL segment fully into memory so the next segment can prefetch while +/// this one parses. Bounded at ~2 × seg_size in flight. +/// Bounded wait for the archiver to ship a just-switched-out WAL segment. +/// full-jitter, ~11s worst case (sum of capped backoffs over 10 attempts) +const WAL_ARCHIVE_WAIT: RetryPolicy = RetryPolicy { + max_attempts: 10, + base_delay: Duration::from_millis(100), + max_delay: Duration::from_secs(2), + jitter: true, +}; + +async fn fetch_segment( settings: &crate::config::Settings, storage: &DynStorage, - name: &str, compression: compression::Method, - parser: &mut WalParser, -) -> Result> { + wal_dir: Option<&Path>, + name: &str, +) -> Result> { + if let Some(dir) = wal_dir { + match tokio::fs::read(dir.join(name)).await { + Ok(buf) => return Ok(buf), + Err(e) if e.kind() == io::ErrorKind::NotFound => {} + Err(e) => return Err(e).with_context(|| format!("read local wal segment {name}")), + } + } let ext = compression.extension(); let key = if ext.is_empty() { format!("{}/{}", crate::pg::WAL_FOLDER, name) } else { format!("{}/{}.{}", crate::pg::WAL_FOLDER, name, ext) }; - let r = storage - .get(&key) - .await - .with_context(|| format!("get {key}"))?; - let decrypted = settings.decrypt(r); - let decoded = compression::decode(compression, decrypted); - - // Bridge the async reader to sync inside spawn_blocking so - // extract_locations_from_wal_file streams page-by-page rather than - // materialising the 16 MiB segment in a Vec. Parser is moved in and - // returned via the join handle so caller-side stitching state - // survives across segments - let parser_in = std::mem::take(parser); - let (parser_out, locs) = tokio::task::spawn_blocking(move || { - let mut parser_in = parser_in; - let sync_r = SyncIoBridge::new(decoded); - let res = extract_locations_from_wal_file(&mut parser_in, sync_r); - (parser_in, res) - }) + // A delta range's trailing segment is the one BASE_BACKUP forced a switch out + // of at start (PG do_pg_backup_start -> RequestXLogSwitch), so PG's async + // archive_command may not have shipped it to the bucket yet. It is switched + // out, so it will arrive — wait out archiver lag on NotFound before the caller + // gives up to a full backup. Transient errors are already retried in storage + let r = with_retry( + &WAL_ARCHIVE_WAIT, + |e: &StorageError| matches!(e, StorageError::NotFound(_)), + || async { storage.get(&key).await }, + ) .await - .context("join segment walk")?; - *parser = parser_out; - locs.with_context(|| format!("parse segment {name}")) + .with_context(|| format!("get {key}"))?; + let decrypted = settings.decrypt(r); + let mut decoded = compression::decode(compression, decrypted); + let mut buf = Vec::new(); + tokio::io::AsyncReadExt::read_to_end(&mut decoded, &mut buf) + .await + .with_context(|| format!("read segment {name}"))?; + Ok(buf) } #[cfg(test)] mod tests { + use std::num::NonZeroU64; + use super::*; use crate::pg::wal::segment::DEFAULT_WAL_SEG_SIZE; @@ -706,14 +1052,14 @@ mod tests { let sentinel = |from: Option<&str>, fmt: increment::Format| BackupSentinelDtoV2 { sentinel: BackupSentinelDto { - backup_start_lsn: Some(seg), - increment_from_lsn: from.map(|_| seg / 2), + backup_start_lsn: NonZeroU64::new(seg), + increment_from_lsn: from.and_then(|_| NonZeroU64::new(seg / 2)), increment_from: from.map(String::from), increment_full_name: from.map(String::from), increment_count: from.map(|_| 1), increment_format: fmt, pg_version: 170000, - backup_finish_lsn: Some(seg + 1), + backup_finish_lsn: NonZeroU64::new(seg + 1), ..Default::default() }, hostname: "h".into(), @@ -814,13 +1160,13 @@ mod tests { }); let seg0 = m.blocks_for("base/16384/16385").unwrap().unwrap(); - assert_eq!(seg0, [5].iter().copied().collect::>()); + assert_eq!(seg0, vec![5u32]); let seg1 = m.blocks_for("base/16384/16385.1").unwrap().unwrap(); - assert_eq!(seg1, [3].iter().copied().collect::>()); + assert_eq!(seg1, vec![3u32]); let seg2 = m.blocks_for("base/16384/16385.2").unwrap().unwrap(); - assert_eq!(seg2, [9].iter().copied().collect::>()); + assert_eq!(seg2, vec![9u32]); let seg3 = m.blocks_for("base/16384/16385.3").unwrap().unwrap(); assert!(seg3.is_empty()); // file has segment but no dirty blocks @@ -868,8 +1214,8 @@ mod tests { use crate::pg::backup::{BackupSentinelDto, sentinel_key}; let v2 = BackupSentinelDtoV2 { sentinel: BackupSentinelDto { - backup_start_lsn: Some(DEFAULT_WAL_SEG_SIZE), - backup_finish_lsn: Some(DEFAULT_WAL_SEG_SIZE + 1), + backup_start_lsn: NonZeroU64::new(DEFAULT_WAL_SEG_SIZE), + backup_finish_lsn: NonZeroU64::new(DEFAULT_WAL_SEG_SIZE + 1), pg_version: 170000, user_data: Some(user_data), ..Default::default() @@ -957,4 +1303,61 @@ mod tests { assert_eq!(blocks.into_iter().collect::>(), vec![7u32]); assert!(parser.current_record_data().is_empty()); } + + #[tokio::test] + async fn fold_sidecar_truncated_before_terminator_errors() { + use crate::pg::walparser::write_location_tuples; + use crate::storage::fs::FsStorage; + let dir = tempfile::tempdir().unwrap(); + let storage: DynStorage = Arc::new(FsStorage::new(dir.path()).unwrap()); + let settings = crate::config::Settings::default(); + let method = compression::Method::None; + + // Tuples with no terminator + parser state: truncated upload + let mut raw = Vec::new(); + write_location_tuples( + &mut raw, + &[BlockLocation::new(DEFAULT_SPC_NODE, 16384, 16385, 7)], + ) + .unwrap(); + + let group = delta_group_name(1, 0, DEFAULT_WAL_SEG_SIZE); + let key = delta_storage_key(&group, method); + let len = raw.len() as u64; + let r: crate::compression::AsyncReader = Box::pin(std::io::Cursor::new(raw)); + storage.put(&key, r, Some(len)).await.unwrap(); + + let err = fold_sidecar_into_map( + &settings, + &storage, + &group, + method, + PagedFileDeltaMap::new(), + ) + .await + .unwrap_err(); + assert!(format!("{err:#}").contains("truncated"), "{err:#}"); + } + + #[test] + fn merge_unions_overlapping_rels() { + // Summaries + a raw-walked gap each touch the same rel; merge must union + // their blocks, not replace + let mut a = PagedFileDeltaMap::new(); + a.add_location(BlockLocation::new(DEFAULT_SPC_NODE, 16384, 16385, 1)); + a.add_location(BlockLocation::new(DEFAULT_SPC_NODE, 16384, 16385, 3)); + let mut b = PagedFileDeltaMap::new(); + b.add_location(BlockLocation::new(DEFAULT_SPC_NODE, 16384, 16385, 3)); + b.add_location(BlockLocation::new(DEFAULT_SPC_NODE, 16384, 16385, 5)); + b.add_location(BlockLocation::new(DEFAULT_SPC_NODE, 16384, 16386, 0)); + a.merge(b); + assert_eq!( + a.blocks_for("base/16384/16385").unwrap().unwrap(), + vec![1u32, 3, 5] + ); + assert_eq!( + a.blocks_for("base/16384/16386").unwrap().unwrap(), + vec![0u32] + ); + } } diff --git a/src/pg/backup/fs_push.rs b/src/pg/backup/fs_push.rs index 816713c..f465e26 100644 --- a/src/pg/backup/fs_push.rs +++ b/src/pg/backup/fs_push.rs @@ -12,6 +12,7 @@ //! several S3 connections and CPU cores run at once instead of one use std::collections::HashMap; +use std::num::NonZeroU64; use std::os::unix::fs::PermissionsExt; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -31,18 +32,21 @@ use crate::pg::backup::delta; use crate::pg::backup::increment::Format as IncrementFormat; use crate::pg::backup::push::{self, Finalize, PushArgs}; use crate::pg::backup::tar_streamer::{ - DeltaClass, DeltaContext, IncrementBodyReader, PartWriter, classify_for_delta, + DeltaClass, DeltaContext, IncrementBodyReader, PG_PAGE_HEADER_SIZE, PartWriter, + classify_for_delta, increment_class_for_blocks, page_changed_since, }; use crate::pg::backup::{ - BACKUP_NAME_PREFIX, FileDescription, TablespaceSpec, format_backup_name, format_pg_lsn, - parse_pg_lsn, tar_part_key, + FileDescription, TablespaceSpec, format_backup_name, format_pg_lsn, parse_pg_lsn, tar_part_key, }; use crate::pg::replication::PgConfig; use crate::pg::replication::base_backup::ChannelReader; use crate::pg::replication::conn::ReplicationConn; use crate::storage::DynStorage; +// walk-relative path used to detect pg_control during the tree walk const PG_CONTROL_ENTRY: &str = "global/pg_control"; +// tar entry name for the pg_control tee, restores without a files_metadata entry +const TAR_PG_CONTROL_ENTRY: &str = "/global/pg_control"; /// Coalesce file-body reads. tokio_tar copies each body through io::copy's 8 KB /// buffer, and every tokio::fs::File read is a blocking-pool dispatch; reading a @@ -257,13 +261,6 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> let seg_size = crate::pg::wal::segment::wal_segment_size(); let base_name = format_backup_name(timeline, start_lsn, seg_size); - let backup_name = match parent.as_ref() { - Some(p) => format!( - "{base_name}_D_{}", - p.name.strip_prefix(BACKUP_NAME_PREFIX).unwrap_or(&p.name), - ), - None => base_name.clone(), - }; // Build the delta map now that the upper LSN bound is known. Failure drops // to a full backup (wal-g semantics: a partial delta is worse than a full) @@ -279,6 +276,13 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> ) .await; + // Delta backups get a `_D_` suffix (wal-g + // convention). delete/list/show all key off this. Chosen only after the + // delta map built: a failed build falls back to a full, so name must not + // claim `_D_` when sentinel reports FULL + let backup_name = + push::resolve_backup_name(&base_name, parent.as_ref(), delta_context.is_some()); + let tar_size = if args.tar_size_threshold == 0 { crate::pg::backup::tar_streamer::DEFAULT_TAR_SIZE_THRESHOLD } else { @@ -544,13 +548,110 @@ async fn append_entry( Ok(false) } DeltaClass::Increment { - header_bytes, - blocks, - total_size, + mut header_bytes, + mut blocks, + mut total_size, } => { - let Some(mut file) = open_walked(&e.abs).await? else { + // WAL/summary candidates over-mark: every block touched in the window, + // including pages settled below the parent's start LSN. Drop those to + // match wal-g selectivity. One fd for prepass and body so a concurrent + // unlink can't swap the file + let format = delta_context + .as_ref() + .expect("increment implies delta context") + .format; + let parent_start_lsn = delta_context.as_ref().and_then(|c| c.parent_start_lsn); + let cand_count = blocks.len(); + let abs = e.abs.clone(); + let cand = std::mem::take(&mut blocks); + let (std_file, kept) = tokio::task::spawn_blocking( + move || -> Result<(Option, Vec)> { + let file = match std::fs::File::open(&abs) { + Ok(f) => f, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => { + return Ok((None, Vec::new())); + } + Err(err) => { + return Err(err).with_context(|| format!("open {}", abs.display())); + } + }; + let kept = match parent_start_lsn { + Some(lsn) => filter_changed_blocks(&file, &cand, lsn.get()) + .with_context(|| format!("page-lsn filter {}", abs.display()))?, + None => cand, + }; + Ok((Some(file), kept)) + }, + ) + .await + .context("join page-lsn filter")??; + + // Vanished mid-backup (DROP TABLE etc.); omit, matching wal-g + let Some(std_file) = std_file else { + tracing::warn!( + target = "backup_push", + "{} vanished during backup; skipping", + e.abs.display(), + ); return Ok(false); }; + + // Filter only drops in order, so equal counts ⇒ unchanged set: keep + // original header. Otherwise re-encode + if kept.len() != cand_count { + match increment_class_for_blocks(format, e.size, kept) { + DeltaClass::Skip => { + res.files.insert( + e.tar_path.clone(), + FileDescription { + is_incremented: false, + is_skipped: true, + mtime: mtime_dt(e.mtime), + updates_count: 0, + }, + ); + return Ok(false); + } + DeltaClass::Increment { + header_bytes: h, + blocks: b, + total_size: t, + } => { + header_bytes = h; + blocks = b; + total_size = t; + } + // Re-encode writes to a Vec so can't fail; ship full + // defensively rather than emit a malformed delta + DeltaClass::Passthrough => { + let file = BufReader::with_capacity( + FILE_READ_BUF, + tokio::fs::File::from_std(std_file), + ); + let body = FixedSizeReader::new(file, e.size); + let mut h = header(e, EntryType::Regular, e.size); + builder + .append_data(&mut h, &e.tar_path, body) + .await + .with_context(|| format!("append {}", e.tar_path))?; + res.files.insert( + e.tar_path.clone(), + FileDescription { + is_incremented: false, + is_skipped: false, + mtime: mtime_dt(e.mtime), + updates_count: 0, + }, + ); + return Ok(true); + } + } + } else { + blocks = kept; + } + + let mut file = + BufReader::with_capacity(FILE_READ_BUF, tokio::fs::File::from_std(std_file)); let mut h = header(e, EntryType::Regular, total_size); let body = IncrementBodyReader::new(header_bytes, &mut file, blocks, e.size); builder @@ -592,6 +693,35 @@ async fn append_entry( } } +/// Trim the WAL/summary candidate set to blocks whose on-disk page changed +/// at/after `parent_start_lsn` (wal-g `SelectNewValidPage` selectivity). One +/// positioned read of the 24-byte page header per candidate; a short read (file +/// truncated/torn since the walk) keeps the block, so the filter never drops a +/// possibly-changed page. `blocks` is ascending; the result preserves order, so +/// the caller can detect "nothing trimmed" by length alone +fn filter_changed_blocks( + file: &std::fs::File, + blocks: &[u32], + parent_start_lsn: u64, +) -> std::io::Result> { + use std::os::unix::fs::FileExt; + let mut hdr = [0u8; PG_PAGE_HEADER_SIZE]; + let mut kept = Vec::with_capacity(blocks.len()); + for &b in blocks { + let offset = b as u64 * delta::PG_PAGE_SIZE; + match file.read_exact_at(&mut hdr, offset) { + Ok(()) => { + if page_changed_since(&hdr, parent_start_lsn) { + kept.push(b); + } + } + Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => kept.push(b), + Err(e) => return Err(e), + } + } + Ok(kept) +} + /// Open a walked file, tolerating it vanishing between the walk and the pack: /// DROP TABLE unlinks a relation, pg_internal.init is recreated, etc. Returns /// None on ENOENT so the caller omits it — matching wal-g, which skips a file @@ -695,11 +825,16 @@ async fn build_pg_control_tar(abs: &Path) -> Result { .with_context(|| format!("read {}", abs.display()))?; let mut b = Builder::new(Vec::new()); let mut h = Header::new_gnu(); + // leading-slash name matches wal-g; set_path rejects absolute paths, so + // write the name bytes directly (fits in the 100-byte GNU name field) + let name = TAR_PG_CONTROL_ENTRY.as_bytes(); + h.as_old_mut().name[..name.len()].copy_from_slice(name); h.set_size(data.len() as u64); h.set_mode(0o600); h.set_mtime(0); h.set_entry_type(EntryType::Regular); - b.append_data(&mut h, PG_CONTROL_ENTRY, &data[..]) + h.set_cksum(); + b.append(&h, &data[..]) .await .context("append pg_control tee")?; b.finish().await.context("finish pg_control tar")?; @@ -924,8 +1059,19 @@ async fn build_delta_context( return None; } let map = if args.delta_from_wal_summaries { - push::build_delta_map_from_summaries(Some(pgdata), timeline, p.start_lsn, start_lsn) + push::build_delta_map_from_summaries( + settings, + storage, + Some(pgdata), + timeline, + p.start_lsn, + start_lsn, + ) + .await } else { + // Serve the walk from the local pg_wal; the archive is the fallback for + // segments PG has already recycled + let wal_dir = pgdata.join("pg_wal"); delta::build_delta_map_from_wal( settings, storage, @@ -933,6 +1079,7 @@ async fn build_delta_context( p.start_lsn, start_lsn, settings.compression, + Some(&wal_dir), ) .await }; @@ -947,6 +1094,9 @@ async fn build_delta_context( map: Arc::new(map), format: increment_format, parent_files: p.parent_files.clone(), + // fs source reads page headers to trim blocks settled below + // the parent (page-LSN final-state filter, wal-g selectivity) + parent_start_lsn: NonZeroU64::new(p.start_lsn), }) } Err(e) => { @@ -1338,4 +1488,151 @@ mod tests { assert!(res.files.contains_key("base/1/1234")); assert!(!res.files.contains_key("base/1/5678")); } + + // ─── page-LSN final-state filter (item 3) ─────────────────────────────── + + const PAGE: usize = delta::PG_PAGE_SIZE as usize; + + /// One paged relation file with a valid header per block carrying the given + /// LSN. Bytes past the header are zero — enough for the filter & wi1 decode + fn paged_file_with_lsns(lsns: &[u64]) -> Vec { + let mut body = vec![0u8; lsns.len() * PAGE]; + for (i, &lsn) in lsns.iter().enumerate() { + let o = i * PAGE; + body[o..o + 4].copy_from_slice(&((lsn >> 32) as u32).to_le_bytes()); // pd_lsn hi + body[o + 4..o + 8].copy_from_slice(&(lsn as u32).to_le_bytes()); // pd_lsn lo + body[o + 12..o + 14].copy_from_slice(&24u16.to_le_bytes()); // pd_lower + body[o + 14..o + 16].copy_from_slice(&(PAGE as u16).to_le_bytes()); // pd_upper + body[o + 16..o + 18].copy_from_slice(&(PAGE as u16).to_le_bytes()); // pd_special + body[o + 18..o + 20].copy_from_slice(&(0x2000u16 | 4).to_le_bytes()); // BLCKSZ|v4 + } + body + } + + #[test] + fn filter_changed_blocks_drops_settled() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("rel"); + std::fs::write(&path, paged_file_with_lsns(&[100, 150, 250, 300])).unwrap(); + let f = std::fs::File::open(&path).unwrap(); + + // parent start 200: blocks 0,1 settled below ⇒ dropped; 2,3 kept + assert_eq!( + filter_changed_blocks(&f, &[0, 1, 2, 3], 200).unwrap(), + vec![2, 3] + ); + // all candidates below parent ⇒ empty + assert!(filter_changed_blocks(&f, &[0, 1], 200).unwrap().is_empty()); + // candidate past EOF (file has 4 blocks): short read keeps it + assert_eq!(filter_changed_blocks(&f, &[9], 200).unwrap(), vec![9]); + } + + fn rel(db: u32, rel_node: u32) -> crate::pg::walparser::RelFileNode { + crate::pg::walparser::RelFileNode { + spc_node: delta::DEFAULT_SPC_NODE, + db_node: db, + rel_node, + } + } + + /// End-to-end through pack_worker: a paged file whose WAL-candidate blocks + /// include one settled below the parent (trimmed out of the increment) and a + /// file whose only candidate settled below (skipped entirely) + #[tokio::test] + async fn delta_page_lsn_filter_trims_and_skips() { + use crate::pg::backup::delta::PagedFileDeltaMap; + use crate::pg::backup::increment::read_increment_header; + use crate::pg::walparser::BlockLocation; + + let dir = tempfile::tempdir().unwrap(); + let root = dir.path().join("pgdata"); + write_file(&root, "PG_VERSION", b"16"); + // trimmed: blocks 1 (lsn 150 < 200, drop) & 3 (lsn 300, keep) + write_file( + &root, + "base/16384/16400", + &paged_file_with_lsns(&[100, 150, 100, 300]), + ); + // skipped: only dirty block 1 (lsn 150 < 200) settled below parent + write_file(&root, "base/16384/16401", &paged_file_with_lsns(&[50, 150])); + + let mut map = PagedFileDeltaMap::new(); + map.add_location(BlockLocation { + rel: rel(16384, 16400), + block_no: 1, + }); + map.add_location(BlockLocation { + rel: rel(16384, 16400), + block_no: 3, + }); + map.add_location(BlockLocation { + rel: rel(16384, 16401), + block_no: 1, + }); + + let parent_files: Arc> = Arc::new( + ["base/16384/16400", "base/16384/16401"] + .iter() + .map(|s| s.to_string()) + .collect(), + ); + let ctx = DeltaContext { + map: Arc::new(map), + format: IncrementFormat::Wi1, + parent_files, + parent_start_lsn: NonZeroU64::new(200), + }; + + let batch_rx = walk_batches(&root, 1 << 30).await; + let store_dir = tempfile::tempdir().unwrap(); + let storage: DynStorage = Arc::new(FsStorage::new(store_dir.path()).unwrap()); + let settings = Settings { + compression: Method::None, + ..Default::default() + }; + let name = "base_delta"; + let res = pack_worker( + batch_rx, + Arc::new(AtomicU32::new(0)), + settings, + storage.clone(), + name.to_string(), + Some(ctx), + ) + .await + .unwrap(); + + // 16400 trimmed to an increment; 16401 settled-only ⇒ skipped + let m400 = res.files.get("base/16384/16400").expect("16400 meta"); + assert!(m400.is_incremented && !m400.is_skipped); + let m401 = res.files.get("base/16384/16401").expect("16401 meta"); + assert!(m401.is_skipped && !m401.is_incremented); + + // Decode the 16400 increment from the emitted parts: only block 3 survives + let mut inc_blocks = None; + for file_no in 1..=res.max_file_no { + let key = tar_part_key(name, file_no, ""); + let mut bytes = Vec::new(); + storage + .get(&key) + .await + .unwrap() + .read_to_end(&mut bytes) + .await + .unwrap(); + let mut ar = tar::Archive::new(&bytes[..]); + for e in ar.entries().unwrap() { + let mut e = e.unwrap(); + if e.path().unwrap().to_string_lossy() == "base/16384/16400" { + let mut body = Vec::new(); + e.read_to_end(&mut body).unwrap(); + let h = read_increment_header(&body[..]).unwrap(); + inc_blocks = Some(h.blocks); + } + // 16401 was skipped: it must not appear in any part + assert_ne!(e.path().unwrap().to_string_lossy(), "base/16384/16401"); + } + } + assert_eq!(inc_blocks, Some(vec![3]), "settled block 1 must be trimmed"); + } } diff --git a/src/pg/backup/list.rs b/src/pg/backup/list.rs index 5729a7c..a10aac3 100644 --- a/src/pg/backup/list.rs +++ b/src/pg/backup/list.rs @@ -1,6 +1,8 @@ //! backup-list: enumerate sentinel files under basebackups_005/, fetch each, //! print backup names with start/finish times and LSNs +use std::num::NonZeroU64; + use anyhow::{Context, Result}; use futures::StreamExt; @@ -13,8 +15,8 @@ pub struct BackupSummary { pub name: String, pub start_time: Option>, pub finish_time: Option>, - pub start_lsn: Option, - pub finish_lsn: Option, + pub start_lsn: Option, + pub finish_lsn: Option, pub pg_version: i32, pub hostname: Option, pub is_permanent: bool, @@ -129,8 +131,8 @@ mod tests { fn sentinel(host: &str, ts: i64, perm: bool) -> BackupSentinelDtoV2 { BackupSentinelDtoV2 { sentinel: BackupSentinelDto { - backup_start_lsn: Some(0x0200_0000), - backup_finish_lsn: Some(0x0200_1000), + backup_start_lsn: NonZeroU64::new(0x0200_0000), + backup_finish_lsn: NonZeroU64::new(0x0200_1000), pg_version: 160003, uncompressed_size: 2048, compressed_size: 1024, diff --git a/src/pg/backup/mod.rs b/src/pg/backup/mod.rs index ecf4d48..3f26865 100644 --- a/src/pg/backup/mod.rs +++ b/src/pg/backup/mod.rs @@ -3,6 +3,7 @@ //! Wire format mirrors wal-g so walrus and wal-g can share buckets use std::collections::HashMap; +use std::num::NonZeroU64; use anyhow::{Context, Result, anyhow}; use chrono::{DateTime, Utc}; @@ -123,8 +124,17 @@ pub fn parse_pg_lsn(s: &str) -> Result { Ok((hi << 32) | lo) } -pub fn format_pg_lsn(lsn: u64) -> String { - format!("{:X}/{:X}", lsn >> 32, lsn as u32) +/// Canonical postgres LSN rendering `hi/lo` in uppercase hex. Returns a +/// `Display` adapter so callers format in place without allocating; use +/// `.to_string()` when an owned `String` is required +pub fn format_pg_lsn(lsn: u64) -> impl std::fmt::Display { + struct PgLsn(u64); + impl std::fmt::Display for PgLsn { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:X}/{:X}", self.0 >> 32, self.0 as u32) + } + } + PgLsn(lsn) } /// Match `base_<24hex>` and optional `_D_<24hex>` delta and `_<8hex>` LSN @@ -309,10 +319,15 @@ pub struct FileDescription { /// Sentinel: subset of wal-g BackupSentinelDto. Skips delta-backup fields we do not produce #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct BackupSentinelDto { - #[serde(rename = "LSN", default)] - pub backup_start_lsn: Option, - #[serde(rename = "DeltaLSN", default, skip_serializing_if = "Option::is_none")] - pub increment_from_lsn: Option, + #[serde(rename = "LSN", default, with = "lsn_opt")] + pub backup_start_lsn: Option, + #[serde( + rename = "DeltaLSN", + default, + with = "lsn_opt", + skip_serializing_if = "Option::is_none" + )] + pub increment_from_lsn: Option, #[serde(rename = "DeltaFrom", default, skip_serializing_if = "Option::is_none")] pub increment_from: Option, #[serde( @@ -339,8 +354,8 @@ pub struct BackupSentinelDto { #[serde(rename = "PgVersion", default)] pub pg_version: i32, - #[serde(rename = "FinishLSN", default)] - pub backup_finish_lsn: Option, + #[serde(rename = "FinishLSN", default, with = "lsn_opt")] + pub backup_finish_lsn: Option, #[serde( rename = "SystemIdentifier", default, @@ -496,6 +511,22 @@ fn is_zero_i64(v: &i64) -> bool { *v == 0 } +/// Serde for LSN fields shared with wal-g: serialize as a plain JSON number, +/// deserialize mapping 0 (InvalidXLogRecPtr) / null / absent -> None so reading +/// foreign metadata never errors on a missing LSN +mod lsn_opt { + use super::NonZeroU64; + use serde::{Deserialize, Deserializer, Serialize, Serializer}; + + pub fn serialize(v: &Option, s: S) -> Result { + v.map(NonZeroU64::get).serialize(s) + } + + pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { + Ok(Option::::deserialize(d)?.and_then(NonZeroU64::new)) + } +} + fn is_false(v: &bool) -> bool { !*v } @@ -537,12 +568,12 @@ mod tests { #[test] fn formats_lsn_uppercase() { - assert_eq!(format_pg_lsn(0x0300_0000), "0/3000000"); - assert_eq!(format_pg_lsn((2u64 << 32) | 0xab), "2/AB"); + assert_eq!(format_pg_lsn(0x0300_0000).to_string(), "0/3000000"); + assert_eq!(format_pg_lsn((2u64 << 32) | 0xab).to_string(), "2/AB"); // high word > 10 separates hex from decimal: "2A" vs decimal "42" - assert_eq!(format_pg_lsn((0x2A_u64 << 32) | 0x16), "2A/16"); - assert_eq!(format_pg_lsn((0xFF_u64 << 32) | 0xFF), "FF/FF"); - assert_eq!(format_pg_lsn(u64::MAX), "FFFFFFFF/FFFFFFFF"); + assert_eq!(format_pg_lsn((0x2A_u64 << 32) | 0x16).to_string(), "2A/16"); + assert_eq!(format_pg_lsn((0xFF_u64 << 32) | 0xFF).to_string(), "FF/FF"); + assert_eq!(format_pg_lsn(u64::MAX).to_string(), "FFFFFFFF/FFFFFFFF"); } #[test] @@ -555,7 +586,7 @@ mod tests { (0xA_u64 << 32) | 0xDEAD_BEEF, u64::MAX, ] { - assert_eq!(parse_pg_lsn(&format_pg_lsn(lsn)).unwrap(), lsn); + assert_eq!(parse_pg_lsn(&format_pg_lsn(lsn).to_string()).unwrap(), lsn); } } @@ -581,9 +612,9 @@ mod tests { #[test] fn sentinel_v1_serde_roundtrip() { let s = BackupSentinelDto { - backup_start_lsn: Some(0x0300_0000), + backup_start_lsn: NonZeroU64::new(0x0300_0000), pg_version: 160003, - backup_finish_lsn: Some(0x0300_1000), + backup_finish_lsn: NonZeroU64::new(0x0300_1000), system_identifier: Some(7000000000000000000), uncompressed_size: 1024, compressed_size: 512, @@ -597,22 +628,34 @@ mod tests { assert!(j.contains("\"PgVersion\":160003")); assert!(j.contains("\"FilesMetadataDisabled\":true")); let back: BackupSentinelDto = serde_json::from_str(&j).unwrap(); - assert_eq!(back.backup_start_lsn, Some(0x0300_0000)); + assert_eq!(back.backup_start_lsn, NonZeroU64::new(0x0300_0000)); assert_eq!(back.system_identifier, Some(7000000000000000000)); } + #[test] + fn lsn_zero_null_absent_deserialize_to_none() { + // 0 = InvalidXLogRecPtr; foreign/zero metadata must read as None, not error + let back: BackupSentinelDto = serde_json::from_str( + r#"{"LSN":0,"FinishLSN":null,"UncompressedSize":0,"CompressedSize":0}"#, + ) + .unwrap(); + assert_eq!(back.backup_start_lsn, None); + assert_eq!(back.backup_finish_lsn, None); + assert_eq!(back.increment_from_lsn, None); + } + #[test] fn increment_format_sentinel_field() { use increment::Format; let mut s = BackupSentinelDto { - backup_start_lsn: Some(1), - increment_from_lsn: Some(0), + backup_start_lsn: NonZeroU64::new(1), + increment_from_lsn: NonZeroU64::new(1), increment_from: Some("base_x".into()), increment_full_name: Some("base_x".into()), increment_count: Some(1), increment_format: Format::Native, pg_version: 170000, - backup_finish_lsn: Some(2), + backup_finish_lsn: NonZeroU64::new(2), ..Default::default() }; // Native deltas record the format @@ -658,9 +701,9 @@ mod tests { fn sentinel_v2_extra_fields_present() { let s = BackupSentinelDtoV2 { sentinel: BackupSentinelDto { - backup_start_lsn: Some(1), + backup_start_lsn: NonZeroU64::new(1), pg_version: 160003, - backup_finish_lsn: Some(2), + backup_finish_lsn: NonZeroU64::new(2), files_metadata_disabled: true, ..Default::default() }, diff --git a/src/pg/backup/push.rs b/src/pg/backup/push.rs index 81372ae..37ae4d4 100644 --- a/src/pg/backup/push.rs +++ b/src/pg/backup/push.rs @@ -11,6 +11,7 @@ //! Local PGDATA is optional; absent it, the sentinel records the PG-reported //! `data_directory` and we never touch the local filesystem +use std::num::NonZeroU64; use std::path::PathBuf; use std::pin::Pin; use std::sync::Arc; @@ -217,16 +218,6 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> let seg_size = crate::pg::wal::segment::wal_segment_size(); let base_name = format_backup_name(info.timeline, info.start_lsn, seg_size); debug_assert!(base_name.starts_with(BACKUP_NAME_PREFIX)); - // Delta backups get a `_D_` suffix - // (wal-g convention). delete/list/show all key off this - let resolved_name = match parent.as_ref() { - Some(p) => format!( - "{base_name}_D_{}", - p.name.strip_prefix(BACKUP_NAME_PREFIX).unwrap_or(&p.name), - ), - None => base_name.clone(), - }; - backup_name = Some(resolved_name); tracing::info!( target = "backup_push", "BASE_BACKUP started: lsn={} timeline={} tablespaces={}", @@ -250,11 +241,15 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> ); } else if args.delta_from_wal_summaries { match build_delta_map_from_summaries( + settings, + &storage, args.pgdata.as_deref(), info.timeline, p.start_lsn, info.start_lsn, - ) { + ) + .await + { Ok(map) => { tracing::info!( target = "backup_push", @@ -267,6 +262,8 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> map: Arc::new(map), format: increment_format, parent_files: p.parent_files.clone(), + // stream source has no random page access + parent_start_lsn: None, }); } Err(e) => { @@ -285,6 +282,7 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> p.start_lsn, info.start_lsn, settings.compression, + None, ) .await { @@ -300,6 +298,8 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> map: Arc::new(map), format: increment_format, parent_files: p.parent_files.clone(), + // stream source has no random page access + parent_start_lsn: None, }); } Err(e) => { @@ -312,6 +312,12 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> } } } + + backup_name = Some(resolve_backup_name( + &base_name, + parent.as_ref(), + delta_context.is_some(), + )); } BackupEvent::Archive { meta, body } => { let name = backup_name @@ -557,31 +563,17 @@ pub(crate) async fn finalize_backup(f: Finalize<'_>) -> Result<()> { let hostname = hostname().unwrap_or_default(); let finish_time = chrono::Utc::now(); - // Wire the parent linkage into the sentinel only when increment - // generation actually ran (delta_context is set). If the delta map - // build failed earlier, parent stays informational but the sentinel - // must claim FULL — otherwise restore would walk a chain whose - // increments don't exist let (incr_from_lsn, incr_from_name, incr_full_name, incr_count, incr_format) = - match (parent, delta_context) { - (Some(p), Some(ctx)) => ( - Some(p.start_lsn), - Some(p.name.clone()), - Some(resolve_increment_full_name(p)), - Some(p.increment_count as i32), - ctx.format, - ), - _ => (None, None, None, None, IncrementFormat::default()), - }; + increment_sentinel_fields(parent, delta_context); let sentinel = BackupSentinelDto { - backup_start_lsn: Some(start_lsn), + backup_start_lsn: NonZeroU64::new(start_lsn), increment_from_lsn: incr_from_lsn, increment_from: incr_from_name, increment_full_name: incr_full_name, increment_count: incr_count, increment_format: incr_format, pg_version, - backup_finish_lsn: Some(end_lsn), + backup_finish_lsn: NonZeroU64::new(end_lsn), system_identifier: Some(system_identifier), uncompressed_size, compressed_size, @@ -725,6 +717,26 @@ pub(crate) fn wrap_counted_reader(input: AsyncReader, counter: Arc) - #[allow(dead_code)] fn _bytes_marker(_: BytesMut) {} +/// Resolve the stored backup name. A delta backup gets a +/// `_D_` suffix (wal-g convention; delete/list/show key +/// off it), but only when increment generation actually ran (`has_delta`). A +/// failed delta-map build falls back to a full, so the name must stay plain — +/// the sentinel reports FULL and a `_D_` name would claim a chain the +/// increments don't back +pub(crate) fn resolve_backup_name( + base_name: &str, + parent: Option<&PrevBackupInfo>, + has_delta: bool, +) -> String { + match parent { + Some(p) if has_delta => format!( + "{base_name}_D_{}", + p.name.strip_prefix(BACKUP_NAME_PREFIX).unwrap_or(&p.name), + ), + _ => base_name.to_string(), + } +} + /// Pick the chain-root name to record under `DeltaFullName`. /// `PrevBackupInfo.increment_full_name` is empty when the parent IS the /// chain root (no further indirection in V2 sentinel), in which case the @@ -737,9 +749,50 @@ fn resolve_increment_full_name(p: &PrevBackupInfo) -> String { } } +/// Sentinel increment linkage `(from_lsn, from_name, full_name, count, format)`, +/// wired only when increment generation actually ran (`delta_context` set). A +/// failed delta build leaves `delta_context` None so every field stays empty & +/// the sentinel reports FULL — restore must not walk a chain whose increments +/// were never written. Shared by both backup-push source paths via +/// `finalize_backup` +fn increment_sentinel_fields( + parent: Option<&PrevBackupInfo>, + delta_context: Option<&DeltaContext>, +) -> ( + Option, + Option, + Option, + Option, + IncrementFormat, +) { + match (parent, delta_context) { + (Some(p), Some(ctx)) => ( + NonZeroU64::new(p.start_lsn), + Some(p.name.clone()), + Some(resolve_increment_full_name(p)), + Some(p.increment_count as i32), + ctx.format, + ), + _ => (None, None, None, None, IncrementFormat::default()), + } +} + /// PG17 wal-summaries → delta map. Returns an error if local PGDATA is absent -/// since the summaries live on the server's filesystem -pub(crate) fn build_delta_map_from_summaries( +/// since the summaries live on the server's filesystem. +/// +/// Summaries rarely span the whole `[first_used, first_not_used)` request: +/// retention drops the oldest (head gap) and the summarizer trails the backup +/// LSN (tail gap). Each uncovered span is raw-walked from the local `pg_wal` +/// (archive fallback for recycled segments) and unioned in — a few segments vs +/// re-uploading the cluster as a full backup, and no summarizer-lag race. Same +/// WAL walk as the non-summaries path, so segments resolve identically. +/// +/// No overlapping summaries at all (`NoSummariesForRange`) degrades to the same +/// raw walk over the entire range rather than failing to a full backup; only a +/// gap *between* present summaries stays fatal (see `select_for_range`) +pub(crate) async fn build_delta_map_from_summaries( + settings: &Settings, + storage: &DynStorage, pgdata: Option<&std::path::Path>, timeline: u32, first_used_lsn: u64, @@ -747,17 +800,73 @@ pub(crate) fn build_delta_map_from_summaries( ) -> Result { let pgdata = pgdata.ok_or_else(|| anyhow!("--delta-from-wal-summaries requires local PGDATA"))?; - let map = crate::pg::wal_summaries::read_for_range( + let (mut map, covered_start, covered_end) = match crate::pg::wal_summaries::read_for_range( pgdata, timeline, first_used_lsn, first_not_used_lsn, - ) - .with_context(|| { - format!( - "read WAL summaries [{first_used_lsn:X}, {first_not_used_lsn:X}) timeline {timeline}" + ) { + Ok(v) => v, + // no summaries overlap range (retention dropped all, or summarizer not + // yet caught up): treat whole span as one gap, raw-walked below — same + // mechanism as a head/tail gap, just covering everything. A gap *between* + // present summaries stays fatal (see select_for_range) + Err(crate::pg::wal_summaries::SummaryError::NoSummariesForRange { .. }) => { + tracing::info!( + target = "backup_push", + "no WAL summaries cover [{}, {}) timeline {timeline}; raw-walking whole range", + format_pg_lsn(first_used_lsn), + format_pg_lsn(first_not_used_lsn), + ); + ( + crate::pg::backup::delta::PagedFileDeltaMap::new(), + first_used_lsn, + first_used_lsn, + ) + } + Err(e) => { + return Err(e).with_context(|| { + format!( + "read WAL summaries [{}, {}) timeline {timeline}", + format_pg_lsn(first_used_lsn), + format_pg_lsn(first_not_used_lsn) + ) + }); + } + }; + let wal_dir = pgdata.join("pg_wal"); + for (lo, hi) in [ + (first_used_lsn, covered_start), + (covered_end, first_not_used_lsn), + ] { + if lo >= hi { + continue; + } + tracing::info!( + target = "backup_push", + "WAL summaries miss [{}, {}); raw-walking it", + format_pg_lsn(lo), + format_pg_lsn(hi), + ); + let gap = delta::build_delta_map_from_wal( + settings, + storage, + timeline, + lo, + hi, + settings.compression, + Some(&wal_dir), ) - })?; + .await + .with_context(|| { + format!( + "raw-walk WAL summary gap [{}, {}) timeline {timeline}", + format_pg_lsn(lo), + format_pg_lsn(hi) + ) + })?; + map.merge(gap); + } Ok(map) } @@ -765,11 +874,155 @@ pub(crate) fn build_delta_map_from_summaries( mod tests { use super::*; + fn sample_parent(name: &str) -> PrevBackupInfo { + PrevBackupInfo { + name: name.into(), + start_lsn: 0x0200_0000, + timeline: 1, + finish_lsn: 0, + increment_full_name: String::new(), + increment_count: 1, + is_permanent: false, + system_identifier: None, + user_data: None, + parent_increment_format: None, + parent_files: Arc::new(std::collections::HashSet::new()), + } + } + + /// Smallest valid WAL summary: magic, the 24-byte zero sentinel (no entries), + /// then the CRC32C over both. Parses to an empty change set with valid + /// coverage, isolating downstream gap-walk behaviour from summary contents + fn minimal_summary() -> Vec { + // BLOCK_REF_TABLE_MAGIC (postgres common/blkreftable.c) + let mut bytes = 0x652b_137bu32.to_le_bytes().to_vec(); + bytes.extend_from_slice(&[0u8; 24]); // sentinel: end of entries + let mut h = crate::pg::wal_summaries::Crc32cHasher::new(); + h.update(&bytes); + bytes.extend_from_slice(&h.finalize().to_le_bytes()); + bytes + } + #[test] - fn delta_map_from_summaries_requires_pgdata() { + fn delta_failure_yields_plain_full_name() { + // wal-g `_D_` suffix only when increment generation ran; a failed delta + // map build falls back to a full whose name must not claim a chain the + // increments don't back + let parent = sample_parent("base_000000010000000000000002"); + let base = "base_000000010000000000000005"; + + // Delta map built → `_D_` suffix + assert_eq!( + resolve_backup_name(base, Some(&parent), true), + "base_000000010000000000000005_D_000000010000000000000002", + ); + // Delta map build failed → plain full name, never `_D_` + let full = resolve_backup_name(base, Some(&parent), false); + assert_eq!(full, base); + assert!( + !full.contains("_D_"), + "full fallback must not claim a chain" + ); + // No parent at all → plain full name + assert_eq!(resolve_backup_name(base, None, true), base); + } + + #[tokio::test] + async fn delta_map_from_summaries_requires_pgdata() { // Summaries live on the PG host filesystem; without local PGDATA the map // can't be read, so the wrapper must bail before touching disk - let err = build_delta_map_from_summaries(None, 1, 0x100, 0x200).unwrap_err(); + let tmp = tempfile::tempdir().unwrap(); + let settings = Settings::default(); + let storage: DynStorage = Arc::new(crate::storage::fs::FsStorage::new(tmp.path()).unwrap()); + let err = build_delta_map_from_summaries(&settings, &storage, None, 1, 0x100, 0x200) + .await + .unwrap_err(); assert!(format!("{err:#}").contains("PGDATA"), "{err:#}"); } + + #[tokio::test] + async fn summary_tail_gap_missing_wal_fails_to_full() { + // Summaries cover [seg1, seg2); the tail gap [seg2, seg2+100) must raw-walk + // segment 2, absent from both local pg_wal and the archive. That walk + // errors, so the whole delta-map build fails — the push then takes a full + // backup with a plain base name (no `_D_` chain it can't back) + let seg = 0x0100_0000u64; // 16 MiB WAL segment + let tmp = tempfile::tempdir().unwrap(); + let pgdata = tmp.path().join("pgdata"); + let summaries = pgdata.join("pg_wal/summaries"); + std::fs::create_dir_all(&summaries).unwrap(); + // timeline 1, start=seg, end=2*seg: covers exactly segment 1, no tail + let fname = format!( + "{:08X}{:08X}{:08X}{:08X}{:08X}.summary", + 1u32, + 0u32, + seg as u32, + 0u32, + (2 * seg) as u32, + ); + std::fs::write(summaries.join(fname), minimal_summary()).unwrap(); + + let bucket = tmp.path().join("bucket"); + std::fs::create_dir_all(&bucket).unwrap(); + let storage: DynStorage = Arc::new(crate::storage::fs::FsStorage::new(&bucket).unwrap()); + let settings = Settings::default(); + + let err = build_delta_map_from_summaries( + &settings, + &storage, + Some(&pgdata), + 1, + seg, // first_used = parent start + 2 * seg + 100, // first_not_used: tail gap inside segment 2 + ) + .await + .unwrap_err(); + assert!( + format!("{err:#}").contains("WAL summary gap"), + "tail-gap raw walk must fail on the missing segment: {err:#}" + ); + + // Build failed → has_delta=false → plain base name, never `_D_` + let parent = sample_parent("base_000000010000000000000001"); + let base = "base_000000010000000000000002"; + let name = resolve_backup_name(base, Some(&parent), false); + assert_eq!(name, base); + assert!( + !name.contains("_D_"), + "full fallback must not claim a chain" + ); + } + + #[test] + fn delta_failure_clears_increment_sentinel_fields() { + // delta_context None (failed/absent delta build) → every increment + // linkage field empty so the sentinel reports FULL. finalize_backup is + // shared by the streaming & filesystem paths, so this one gate covers both + let parent = sample_parent("base_000000010000000000000002"); + let (from_lsn, from_name, full_name, count, format) = + increment_sentinel_fields(Some(&parent), None); + assert!(from_lsn.is_none()); + assert!(from_name.is_none()); + assert!(full_name.is_none()); + assert!(count.is_none()); + assert_eq!(format, IncrementFormat::default()); + } + + #[test] + fn increment_sentinel_fields_populated_with_delta() { + // A real delta build (delta_context set) wires full parent linkage + let parent = sample_parent("base_000000010000000000000002"); + let ctx = DeltaContext { + map: Arc::new(crate::pg::backup::delta::PagedFileDeltaMap::new()), + format: IncrementFormat::default(), + parent_files: Arc::new(std::collections::HashSet::new()), + parent_start_lsn: None, + }; + let (from_lsn, from_name, full_name, count, _) = + increment_sentinel_fields(Some(&parent), Some(&ctx)); + assert_eq!(from_lsn, NonZeroU64::new(parent.start_lsn)); + assert_eq!(from_name.as_deref(), Some(parent.name.as_str())); + assert!(full_name.is_some()); + assert_eq!(count, Some(parent.increment_count as i32)); + } } diff --git a/src/pg/backup/show.rs b/src/pg/backup/show.rs index e4c1b8c..a47cace 100644 --- a/src/pg/backup/show.rs +++ b/src/pg/backup/show.rs @@ -152,14 +152,14 @@ fn print_plain(name: &str, s: &BackupSentinelDtoV2, files: Option<&FilesMetadata "start_lsn {}", s.sentinel .backup_start_lsn - .map(format_pg_lsn) + .map(|l| format_pg_lsn(l.get()).to_string()) .unwrap_or_else(|| "-".into()) ); println!( "finish_lsn {}", s.sentinel .backup_finish_lsn - .map(format_pg_lsn) + .map(|l| format_pg_lsn(l.get()).to_string()) .unwrap_or_else(|| "-".into()) ); println!("uncompressed_size {}", s.sentinel.uncompressed_size); @@ -193,6 +193,8 @@ fn print_plain(name: &str, s: &BackupSentinelDtoV2, files: Option<&FilesMetadata #[cfg(test)] mod tests { + use std::num::NonZeroU64; + use super::*; use crate::pg::backup::test_fixtures::{fs_store, put_files_metadata, put_sentinel}; use crate::pg::backup::{BackupSentinelDto, FileDescription, LATEST, TablespaceSpec}; @@ -202,8 +204,8 @@ mod tests { fn sentinel() -> BackupSentinelDtoV2 { BackupSentinelDtoV2 { sentinel: BackupSentinelDto { - backup_start_lsn: Some(0x0200_0000), - backup_finish_lsn: Some(0x0200_1000), + backup_start_lsn: NonZeroU64::new(0x0200_0000), + backup_finish_lsn: NonZeroU64::new(0x0200_1000), pg_version: 160003, system_identifier: Some(7_000_000_000_000_000_000), uncompressed_size: 2048, diff --git a/src/pg/backup/tar_streamer.rs b/src/pg/backup/tar_streamer.rs index 9b3f857..5968f24 100644 --- a/src/pg/backup/tar_streamer.rs +++ b/src/pg/backup/tar_streamer.rs @@ -14,7 +14,8 @@ //! `Archive` / `Builder`; per-part output flows over an mpsc of `Bytes` that //! the caller reads as an `AsyncRead` (see `ChannelReader`) -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{HashMap, HashSet}; +use std::num::NonZeroU64; use std::pin::Pin; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; @@ -72,6 +73,13 @@ pub struct DeltaContext { /// Paths present in the increment-base backup. Files absent here are new /// since the parent and must ship in full, not as increments pub parent_files: Arc>, + /// Parent backup's start LSN, for the page-LSN final-state filter. `Some` + /// only on the filesystem push path, which has random page access to read + /// each candidate block's on-disk page header; the WAL/summary candidate + /// set is trimmed to blocks whose page changed at/after this LSN (wal-g's + /// selectivity). `None` on the BASE_BACKUP stream path (no random access), + /// leaving the candidate set unfiltered + pub parent_start_lsn: Option, } impl Default for StreamerOpts { @@ -339,36 +347,105 @@ pub(crate) fn classify_for_delta( // Filter to blocks that actually exist in the current file. Blocks past // entry_size/BLCKSZ would underflow the wi1/native reader on apply let file_blocks = (entry_size / PG_PAGE_SIZE) as u32; - let filtered: BTreeSet = match lookup { - Some(s) => s.into_iter().filter(|b| *b < file_blocks).collect(), + let blocks_vec: Vec = match lookup { + Some(s) => s.into_iter().take_while(|b| *b < file_blocks).collect(), None => return DeltaClass::Skip, }; - if filtered.is_empty() { + increment_class_for_blocks(ctx.format, entry_size, blocks_vec) +} + +/// Encode the increment class for a final block set (already filtered to the +/// file's range, ascending). Empty → `Skip`; a header-encoding failure degrades +/// to `Passthrough` (ship full), matching `classify_for_delta`. Split out so the +/// fs push path can rebuild the class after the page-LSN filter trims blocks +pub(crate) fn increment_class_for_blocks( + format: IncrementFormat, + entry_size: u64, + blocks: Vec, +) -> DeltaClass { + if blocks.is_empty() { return DeltaClass::Skip; } - let blocks_vec: Vec = filtered.into_iter().collect(); let mut header_bytes = Vec::new(); - match ctx.format { + match format { IncrementFormat::Wi1 => { - if write_increment_header(&mut header_bytes, entry_size, &blocks_vec).is_err() { + if write_increment_header(&mut header_bytes, entry_size, &blocks).is_err() { return DeltaClass::Passthrough; } } IncrementFormat::Native => { - let trunc = file_blocks; - if write_native_increment_header(&mut header_bytes, trunc, &blocks_vec).is_err() { + let trunc = (entry_size / PG_PAGE_SIZE) as u32; + if write_native_increment_header(&mut header_bytes, trunc, &blocks).is_err() { return DeltaClass::Passthrough; } } } - let total_size = header_bytes.len() as u64 + (blocks_vec.len() as u64) * PG_PAGE_SIZE; + let total_size = header_bytes.len() as u64 + (blocks.len() as u64) * PG_PAGE_SIZE; DeltaClass::Increment { header_bytes, - blocks: blocks_vec, + blocks, total_size, } } +// ─── PG page header (page-LSN final-state filter) ─────────────────────────── + +/// Bytes of the postgres page header consulted by the page-LSN filter. `pd_lsn` +/// occupies the first 8 (`xlogid` high u32, `xrecoff` low u32, native-endian); +/// validity checks reach through `pd_pagesize_version` at offset 18 +pub(crate) const PG_PAGE_HEADER_SIZE: usize = 24; + +/// wal-g `postgres_page_header.go` constants +const PAGE_VALID_FLAGS: u16 = 7; +const PAGE_LAYOUT_VERSION: u16 = 5; + +fn page_lsn(h: &[u8]) -> u64 { + let hi = u32::from_le_bytes(h[0..4].try_into().unwrap()) as u64; + let lo = u32::from_le_bytes(h[4..8].try_into().unwrap()) as u64; + (hi << 32) | lo +} + +/// `PageIsNew`: `pd_upper == 0` (offset 14). A vacuumed/never-initialised page +fn page_is_new(h: &[u8]) -> bool { + u16::from_le_bytes(h[14..16].try_into().unwrap()) == 0 +} + +/// Mirrors wal-g `PageHeader.isValid`: flag/offset sanity plus a non-zero LSN +/// and a `BLCKSZ`-matching size/version. A page failing this is torn or not a +/// standard heap page, so its LSN can't be trusted for the filter +fn page_is_valid(h: &[u8]) -> bool { + let pd_flags = u16::from_le_bytes(h[10..12].try_into().unwrap()); + let pd_lower = u16::from_le_bytes(h[12..14].try_into().unwrap()); + let pd_upper = u16::from_le_bytes(h[14..16].try_into().unwrap()); + let pd_special = u16::from_le_bytes(h[16..18].try_into().unwrap()); + let pd_pagesize_version = u16::from_le_bytes(h[18..20].try_into().unwrap()); + (pd_flags & PAGE_VALID_FLAGS) == pd_flags + && pd_lower >= PG_PAGE_HEADER_SIZE as u16 + && pd_lower <= pd_upper + && pd_upper <= pd_special + && pd_special as u64 <= PG_PAGE_SIZE + && page_lsn(h) != 0 + && (pd_pagesize_version & 0xFF00) as u64 == PG_PAGE_SIZE + && (pd_pagesize_version & 0x00FF) <= PAGE_LAYOUT_VERSION +} + +/// Should a candidate block stay in the increment, given its on-disk page header +/// and the parent backup's start LSN? Mirrors wal-g `SelectNewValidPage`: keep a +/// new/empty page, keep an unparseable/torn page, and keep any page whose LSN is +/// at/after the parent (changed since). Drop only a valid, non-new page settled +/// strictly below the parent — that block is byte-identical to the parent's copy, +/// so the WAL-derived candidate set over-counted it. Never drops a block that +/// might have changed, so the increment stays correct +pub(crate) fn page_changed_since(header: &[u8], parent_start_lsn: u64) -> bool { + if header.len() < PG_PAGE_HEADER_SIZE { + return true; + } + if page_is_new(header) || !page_is_valid(header) { + return true; + } + page_lsn(header) >= parent_start_lsn +} + /// `AsyncRead` impl that emits a pre-encoded increment header followed by the /// subset of input pages whose block numbers appear in `blocks`. Reads the /// input strictly forward — pages before each target are read & discarded @@ -918,6 +995,44 @@ mod tests { assert!(res.files.contains_key(real_path), "{:?}", res.files); } + // ─── page-LSN final-state filter ──────────────────────────────────────── + + /// Valid 24-byte heap page header carrying `lsn` (pd_upper non-zero ⇒ not + /// new, size/version/offsets all in range so `page_is_valid` holds) + fn page_header(lsn: u64) -> [u8; PG_PAGE_HEADER_SIZE] { + let mut h = [0u8; PG_PAGE_HEADER_SIZE]; + h[0..4].copy_from_slice(&((lsn >> 32) as u32).to_le_bytes()); // pd_lsn high + h[4..8].copy_from_slice(&(lsn as u32).to_le_bytes()); // pd_lsn low + h[10..12].copy_from_slice(&0u16.to_le_bytes()); // pd_flags + h[12..14].copy_from_slice(&(PG_PAGE_HEADER_SIZE as u16).to_le_bytes()); // pd_lower + h[14..16].copy_from_slice(&(PG_PAGE_SIZE as u16).to_le_bytes()); // pd_upper + h[16..18].copy_from_slice(&(PG_PAGE_SIZE as u16).to_le_bytes()); // pd_special + h[18..20].copy_from_slice(&(0x2000u16 | 4).to_le_bytes()); // BLCKSZ | layout v4 + h + } + + #[test] + fn page_filter_keeps_changed_and_drops_settled() { + let parent = 200u64; + // settled strictly below parent → identical to parent's copy → drop + assert!(!page_changed_since(&page_header(100), parent)); + // changed at/after parent → keep + assert!(page_changed_since(&page_header(200), parent)); + assert!(page_changed_since(&page_header(300), parent)); + } + + #[test] + fn page_filter_keeps_new_invalid_and_short() { + // all-zero (vacuumed/new) page: pd_upper == 0 ⇒ kept despite lsn 0 + assert!(page_changed_since(&[0u8; PG_PAGE_HEADER_SIZE], 200)); + // non-new but invalid (bad size/version) ⇒ lsn untrustworthy ⇒ kept + let mut bad = page_header(50); + bad[18..20].copy_from_slice(&0u16.to_le_bytes()); // wipe pd_pagesize_version + assert!(page_changed_since(&bad, 200)); + // truncated header ⇒ kept + assert!(page_changed_since(&[0u8; 8], 200)); + } + // ─── delta mode ───────────────────────────────────────────────────────── /// Parent-backup file set for delta tests: the paths the increment base @@ -961,6 +1076,7 @@ mod tests { map: Arc::new(map), format: IncrementFormat::Wi1, parent_files: parent_set(&[rel_path]), + parent_start_lsn: None, }), ..Default::default() }, @@ -1014,6 +1130,7 @@ mod tests { map: Arc::new(map), format: IncrementFormat::Native, parent_files: parent_set(&[rel_path]), + parent_start_lsn: None, }), ..Default::default() }, @@ -1061,6 +1178,7 @@ mod tests { map: Arc::new(map), format: IncrementFormat::Wi1, parent_files: parent_set(&[rel_path]), + parent_start_lsn: None, }), ..Default::default() }, @@ -1103,6 +1221,7 @@ mod tests { map: Arc::new(map), format: IncrementFormat::Wi1, parent_files: parent_set(&[rel_path]), + parent_start_lsn: None, }), ..Default::default() }, @@ -1145,6 +1264,7 @@ mod tests { format: IncrementFormat::Wi1, // parent did NOT contain this file parent_files: parent_set(&["base/16384/99999"]), + parent_start_lsn: None, }), ..Default::default() }, @@ -1188,6 +1308,7 @@ mod tests { map: Arc::new(map), format: IncrementFormat::Wi1, parent_files: parent_set(&[rel_path]), + parent_start_lsn: None, }), ..Default::default() }, diff --git a/src/pg/backup/wal_delta.rs b/src/pg/backup/wal_delta.rs index 9fc17cf..99ff670 100644 --- a/src/pg/backup/wal_delta.rs +++ b/src/pg/backup/wal_delta.rs @@ -712,6 +712,7 @@ mod tests { start_lsn, end_lsn, compression::Method::None, + None, ) .await .unwrap(); @@ -723,4 +724,687 @@ mod tests { (1000 + n as u32..=1000 + tail as u32).collect(); assert_eq!(got, want, "sidecar group + tail WAL must cover every block"); } + + /// Unaligned start_lsn: the parent full begins mid-group (segment 20, in + /// group 16), so group 16's sidecar would cover pre-start segments the full + /// never archived and is never finalized. The consumer must raw-walk the + /// leading partial [20, 31], fold the complete group-32 sidecar (segs + /// 32..=47), then raw-walk the trailing segment 48 — recovering 1020..=1048. + /// Before the leading-partial fix this errored on the absent group-16 sidecar + /// and fell back to a raw walk of the (unfetchable) sidecar-only segments + #[tokio::test] + async fn unaligned_start_walks_leading_partial() { + use crate::pg::backup::delta::build_delta_map_from_wal; + use crate::storage::DynStorage; + use crate::storage::fs::FsStorage; + use std::sync::Arc; + + let seg_size = DEFAULT_WAL_SEG_SIZE; + let n = WAL_FILES_IN_DELTA; + let tmp = tempfile::tempdir().unwrap(); + let pg_wal = tmp.path().join("pg_wal"); + let bucket = tmp.path().join("bucket"); + std::fs::create_dir_all(&pg_wal).unwrap(); + std::fs::create_dir_all(&bucket).unwrap(); + let settings = test_settings(&bucket); + let storage: DynStorage = Arc::new(FsStorage::new(&bucket).unwrap()); + + let start = n + 4; // 20: group 16, position 4 + let last_complete = 2 * n; // group 32 + let tail = 3 * n; // 48: trailing partial group + + // Leading partial [start, 31] + trailing seg 48 fetchable as raw WAL. + // Group-32 segments are deliberately absent here so a fold of that + // sidecar (not a raw fetch) is the only way to recover their blocks + for g in (start..last_complete).chain(std::iter::once(tail)) { + let name = seg_name_from_global(1, g, seg_size).format(); + let bytes = one_record_segment(1000 + g as u32); + let r: compression::AsyncReader = Box::pin(std::io::Cursor::new(bytes)); + storage + .put(&format!("{WAL_FOLDER}/{name}"), r, None) + .await + .unwrap(); + } + + // Record 31..=47: seg 31 seeds group 32's prev_head, 32..=47 complete it. + // Group 16 stays incomplete, so its sidecar is never uploaded + for g in (last_complete - 1)..(last_complete + n) { + let name = seg_name_from_global(1, g, seg_size).format(); + let bytes = one_record_segment(1000 + g as u32); + let path = pg_wal.join(&name); + std::fs::write(&path, &bytes).unwrap(); + record_segment(&settings, &storage, &path, &name) + .await + .unwrap(); + } + + let none = compression::Method::None; + assert!( + !storage + .exists(&delta_storage_key(&delta_group_name(1, n, seg_size), none)) + .await + .unwrap(), + "leading group 16 sidecar must be absent" + ); + assert!( + storage + .exists(&delta_storage_key( + &delta_group_name(1, last_complete, seg_size), + none + )) + .await + .unwrap(), + "complete group 32 sidecar must exist" + ); + + let start_lsn = start * seg_size; + let end_lsn = tail * seg_size + 100; + let map = build_delta_map_from_wal(&settings, &storage, 1, start_lsn, end_lsn, none, None) + .await + .unwrap(); + let got: std::collections::BTreeSet = + map.locations().into_iter().map(|l| l.block_no).collect(); + let want: std::collections::BTreeSet = + (1000 + start as u32..=1000 + tail as u32).collect(); + assert_eq!( + got, want, + "leading raw walk + group-32 sidecar + trailing seg must cover every block" + ); + } + + /// Aligned mid-stream start: recording begins exactly on group boundary 32, + /// so group 32 fills all 16 positions yet never seeds prev_head (segment 31 + /// was never recorded) and its sidecar is never finalized. The consumer must + /// raw-walk the sidecar-less group 32, fold the present group-48 sidecar, then + /// raw-walk the trailing segment 64 — recovering 1032..=1064 — instead of + /// failing the whole range to a full reparse on the absent group-32 sidecar + #[tokio::test] + async fn aligned_start_walks_missing_first_group() { + use crate::pg::backup::delta::build_delta_map_from_wal; + use crate::storage::DynStorage; + use crate::storage::fs::FsStorage; + use std::sync::Arc; + + let seg_size = DEFAULT_WAL_SEG_SIZE; + let n = WAL_FILES_IN_DELTA; + let tmp = tempfile::tempdir().unwrap(); + let pg_wal = tmp.path().join("pg_wal"); + let bucket = tmp.path().join("bucket"); + std::fs::create_dir_all(&pg_wal).unwrap(); + std::fs::create_dir_all(&bucket).unwrap(); + let settings = test_settings(&bucket); + let storage: DynStorage = Arc::new(FsStorage::new(&bucket).unwrap()); + + let start = 2 * n; // 32: group boundary, recording starts here + let second_group = 3 * n; // 48 + let tail = 4 * n; // 64: trailing partial + + // Record 32..=63 aligned: group 32 fills every position but never seeds + // prev_head (segment 31 unrecorded) so no group-32 sidecar; segment 47 + // seeds group 48, which completes at segment 63 + for g in start..(start + 2 * n) { + let name = seg_name_from_global(1, g, seg_size).format(); + let bytes = one_record_segment(1000 + g as u32); + let path = pg_wal.join(&name); + std::fs::write(&path, &bytes).unwrap(); + record_segment(&settings, &storage, &path, &name) + .await + .unwrap(); + } + + // Raw WAL for the sidecar-less group 32 + the tail seg 64 fetchable; group + // 48's raw segments deliberately absent so only its sidecar covers them + for g in (start..second_group).chain(std::iter::once(tail)) { + let name = seg_name_from_global(1, g, seg_size).format(); + let bytes = one_record_segment(1000 + g as u32); + let r: compression::AsyncReader = Box::pin(std::io::Cursor::new(bytes)); + storage + .put(&format!("{WAL_FOLDER}/{name}"), r, None) + .await + .unwrap(); + } + + let none = compression::Method::None; + assert!( + !storage + .exists(&delta_storage_key( + &delta_group_name(1, start, seg_size), + none + )) + .await + .unwrap(), + "aligned first group 32 sidecar must be absent" + ); + assert!( + storage + .exists(&delta_storage_key( + &delta_group_name(1, second_group, seg_size), + none + )) + .await + .unwrap(), + "group 48 sidecar must exist" + ); + + let start_lsn = start * seg_size; + let end_lsn = tail * seg_size + 100; + let map = build_delta_map_from_wal(&settings, &storage, 1, start_lsn, end_lsn, none, None) + .await + .unwrap(); + let got: std::collections::BTreeSet = + map.locations().into_iter().map(|l| l.block_no).collect(); + let want: std::collections::BTreeSet = + (1000 + start as u32..=1000 + tail as u32).collect(); + assert_eq!( + got, want, + "raw-walked group 32 + group-48 sidecar + tail must cover every block" + ); + } + + /// Multi-segment raw-WAL fallback: no sidecars, three fetchable segments in + /// group 0 so `build_delta_map_from_wal` bails to the full walk. Exercises + /// the fetch-vs-parse prefetch pipeline (segment N+1 fetched while N parses) + /// and proves the changed-block set is the union across all segments + #[tokio::test] + async fn full_walk_pipelines_multiple_segments() { + use crate::pg::backup::delta::build_delta_map_from_wal; + use crate::storage::DynStorage; + use crate::storage::fs::FsStorage; + use std::sync::Arc; + + let seg_size = DEFAULT_WAL_SEG_SIZE; + let tmp = tempfile::tempdir().unwrap(); + let bucket = tmp.path().join("bucket"); + std::fs::create_dir_all(&bucket).unwrap(); + let settings = test_settings(&bucket); + let storage: DynStorage = Arc::new(FsStorage::new(&bucket).unwrap()); + + // Segments 1..=3 (group 0), each touching a distinct block, all fetchable + for g in 1u64..=3 { + let name = seg_name_from_global(1, g, seg_size).format(); + let bytes = one_record_segment(2000 + g as u32); + let r: compression::AsyncReader = Box::pin(std::io::Cursor::new(bytes)); + storage + .put(&format!("{WAL_FOLDER}/{name}"), r, None) + .await + .unwrap(); + } + + let start_lsn = seg_size; // segment 1 + let end_lsn = 3 * seg_size + 100; // inside segment 3 + let map = build_delta_map_from_wal( + &settings, + &storage, + 1, + start_lsn, + end_lsn, + compression::Method::None, + None, + ) + .await + .unwrap(); + let got: std::collections::BTreeSet = + map.locations().into_iter().map(|l| l.block_no).collect(); + let want: std::collections::BTreeSet = (2001..=2003).collect(); + assert_eq!( + got, want, + "full walk must union every segment's changed blocks" + ); + } + + /// A missing segment in the required range is a hard error, never a silent + /// skip: dropping segment 2 would omit its changed pages from the increment + /// and restore stale parent data. Segments 1 and 3 present, 2 missing + #[tokio::test] + async fn full_walk_errors_on_missing_segment() { + use crate::pg::backup::delta::build_delta_map_from_wal; + use crate::storage::DynStorage; + use crate::storage::fs::FsStorage; + use std::sync::Arc; + + let seg_size = DEFAULT_WAL_SEG_SIZE; + let tmp = tempfile::tempdir().unwrap(); + let bucket = tmp.path().join("bucket"); + std::fs::create_dir_all(&bucket).unwrap(); + let settings = test_settings(&bucket); + let storage: DynStorage = Arc::new(FsStorage::new(&bucket).unwrap()); + + for g in [1u64, 3] { + let name = seg_name_from_global(1, g, seg_size).format(); + let bytes = one_record_segment(2000 + g as u32); + let r: compression::AsyncReader = Box::pin(std::io::Cursor::new(bytes)); + storage + .put(&format!("{WAL_FOLDER}/{name}"), r, None) + .await + .unwrap(); + } + + build_delta_map_from_wal( + &settings, + &storage, + 1, + seg_size, + 3 * seg_size + 100, + compression::Method::None, + None, + ) + .await + .expect_err("missing required segment must error, not skip"); + } + + /// `wal_dir` makes the walk read raw segments from local `pg_wal` and fall + /// back to the archive only for what is absent locally. Segment 1 lives only + /// on disk, segments 2 and 3 only in the bucket: recovering 3001, 3002 and + /// 3003 proves local-first read with archive fallback for the rest + #[tokio::test] + async fn full_walk_prefers_local_pg_wal() { + use crate::pg::backup::delta::build_delta_map_from_wal; + use crate::storage::DynStorage; + use crate::storage::fs::FsStorage; + use std::sync::Arc; + + let seg_size = DEFAULT_WAL_SEG_SIZE; + let tmp = tempfile::tempdir().unwrap(); + let pg_wal = tmp.path().join("pg_wal"); + let bucket = tmp.path().join("bucket"); + std::fs::create_dir_all(&pg_wal).unwrap(); + std::fs::create_dir_all(&bucket).unwrap(); + let settings = test_settings(&bucket); + let storage: DynStorage = Arc::new(FsStorage::new(&bucket).unwrap()); + + // segment 1: local pg_wal only (uncompressed, raw segment name) + let name1 = seg_name_from_global(1, 1, seg_size).format(); + std::fs::write(pg_wal.join(&name1), one_record_segment(3001)).unwrap(); + // segments 2,3: archive only — exercise the NotFound → archive fallback + for g in [2u64, 3] { + let name = seg_name_from_global(1, g, seg_size).format(); + let r: compression::AsyncReader = + Box::pin(std::io::Cursor::new(one_record_segment(3000 + g as u32))); + storage + .put(&format!("{WAL_FOLDER}/{name}"), r, None) + .await + .unwrap(); + } + + let map = build_delta_map_from_wal( + &settings, + &storage, + 1, + seg_size, + 3 * seg_size + 100, + compression::Method::None, + Some(&pg_wal), + ) + .await + .unwrap(); + let got: std::collections::BTreeSet = + map.locations().into_iter().map(|l| l.block_no).collect(); + let want: std::collections::BTreeSet = [3001, 3002, 3003].into_iter().collect(); + assert_eq!( + got, want, + "local segment + archive-fallback segments recovered" + ); + } + + /// Build a `total`-byte heap record referencing `base/200/300` block 7 + /// (24 header + 20 block-0 header + 5 LONG main-data marker + main data) + fn boundary_block_record(total: usize) -> Vec { + use crate::pg::walparser::{RmId, X_LOG_RECORD_HEADER_SIZE, XLR_BLOCK_ID_DATA_LONG}; + let main_len = total - X_LOG_RECORD_HEADER_SIZE - 20 - 5; + let mut r = Vec::new(); + r.extend_from_slice(&(total as u32).to_le_bytes()); + r.extend_from_slice(&0u32.to_le_bytes()); // xact + r.extend_from_slice(&0u64.to_le_bytes()); // prev + r.push(0u8); // info + r.push(RmId::Heap as u8); + r.push(0); + r.push(0); + r.extend_from_slice(&0u32.to_le_bytes()); // crc + r.push(0u8); // block id 0 + r.push(0u8); // fork_flags: no image, no data + r.extend_from_slice(&0u16.to_le_bytes()); // data_length + r.extend_from_slice(&1663u32.to_le_bytes()); // spc + r.extend_from_slice(&200u32.to_le_bytes()); // db + r.extend_from_slice(&300u32.to_le_bytes()); // rel + r.extend_from_slice(&7u32.to_le_bytes()); // block_no + r.push(XLR_BLOCK_ID_DATA_LONG); + r.extend_from_slice(&(main_len as u32).to_le_bytes()); + r.extend_from_slice(&vec![0x5Au8; main_len]); + assert_eq!(r.len(), total); + r + } + + /// Long-header page (36 B header + 4 B align) holding `body` bytes, zero-padded + fn long_header_page(body: &[u8]) -> Vec { + use crate::pg::walparser::XLP_LONG_HEADER; + let mut page = Vec::with_capacity(WAL_PAGE_SIZE as usize); + page.extend_from_slice(&XLP_PAGE_MAGIC_PG14.to_le_bytes()); + page.extend_from_slice(&XLP_LONG_HEADER.to_le_bytes()); + page.extend_from_slice(&1u32.to_le_bytes()); // timeline + page.extend_from_slice(&0u64.to_le_bytes()); // page_address + page.extend_from_slice(&0u32.to_le_bytes()); // remaining_data_len (no continuation) + page.extend_from_slice(&12345u64.to_le_bytes()); // sysid + page.extend_from_slice(&(16u32 * 1024 * 1024).to_le_bytes()); // seg_size + page.extend_from_slice(&8192u32.to_le_bytes()); // xlog_block_size + page.extend_from_slice(&[0u8; 4]); // align 36 → 40 + page.extend_from_slice(body); + page.resize(WAL_PAGE_SIZE as usize, 0); + page + } + + /// Short-header continuation page (20 B header + 4 B align) carrying `rem` + /// bytes of remaining-data length and `body` bytes, zero-padded + fn cont_header_page(rem: u32, body: &[u8]) -> Vec { + use crate::pg::walparser::XLP_FIRST_IS_CONT_RECORD; + let mut page = Vec::with_capacity(WAL_PAGE_SIZE as usize); + page.extend_from_slice(&XLP_PAGE_MAGIC_PG14.to_le_bytes()); + page.extend_from_slice(&XLP_FIRST_IS_CONT_RECORD.to_le_bytes()); + page.extend_from_slice(&1u32.to_le_bytes()); // timeline + page.extend_from_slice(&(WAL_PAGE_SIZE as u64).to_le_bytes()); // page_address + page.extend_from_slice(&rem.to_le_bytes()); + page.extend_from_slice(&[0u8; 4]); // align 20 → 24 + page.extend_from_slice(body); + page.resize(WAL_PAGE_SIZE as usize, 0); + page + } + + async fn put_segment(storage: &DynStorage, seg: u64, bytes: Vec) { + let name = seg_name_from_global(1, seg, wal_segment_size()).format(); + let r: compression::AsyncReader = Box::pin(std::io::Cursor::new(bytes)); + storage + .put(&format!("{WAL_FOLDER}/{name}"), r, None) + .await + .unwrap(); + } + + /// End-to-end parallel full walk over a record whose body spans the seg 1 / + /// seg 2 boundary: seg 1 holds the head, seg 2 the tail. The boundary stitch + /// must recover block 7, which neither segment's in-segment parse sees alone + #[tokio::test] + async fn full_walk_stitches_boundary_record() { + use crate::pg::backup::delta::build_delta_map_from_wal; + use crate::storage::DynStorage; + use crate::storage::fs::FsStorage; + use std::sync::Arc; + + let seg_size = DEFAULT_WAL_SEG_SIZE; + let tmp = tempfile::tempdir().unwrap(); + let bucket = tmp.path().join("bucket"); + std::fs::create_dir_all(&bucket).unwrap(); + let settings = test_settings(&bucket); + let storage: DynStorage = Arc::new(FsStorage::new(&bucket).unwrap()); + + let total = 9049; + let split = 8152; // bytes on seg 1 after long header + align + let record = boundary_block_record(total); + put_segment(&storage, 1, long_header_page(&record[..split])).await; + put_segment( + &storage, + 2, + cont_header_page((total - split) as u32, &record[split..]), + ) + .await; + + let map = build_delta_map_from_wal( + &settings, + &storage, + 1, + seg_size, + 2 * seg_size + 100, + compression::Method::None, + None, + ) + .await + .unwrap(); + let got: Vec = map.locations().into_iter().map(|l| l.block_no).collect(); + assert_eq!(got, vec![7], "boundary record's block recovered via stitch"); + } + + /// A record longer than a segment (head in seg 1, all of seg 2 its middle, + /// tail in seg 3) can't be reconstructed pairwise; the parallel walk detects + /// the fully-middle seg 2 and falls back to the serial threaded walk, which + /// still recovers block 7 + #[tokio::test] + async fn full_walk_falls_back_on_multi_segment_record() { + use crate::pg::backup::delta::build_delta_map_from_wal; + use crate::storage::DynStorage; + use crate::storage::fs::FsStorage; + use std::sync::Arc; + + let seg_size = DEFAULT_WAL_SEG_SIZE; + let tmp = tempfile::tempdir().unwrap(); + let bucket = tmp.path().join("bucket"); + std::fs::create_dir_all(&bucket).unwrap(); + let settings = test_settings(&bucket); + let storage: DynStorage = Arc::new(FsStorage::new(&bucket).unwrap()); + + let head = 8152; // bytes on seg 1 (long header + align) + let mid = 8168; // bytes on seg 2 (short header + align), fully inside record + let tail = 500; + let total = head + mid + tail; + let record = boundary_block_record(total); + put_segment(&storage, 1, long_header_page(&record[..head])).await; + put_segment( + &storage, + 2, + cont_header_page((total - head) as u32, &record[head..head + mid]), + ) + .await; + put_segment( + &storage, + 3, + cont_header_page((total - head - mid) as u32, &record[head + mid..]), + ) + .await; + + let map = build_delta_map_from_wal( + &settings, + &storage, + 1, + seg_size, + 3 * seg_size + 100, + compression::Method::None, + None, + ) + .await + .unwrap(); + let got: Vec = map.locations().into_iter().map(|l| l.block_no).collect(); + assert_eq!(got, vec![7], "serial fallback recovers the oversize record"); + } + + /// A record straddling the seg 1 / seg 2 boundary leaves seg 1 with a + /// trailing head only seg 2 can complete. With seg 2 absent the parallel + /// walk must error, never return a map missing the stitched block — a + /// partial increment would restore stale parent data for that page + #[tokio::test] + async fn parallel_parse_missing_boundary_neighbor_errors() { + use crate::pg::backup::delta::build_delta_map_from_wal; + use crate::storage::DynStorage; + use crate::storage::fs::FsStorage; + use std::sync::Arc; + + let seg_size = DEFAULT_WAL_SEG_SIZE; + let tmp = tempfile::tempdir().unwrap(); + let bucket = tmp.path().join("bucket"); + std::fs::create_dir_all(&bucket).unwrap(); + let settings = test_settings(&bucket); + let storage: DynStorage = Arc::new(FsStorage::new(&bucket).unwrap()); + + // seg 1 holds only the head of a boundary-spanning record (trailing head + // non-empty); seg 2, carrying the tail, is never uploaded + let total = 9049; + let split = 8152; // bytes on seg 1 after long header + align + let record = boundary_block_record(total); + put_segment(&storage, 1, long_header_page(&record[..split])).await; + + build_delta_map_from_wal( + &settings, + &storage, + 1, + seg_size, + 2 * seg_size + 100, + compression::Method::None, + None, + ) + .await + .expect_err("absent boundary neighbor must error, not yield a partial map"); + } + + /// A complete group's sidecar truncated mid-stream (no terminator, eg an + /// interrupted upload) must never fold as a partial map: the consumer errors + /// the sidecar path and re-walks the group's raw WAL, recovering exactly the + /// real blocks. The truncated sidecar's stray tuple must not leak through + #[tokio::test] + async fn truncated_sidecar_falls_back_to_raw_walk() { + use crate::pg::backup::delta::build_delta_map_from_wal; + use crate::storage::DynStorage; + use crate::storage::fs::FsStorage; + use std::sync::Arc; + + let seg_size = DEFAULT_WAL_SEG_SIZE; + let n = WAL_FILES_IN_DELTA; + let tmp = tempfile::tempdir().unwrap(); + let bucket = tmp.path().join("bucket"); + std::fs::create_dir_all(&bucket).unwrap(); + let settings = test_settings(&bucket); + let storage: DynStorage = Arc::new(FsStorage::new(&bucket).unwrap()); + let none = compression::Method::None; + + // group 16's 16 raw segments fetchable for the fallback walk + for g in n..(2 * n) { + put_segment(&storage, g, one_record_segment(1000 + g as u32)).await; + } + + // Truncated group-16 sidecar: a lone tuple (block 9999), no terminator + // or parser state — mimics a finalize cut short + let group16 = delta_group_name(1, n, seg_size); + let mut raw = Vec::new(); + write_location_tuples(&mut raw, &[BlockLocation::new(1663, 16384, 16385, 9999)]).unwrap(); + let key = delta_storage_key(&group16, none); + let len = raw.len() as u64; + let r: compression::AsyncReader = Box::pin(std::io::Cursor::new(raw)); + storage.put(&key, r, Some(len)).await.unwrap(); + + let start_lsn = n * seg_size; // seg 16, group-aligned + let end_lsn = 2 * n * seg_size; // seg 32 exclusive → no trailing group + let map = build_delta_map_from_wal(&settings, &storage, 1, start_lsn, end_lsn, none, None) + .await + .unwrap(); + let got: std::collections::BTreeSet = + map.locations().into_iter().map(|l| l.block_no).collect(); + // raw walk of segs 16..=31 → blocks 1016..=1031; the truncated sidecar's + // 9999 is discarded, not folded + let want: std::collections::BTreeSet = (1016..=1031).collect(); + assert_eq!(got, want, "fallback raw walk recovers real blocks only"); + assert!( + !got.contains(&9999), + "truncated sidecar tuple must not leak" + ); + } + + /// Companion to [`aligned_start_walks_missing_first_group`]: when the + /// sidecar-less aligned first group also has no fetchable raw WAL, the build + /// must error rather than silently drop that group's changed blocks. Records + /// segs 32..=63 (group 48 finalizes, group 32 never does) but uploads no raw + /// segments, so group 32 is recoverable by neither sidecar nor raw walk + #[tokio::test] + async fn aligned_first_group_missing_sidecar_and_raw_errors() { + use crate::pg::backup::delta::build_delta_map_from_wal; + use crate::storage::DynStorage; + use crate::storage::fs::FsStorage; + use std::sync::Arc; + + let seg_size = DEFAULT_WAL_SEG_SIZE; + let n = WAL_FILES_IN_DELTA; + let tmp = tempfile::tempdir().unwrap(); + let pg_wal = tmp.path().join("pg_wal"); + let bucket = tmp.path().join("bucket"); + std::fs::create_dir_all(&pg_wal).unwrap(); + std::fs::create_dir_all(&bucket).unwrap(); + let settings = test_settings(&bucket); + let storage: DynStorage = Arc::new(FsStorage::new(&bucket).unwrap()); + + let start = 2 * n; // 32: group boundary, recording starts here + let second_group = 3 * n; // 48 + + // Record 32..=63 aligned: group 32 fills every position but never seeds + // prev_head (segment 31 unrecorded) so no group-32 sidecar; group 48 + // completes. Recording only writes local scratch + the finalized + // sidecar — no raw segments reach the bucket + for g in start..(start + 2 * n) { + let name = seg_name_from_global(1, g, seg_size).format(); + let bytes = one_record_segment(1000 + g as u32); + let path = pg_wal.join(&name); + std::fs::write(&path, &bytes).unwrap(); + record_segment(&settings, &storage, &path, &name) + .await + .unwrap(); + } + + let none = compression::Method::None; + assert!( + !storage + .exists(&delta_storage_key( + &delta_group_name(1, start, seg_size), + none + )) + .await + .unwrap(), + "aligned first group 32 sidecar must be absent" + ); + assert!( + storage + .exists(&delta_storage_key( + &delta_group_name(1, second_group, seg_size), + none + )) + .await + .unwrap(), + "group 48 sidecar must exist" + ); + + let start_lsn = start * seg_size; + let end_lsn = 4 * n * seg_size + 100; // through seg 64 + build_delta_map_from_wal(&settings, &storage, 1, start_lsn, end_lsn, none, None) + .await + .expect_err("sidecar-less group with no raw WAL must error, not drop blocks"); + } + + /// Corrupt sidecar with no fallback WAL. A truncated group-16 sidecar forces + /// the raw-WAL fallback (see [`truncated_sidecar_falls_back_to_raw_walk`] for + /// the success case), but group 16's raw segments are absent from the archive + /// too, so the build must error rather than fold the corrupt sidecar's partial + /// map — a partial increment would restore stale parent data + #[tokio::test] + async fn corrupt_sidecar_without_raw_wal_errors() { + use crate::pg::backup::delta::build_delta_map_from_wal; + use crate::storage::DynStorage; + use crate::storage::fs::FsStorage; + use std::sync::Arc; + + let seg_size = DEFAULT_WAL_SEG_SIZE; + let n = WAL_FILES_IN_DELTA; + let tmp = tempfile::tempdir().unwrap(); + let bucket = tmp.path().join("bucket"); + std::fs::create_dir_all(&bucket).unwrap(); + let settings = test_settings(&bucket); + let storage: DynStorage = Arc::new(FsStorage::new(&bucket).unwrap()); + let none = compression::Method::None; + + // Truncated group-16 sidecar: lone tuple, no terminator. No raw segments + // uploaded, so the fallback walk has nothing to read + let group16 = delta_group_name(1, n, seg_size); + let mut raw = Vec::new(); + write_location_tuples(&mut raw, &[BlockLocation::new(1663, 16384, 16385, 9999)]).unwrap(); + let key = delta_storage_key(&group16, none); + let len = raw.len() as u64; + let r: compression::AsyncReader = Box::pin(std::io::Cursor::new(raw)); + storage.put(&key, r, Some(len)).await.unwrap(); + + let start_lsn = n * seg_size; // seg 16, group-aligned + let end_lsn = 2 * n * seg_size; // seg 32 exclusive → only group 16 + build_delta_map_from_wal(&settings, &storage, 1, start_lsn, end_lsn, none, None) + .await + .expect_err("corrupt sidecar with no raw WAL must error, not fold a partial map"); + } } diff --git a/src/pg/replication/server.rs b/src/pg/replication/server.rs index 4b51dea..cbbf1f1 100644 --- a/src/pg/replication/server.rs +++ b/src/pg/replication/server.rs @@ -387,7 +387,7 @@ fn encode_identify_system(tx: &mut BytesMut, identity: &Identity) { tx[row_desc_len_pos..row_desc_len_pos + 4].copy_from_slice(&payload_len.to_be_bytes()); // DataRow with the 4 column values. - let xlogpos_str = format_pg_lsn(identity.xlogpos); + let xlogpos_str = format_pg_lsn(identity.xlogpos).to_string(); let columns: [Option<&str>; 4] = [ Some(identity.system_id.as_str()), None, // timeline rendered below (needs a String) diff --git a/src/pg/wal/receive.rs b/src/pg/wal/receive.rs index 956a22c..47e2a7c 100644 --- a/src/pg/wal/receive.rs +++ b/src/pg/wal/receive.rs @@ -30,6 +30,7 @@ //! `archive_command`-driven pushes. use std::collections::BTreeSet; +use std::num::NonZeroU64; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -417,7 +418,7 @@ pub async fn handle(settings: &Settings, storage: DynStorage, archive_dir: &Path // recycle WAL below this, so a gap is possible if we fall behind, traded for // not pinning primary WAL let start_lsn = match slot_name.as_deref() { - Some(_) if slot.exists => slot.restart_lsn.unwrap_or(xlogpos), + Some(_) if slot.exists => slot.restart_lsn.map_or(xlogpos, |l| l.get()), Some(name) => { conn.create_physical_replication_slot(name).await?; tracing::info!(target = "wal_receive", "created replication slot {name}"); @@ -569,7 +570,7 @@ fn validate_slot_name(name: &str) -> Result<()> { /// Physical replication slot state read from `pg_replication_slots` struct SlotInfo { exists: bool, - restart_lsn: Option, + restart_lsn: Option, } /// `wal_segment_size` from `pg_settings`. PG 10 and below report it in 8 KiB @@ -612,7 +613,8 @@ async fn query_slot_info(q: &mut ReplicationConn, slot_name: &str) -> Result, staging copy removed + let archived = store.join(crate::pg::WAL_FOLDER).join("00000002.history"); + assert_eq!(std::fs::read(&archived).unwrap(), b"1\t0/3000000\t\n"); + assert!(!dir.path().join("00000002.history").exists()); + } + #[tokio::test] async fn timeline_switch_uploads_partial_and_re_anchors() { let dir = tempfile::tempdir().unwrap(); diff --git a/src/pg/wal/show.rs b/src/pg/wal/show.rs index 8ddefa6..2536655 100644 --- a/src/pg/wal/show.rs +++ b/src/pg/wal/show.rs @@ -6,6 +6,7 @@ //! parse identically use std::collections::BTreeMap; +use std::num::NonZeroU64; use anyhow::{Context, Result}; use futures::StreamExt; @@ -53,8 +54,8 @@ pub struct GapInfo { #[derive(Debug, Clone, Serialize)] pub struct BackupRef { pub name: String, - pub start_lsn: Option, - pub finish_lsn: Option, + pub start_lsn: Option, + pub finish_lsn: Option, } pub async fn handle(storage: DynStorage, format: Format) -> Result<()> { @@ -190,10 +191,13 @@ fn print_plain(timelines: &[TimelineInfo]) { println!(" gap: {} -> {} (missing {})", g.from, g.to, g.missing); } for b in &t.backups { - let start = b.start_lsn.map(format_pg_lsn).unwrap_or_else(|| "-".into()); + let start = b + .start_lsn + .map(|l| format_pg_lsn(l.get()).to_string()) + .unwrap_or_else(|| "-".into()); let finish = b .finish_lsn - .map(format_pg_lsn) + .map(|l| format_pg_lsn(l.get()).to_string()) .unwrap_or_else(|| "-".into()); println!(" backup: {} start={} finish={}", b.name, start, finish); } @@ -375,4 +379,89 @@ mod tests { assert_eq!(t2.gaps[0].missing, 1); assert_eq!(t2.status, TimelineStatus::Lost); } + + // print_plain has no return value; exercising it confirms the range / gap / + // backup formatting (incl. both the present-LSN and the missing-LSN "-" + // arms) runs without panicking + #[test] + fn print_plain_renders_range_gaps_and_backups() { + let timelines = vec![TimelineInfo { + timeline: 1, + start_segment: Some("000000010000000000000001".into()), + end_segment: Some("000000010000000000000005".into()), + segments_count: 3, + gaps: vec![GapInfo { + from: "000000010000000000000002".into(), + to: "000000010000000000000004".into(), + missing: 1, + }], + backups: vec![ + BackupRef { + name: "base_000000010000000000000001".into(), + start_lsn: NonZeroU64::new(0x0100_0000), + finish_lsn: NonZeroU64::new(0x0100_1000), + }, + // both LSNs absent -> the "-" fallback arms + BackupRef { + name: "base_000000010000000000000003".into(), + start_lsn: None, + finish_lsn: None, + }, + ], + status: TimelineStatus::Lost, + }]; + print_plain(&timelines); + } + + #[tokio::test] + async fn gaps_by_timeline_returns_only_lossy_timelines() { + let dir = tempfile::tempdir().unwrap(); + let store: DynStorage = Arc::new(FsStorage::new(dir.path()).unwrap()); + // tli 1 contiguous (no gap), tli 2 has a hole at seg 6 + for k in [ + "wal_005/000000010000000000000001", + "wal_005/000000010000000000000002", + "wal_005/000000020000000000000005", + "wal_005/000000020000000000000007", + ] { + store.put(k, empty_body(), None).await.unwrap(); + } + let gaps = gaps_by_timeline(store).await.unwrap(); + assert_eq!(gaps.len(), 1, "only the lossy timeline is reported"); + let g = gaps.get(&2).unwrap(); + assert_eq!(g.len(), 1); + assert_eq!(g[0].missing, 1); + } + + #[tokio::test] + async fn integrity_for_backup_absent_timeline_reports_na() { + let dir = tempfile::tempdir().unwrap(); + let store: DynStorage = Arc::new(FsStorage::new(dir.path()).unwrap()); + store + .put("wal_005/000000010000000000000001", empty_body(), None) + .await + .unwrap(); + // timeline 9 has no archived segments at all + let gaps = integrity_for_backup(store, 0, 9).await.unwrap(); + assert_eq!(gaps.len(), 1); + assert_eq!(gaps[0].from, "n/a"); + assert_eq!(gaps[0].missing, 0); + } + + // a sentinel whose name carries a non-hex timeline must be skipped by + // collect rather than producing a bogus timeline (the `continue` arm) + #[tokio::test] + async fn collect_skips_backup_with_unparseable_timeline() { + use crate::pg::backup::BackupSentinelDtoV2; + use crate::pg::backup::test_fixtures::{fs_store, put_sentinel}; + + let dir = tempfile::tempdir().unwrap(); + let store = fs_store(dir.path()); + put_sentinel(&store, "base_ZZZZZZZZ0000", &BackupSentinelDtoV2::default()).await; + let tlis = collect(store).await.unwrap(); + assert!( + tlis.is_empty(), + "unparseable backup name yields no timeline" + ); + } } diff --git a/src/pg/wal/verify.rs b/src/pg/wal/verify.rs index 360eada..2d12d06 100644 --- a/src/pg/wal/verify.rs +++ b/src/pg/wal/verify.rs @@ -9,6 +9,8 @@ //! Mirrors wal-g's `wal-verify` modes; output is intentionally machine- //! readable so it can drive an exit-non-zero check +use std::num::NonZeroU64; + use anyhow::{Context, Result, anyhow}; use serde::Serialize; @@ -23,7 +25,7 @@ pub struct IntegrityReport { pub status: ReportStatus, pub backup_name: Option, pub timeline: u32, - pub start_lsn: Option, + pub start_lsn: Option, pub gaps: Vec, } @@ -121,7 +123,7 @@ pub async fn check_integrity(storage: DynStorage) -> Result { gaps: Vec::new(), }); }; - let gaps = show::integrity_for_backup(storage, start, timeline).await?; + let gaps = show::integrity_for_backup(storage, start.get(), timeline).await?; let status = if gaps.is_empty() { ReportStatus::Ok } else { @@ -163,7 +165,9 @@ fn print_integrity(r: &IntegrityReport) { println!( " backup: {name} timeline={} start_lsn={}", r.timeline, - r.start_lsn.map(format_pg_lsn).unwrap_or_else(|| "-".into()) + r.start_lsn + .map(|l| format_pg_lsn(l.get()).to_string()) + .unwrap_or_else(|| "-".into()) ); } for g in &r.gaps { @@ -190,8 +194,8 @@ mod tests { fn sentinel(seg_no: u64) -> BackupSentinelDtoV2 { BackupSentinelDtoV2 { sentinel: BackupSentinelDto { - backup_start_lsn: Some(lsn_for_seg(seg_no)), - backup_finish_lsn: Some(lsn_for_seg(seg_no)), + backup_start_lsn: NonZeroU64::new(lsn_for_seg(seg_no)), + backup_finish_lsn: NonZeroU64::new(lsn_for_seg(seg_no)), pg_version: 160003, ..Default::default() }, @@ -210,7 +214,7 @@ mod tests { let r = check_integrity(s).await.unwrap(); assert_eq!(r.status, ReportStatus::Ok); assert_eq!(r.timeline, 1); - assert_eq!(r.start_lsn, Some(lsn_for_seg(2))); + assert_eq!(r.start_lsn, NonZeroU64::new(lsn_for_seg(2))); assert!(r.gaps.is_empty()); } diff --git a/src/pg/wal_summaries.rs b/src/pg/wal_summaries.rs index c073bb6..c0c74b6 100644 --- a/src/pg/wal_summaries.rs +++ b/src/pg/wal_summaries.rs @@ -29,6 +29,7 @@ use roaring::RoaringBitmap; use thiserror::Error; use crate::pg::backup::delta::PagedFileDeltaMap; +use crate::pg::backup::format_pg_lsn; use crate::pg::walparser::RelFileNode; pub const SUMMARIES_DIR: &str = "pg_wal/summaries"; @@ -50,19 +51,16 @@ pub enum SummaryError { BadMagic { expected: u32, got: u32 }, #[error("CRC mismatch: expected {expected:08X}, got {got:08X}")] BadCrc { expected: u32, got: u32 }, - #[error("empty LSN range [{start:X}, {end:X})")] + #[error("empty LSN range [{}, {})", format_pg_lsn(*start), format_pg_lsn(*end))] EmptyRange { start: u64, end: u64 }, #[error( - "no WAL summaries cover [{start:X}, {end:X}) on timeline {timeline} \ - (enable summarize_wal and retain summaries for the full range)" + "no WAL summaries cover [{}, {}) on timeline {timeline} \ + (enable summarize_wal and retain summaries for the full range)", + format_pg_lsn(*start), format_pg_lsn(*end) )] NoSummariesForRange { start: u64, end: u64, timeline: u32 }, - #[error("WAL summary gap at start: first summary begins at {first:X}, need {need:X}")] - GapAtStart { first: u64, need: u64 }, - #[error("WAL summary gap between {a_end:X} and {b_start:X}")] + #[error("WAL summary gap between {} and {}", format_pg_lsn(*a_end), format_pg_lsn(*b_start))] GapInside { a_end: u64, b_start: u64 }, - #[error("WAL summary gap at end: last summary ends at {last:X}, need {need:X}")] - GapAtEnd { last: u64, need: u64 }, } /// One on-disk summary file, decoded from its filename. @@ -78,16 +76,19 @@ pub struct SummaryFile { /// Top-level: walk `$pgdata/pg_wal/summaries`, pick the files covering /// `[first_used_lsn, first_not_used_lsn)` on `timeline`, verify contiguous /// coverage, parse them chronologically, return main-fork blocks aggregated -/// into a `PagedFileDeltaMap` +/// into a `PagedFileDeltaMap` plus the `[covered_start, covered_end)` LSN span +/// the summaries actually span (may fall short of the request at either end; +/// see `select_for_range`) pub fn read_for_range( pg_data_dir: &Path, timeline: u32, first_used_lsn: u64, first_not_used_lsn: u64, -) -> Result { +) -> Result<(PagedFileDeltaMap, u64, u64), SummaryError> { let dir = pg_data_dir.join(SUMMARIES_DIR); let files = list_summary_files(&dir)?; - let selected = select_for_range(&files, timeline, first_used_lsn, first_not_used_lsn)?; + let (selected, covered_start, covered_end) = + select_for_range(&files, timeline, first_used_lsn, first_not_used_lsn)?; let mut state: BTreeMap = BTreeMap::new(); for f in &selected { tracing::info!( @@ -109,7 +110,7 @@ pub fn read_for_range( }); } } - Ok(delta) + Ok((delta, covered_start, covered_end)) } #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] @@ -177,16 +178,20 @@ pub fn parse_summary_filename(name: &str) -> Option { }) } -/// Pick the subset of `files` overlapping `[first_used, first_not_used)` -/// on the given timeline, then assert contiguous coverage. Errors out on -/// any gap (covers wal-g semantics: a delta with missing WAL summaries -/// must NOT silently produce an incorrect delta) +/// Pick the subset of `files` overlapping `[first_used, first_not_used)` on the +/// given timeline (sorted), with the LSN span `[covered_start, covered_end)` +/// they actually cover. Leading and trailing gaps are soft: retention drops the +/// oldest summaries (head gap) and the summarizer trails the backup LSN (tail +/// gap), so the caller raw-walks `[first_used, covered_start)` and +/// `[covered_end, first_not_used)` and unions them in. A gap *between* summaries +/// stays fatal — not normal lag/retention, so the WAL it'd need may be gone; +/// erroring drops the push to a full backup rather than a silently-wrong delta pub fn select_for_range( files: &[SummaryFile], timeline: u32, first_used_lsn: u64, first_not_used_lsn: u64, -) -> Result, SummaryError> { +) -> Result<(Vec, u64, u64), SummaryError> { if first_not_used_lsn <= first_used_lsn { return Err(SummaryError::EmptyRange { start: first_used_lsn, @@ -207,12 +212,6 @@ pub fn select_for_range( timeline, }); } - if kept[0].start_lsn > first_used_lsn { - return Err(SummaryError::GapAtStart { - first: kept[0].start_lsn, - need: first_used_lsn, - }); - } for w in kept.windows(2) { if w[1].start_lsn > w[0].end_lsn { return Err(SummaryError::GapInside { @@ -221,14 +220,9 @@ pub fn select_for_range( }); } } - let last = kept.last().unwrap(); - if last.end_lsn < first_not_used_lsn { - return Err(SummaryError::GapAtEnd { - last: last.end_lsn, - need: first_not_used_lsn, - }); - } - Ok(kept) + let covered_start = kept[0].start_lsn; + let covered_end = kept.last().unwrap().end_lsn; + Ok((kept, covered_start, covered_end)) } /// Stream one summary file, fold its entries into `state`. Reads through a @@ -528,10 +522,12 @@ mod tests { end_lsn: 0x500, }, ]; - let got = select_for_range(&files, 1, 0x150, 0x350).unwrap(); + let (got, covered_start, covered_end) = select_for_range(&files, 1, 0x150, 0x350).unwrap(); assert_eq!(got.len(), 3); assert_eq!(got[0].start_lsn, 0x100); assert_eq!(got[2].end_lsn, 0x400); + // span brackets the request: caller walks no gap + assert_eq!((covered_start, covered_end), (0x100, 0x400)); } #[test] @@ -558,15 +554,33 @@ mod tests { } #[test] - fn select_for_range_tail_missing() { + fn select_for_range_tail_gap_soft() { + // Summarizer trails the backup LSN: coverage stops at 0x200, caller walks + // [0x200, 0x300) let files = vec![SummaryFile { path: PathBuf::new(), timeline: 1, start_lsn: 0x100, end_lsn: 0x200, }]; - let err = select_for_range(&files, 1, 0x150, 0x300).unwrap_err(); - assert!(matches!(err, SummaryError::GapAtEnd { .. }), "{err:?}"); + let (got, covered_start, covered_end) = select_for_range(&files, 1, 0x150, 0x300).unwrap(); + assert_eq!(got.len(), 1); + assert_eq!((covered_start, covered_end), (0x100, 0x200)); + } + + #[test] + fn select_for_range_head_gap_soft() { + // Retention dropped the oldest summaries: coverage starts at 0x200, caller + // walks [0x150, 0x200) + let files = vec![SummaryFile { + path: PathBuf::new(), + timeline: 1, + start_lsn: 0x200, + end_lsn: 0x300, + }]; + let (got, covered_start, covered_end) = select_for_range(&files, 1, 0x150, 0x300).unwrap(); + assert_eq!(got.len(), 1); + assert_eq!((covered_start, covered_end), (0x200, 0x300)); } #[test] @@ -740,7 +754,8 @@ mod tests { let fname = "0000000100000000000001000000000000000200.summary"; std::fs::write(sum_dir.join(fname), &data).unwrap(); - let m = read_for_range(dir.path(), 1, 0x100, 0x200).unwrap(); + let (m, covered_start, covered_end) = read_for_range(dir.path(), 1, 0x100, 0x200).unwrap(); + assert_eq!((covered_start, covered_end), (0x100, 0x200)); let blocks = m .blocks_for("base/16385/100") .unwrap() diff --git a/src/pg/walparser/mod.rs b/src/pg/walparser/mod.rs index 5febda7..3a71bd7 100644 --- a/src/pg/walparser/mod.rs +++ b/src/pg/walparser/mod.rs @@ -24,8 +24,9 @@ mod types; pub use parse::{ExtractError, ParseError, extract_block_locations, parse_record_from_bytes}; pub use state::{ - ParsePageError, ReadLocationsError, WalParser, extract_locations_from_wal_file, - read_locations_from, write_location_tuples, write_locations_to, + ParsePageError, ReadLocationsError, SegmentBoundary, WalParser, + extract_locations_from_wal_file, read_locations_from, walk_segment_locations, + write_location_tuples, write_locations_to, }; pub use types::{ BKP_BLOCK_HAS_IMAGE, BKP_IMAGE_COMPRESS_LZ4, BKP_IMAGE_COMPRESS_MASK_PG15, diff --git a/src/pg/walparser/state.rs b/src/pg/walparser/state.rs index fd70795..87f67ea 100644 --- a/src/pg/walparser/state.rs +++ b/src/pg/walparser/state.rs @@ -315,6 +315,61 @@ pub fn extract_locations_from_wal_file( } } +/// Boundary fragments left by a per-segment parse, so a record crossing into an +/// adjacent segment can be stitched back together. `leading_tail` is the +/// headless continuation of a record that began in the prior segment; +/// `trailing_head` is the head of a record continuing into the next. +/// `trailing_is_record_start` is false when the segment ends still buffering a +/// record begun more than one segment back (a record longer than a segment) — +/// the per-segment model can't represent that span, so the caller must fall +/// back to a threaded walk +pub struct SegmentBoundary { + pub leading_tail: Vec, + pub trailing_head: Vec, + pub trailing_is_record_start: bool, + pub page_magic: u16, +} + +/// Parse one WAL segment buffer with a fresh parser, emitting in-segment block +/// locations through `f` and returning the boundary fragments. Independent of +/// any other segment (no threaded parser state carried in), so a full-range +/// reparse can fan these across cores and union the results; records crossing a +/// boundary are reconstructed by the caller from +/// `trailing_head[i] ++ leading_tail[i+1]`. Shares +/// [`process_locations_from_page`]'s allocation-light location-only walk. +/// Mirrors wal-g's per-segment `WalDeltaRecordingReader` +pub fn walk_segment_locations( + bytes: &[u8], + mut f: F, +) -> Result { + let mut parser = WalParser::new(); + let mut leading_tail: Option> = None; + let page = WAL_PAGE_SIZE as usize; + let mut off = 0; + while off < bytes.len() { + let end = (off + page).min(bytes.len()); + if let Some(tail) = process_locations_from_page(&mut parser, &bytes[off..end], &mut f) + .map_err(parse_to_extract)? + { + if leading_tail.is_some() { + // Second discarded tail mid-segment: wal-g's + // CantDiscardWalDataError. Only one record can continue in from + // the prior segment, so this is corruption — surface as a parse + // error so the caller skips this segment like the serial walk + return Err(ExtractError::Parse(ParseError::ContinuationNotFound)); + } + leading_tail = Some(tail); + } + off = end; + } + Ok(SegmentBoundary { + leading_tail: leading_tail.unwrap_or_default(), + trailing_head: parser.current_record_data().to_vec(), + trailing_is_record_start: parser.has_current_record_beginning(), + page_magic: parser.page_magic(), + }) +} + /// Locations-only sibling of [`WalParser::parse_records_from_page`]. /// Walks the same page-/record-stitching state machine but emits /// `BlockLocation`s through `f` instead of materialising every record's @@ -322,20 +377,25 @@ pub fn extract_locations_from_wal_file( /// allocation cost of `extract_locations_from_wal_file` from /// O(record bodies) to O(#records) header-walks + the existing partial /// record stitching buffer +/// +/// Returns the drained orphan tail when this page completes a record whose +/// *beginning* the parser never saw (a segment-leading continuation): those +/// bytes can't be parsed headless, so a per-segment walk surfaces them to +/// stitch onto the prior segment's trailing head. `None` on every other page pub fn process_locations_from_page( parser: &mut WalParser, page_data: &[u8], mut f: F, -) -> Result<(), ParsePageError> { +) -> Result>, ParsePageError> { if page_data.len() < WAL_PAGE_SIZE as usize / 2 { - return Ok(()); + return Ok(None); } let mut cursor: &[u8] = page_data; let header = match read_xlog_page_header(&mut cursor) { Ok(h) => h, Err(ParseError::ZeroPageHeader) => { if all_zero(page_data) { - return Ok(()); + return Ok(None); } return Err(ParseError::ZeroPageHeader.into()); } @@ -358,7 +418,7 @@ pub fn process_locations_from_page( // & wait for the next page (matches parse_records_from_page) if remaining_data.len() != header.remaining_data_len as usize { parser.current_record_data.extend_from_slice(remaining_data); - return Ok(()); + return Ok(None); } // Stitch buffered head (if any) with this page's trailing bytes. @@ -380,11 +440,17 @@ pub fn process_locations_from_page( if rec_header.resource_manager_id == RmId::Xlog as u8 && (rec_header.info & !XLR_INFO_MASK) == X_LOG_SWITCH { - return Ok(()); + return Ok(None); } + walk_locations_xlog_page_inner(parser, &mut ar, page_magic, f)?; + return Ok(None); } - walk_locations_xlog_page_inner(parser, &mut ar, page_magic, f) + // No buffered beginning: `stitched` is the tail of a record that began + // before this parser started. Walk the page's own records, then surface + // the orphan tail for cross-segment stitching + walk_locations_xlog_page_inner(parser, &mut ar, page_magic, f)?; + Ok((!stitched.is_empty()).then_some(stitched)) } fn walk_locations_xlog_page_inner( @@ -746,6 +812,113 @@ mod tests { assert!(!parser.has_current_record_beginning()); } + /// Build a record that references base block 7 and is `total` bytes long + /// (24 header + 20 block-0 header + 5 LONG main-data marker + main data) + fn block_record(total: usize) -> Vec { + let main_len = total - X_LOG_RECORD_HEADER_SIZE - 20 - 5; + let mut r = Vec::new(); + r.extend_from_slice(&(total as u32).to_le_bytes()); // total_record_length + r.extend_from_slice(&0u32.to_le_bytes()); // xact + r.extend_from_slice(&0u64.to_le_bytes()); // prev + r.push(0u8); // info + r.push(RmId::Heap as u8); + r.push(0); + r.push(0); + r.extend_from_slice(&0u32.to_le_bytes()); // crc + // block 0 header: no image, no data + r.push(0u8); // block id 0 + r.push(0u8); // fork_flags + r.extend_from_slice(&0u16.to_le_bytes()); // data_length + r.extend_from_slice(&100u32.to_le_bytes()); // spc + r.extend_from_slice(&200u32.to_le_bytes()); // db + r.extend_from_slice(&300u32.to_le_bytes()); // rel + r.extend_from_slice(&7u32.to_le_bytes()); // block_no + r.push(XLR_BLOCK_ID_DATA_LONG); + r.extend_from_slice(&(main_len as u32).to_le_bytes()); + r.extend_from_slice(&vec![0x5Au8; main_len]); + assert_eq!(r.len(), total); + r + } + + /// A record carrying a block reference whose body spans a segment boundary: + /// its head fills segment A, its tail opens segment B. The serial threaded + /// walk and the per-segment walk + boundary stitch must surface the same + /// block location, neither segment seeing it alone + #[test] + fn segment_boundary_record_stitches_like_serial_walk() { + use crate::pg::walparser::parse::extract_block_locations; + let total = 9049; + let record = block_record(total); + let split = 8152; // record bytes that fit on segment A after header+align + + let mut seg_a = long_page_header(0); + seg_a.extend_from_slice(&[0u8; 4]); // 36 -> 40 alignment pad + seg_a.extend_from_slice(&record[..split]); + assert_eq!(seg_a.len(), PAGE); + + let mut seg_b = short_page_header(XLP_FIRST_IS_CONT_RECORD, (total - split) as u32); + seg_b.extend_from_slice(&[0u8; 4]); // 20 -> 24 alignment pad + seg_b.extend_from_slice(&record[split..]); + seg_b.resize(PAGE, 0); + + let want = vec![BlockLocation::new(100, 200, 300, 7)]; + + // Serial threaded walk across both segments (mirrors walk_segments_pipelined) + let mut p = WalParser::new(); + let mut serial = + extract_locations_from_wal_file(&mut p, std::io::Cursor::new(seg_a.clone())).unwrap(); + serial.extend( + extract_locations_from_wal_file(&mut p, std::io::Cursor::new(seg_b.clone())).unwrap(), + ); + assert_eq!(serial, want); + + // Per-segment walk: A leaves the head, B the tail; neither sees the block + let mut locs_a = Vec::new(); + let a = walk_segment_locations(&seg_a, |l| locs_a.push(l)).unwrap(); + assert!(locs_a.is_empty(), "record incomplete in segment A"); + assert!(a.leading_tail.is_empty()); + assert!(!a.trailing_head.is_empty()); + assert!(a.trailing_is_record_start); + + let mut locs_b = Vec::new(); + let b = walk_segment_locations(&seg_b, |l| locs_b.push(l)).unwrap(); + assert!(locs_b.is_empty(), "tail alone yields no in-segment record"); + assert!(!b.leading_tail.is_empty()); + assert!(b.trailing_head.is_empty()); + + // Stitch head + tail → the boundary record's block + let mut data = a.trailing_head.clone(); + data.extend_from_slice(&b.leading_tail); + let rec = parse_record_from_bytes(&data, a.page_magic).unwrap(); + assert_eq!(extract_block_locations(std::slice::from_ref(&rec)), want); + } + + /// A record longer than a segment leaves a middle segment buffering a + /// headless continuation to EOF: trailing head set but not a record start, + /// the signal that pairwise stitching can't represent the span + #[test] + fn segment_fully_inside_record_flags_multi_segment_span() { + let total = 20049; + let record = block_record(total); + let head_on_a = 8152; // bytes on segment A after long header+align + let mid_on_b = 8168; // bytes on segment B after short header+align + + let mut seg_b = short_page_header(XLP_FIRST_IS_CONT_RECORD, (total - head_on_a) as u32); + seg_b.extend_from_slice(&[0u8; 4]); + seg_b.extend_from_slice(&record[head_on_a..head_on_a + mid_on_b]); + assert_eq!(seg_b.len(), PAGE); + + let mut locs = Vec::new(); + let b = walk_segment_locations(&seg_b, |l| locs.push(l)).unwrap(); + assert!(locs.is_empty()); + assert!(b.leading_tail.is_empty()); + assert!(!b.trailing_head.is_empty()); + assert!( + !b.trailing_is_record_start, + "middle of an oversize record is not a record start" + ); + } + #[test] fn multi_record_page_emits_all_records() { let rec = minimal_record(); diff --git a/src/retry.rs b/src/retry.rs index 211d9a2..d706456 100644 --- a/src/retry.rs +++ b/src/retry.rs @@ -165,4 +165,32 @@ mod tests { assert!(res.is_err()); assert_eq!(calls.load(Ordering::SeqCst), 3); } + + #[test] + fn backoff_jitter_stays_within_capped_window() { + let policy = RetryPolicy { + max_attempts: 8, + base_delay: Duration::from_millis(100), + max_delay: Duration::from_secs(30), + jitter: true, + }; + // full-jitter draws uniform [0, capped); attempt 1 caps at base_delay + for _ in 0..64 { + assert!(policy.backoff(1) < policy.base_delay); + } + // late attempts saturate at max_delay; jitter keeps them under it + assert!(policy.backoff(20) < policy.max_delay); + } + + #[test] + fn backoff_jitter_zero_window_returns_capped() { + // max_delay 0 -> capped is zero, the ms==0 guard returns it verbatim + let policy = RetryPolicy { + max_attempts: 4, + base_delay: Duration::from_millis(100), + max_delay: Duration::ZERO, + jitter: true, + }; + assert_eq!(policy.backoff(1), Duration::ZERO); + } } diff --git a/src/storage/creds.rs b/src/storage/creds.rs index 3903b41..a15983a 100644 --- a/src/storage/creds.rs +++ b/src/storage/creds.rs @@ -307,6 +307,66 @@ mod tests { assert!(matches!(p.credentials().await, Err(StorageError::Auth(_)))); } + #[tokio::test] + async fn credential_source_imds_fetches_and_identity_is_constant() { + let exp = (Utc::now() + chrono::Duration::hours(6)).to_rfc3339(); + let (p, _) = provider(exp, true).await; + let src = CredentialSource::Imds(Arc::new(p)); + // identity folds to a constant so rotating IMDS keys don't break copy + assert_eq!(src.identity(), "imds"); + let c = src.get().await.unwrap(); + assert_eq!(c.access_key, "ASIAEXAMPLE"); + } + + #[test] + fn static_identity_is_the_access_key() { + let src = CredentialSource::Static(Credentials { + access_key: "AKIAEXAMPLE".into(), + secret_key: "secret".into(), + session_token: None, + expires_at: None, + }); + assert_eq!(src.identity(), "AKIAEXAMPLE"); + } + + #[tokio::test] + async fn http_error_surfaces_when_role_fetch_fails() { + // token PUT succeeds; the IAM role GET 500s, so get() returns Http + let base = serve(|req: &Req| match (req.method.as_str(), req.path.as_str()) { + ("PUT", TOKEN_PATH) => Resp::new(200).body(b"TOKEN".to_vec()), + ("GET", IAM_PATH) => Resp::new(500).body(b"boom".to_vec()), + _ => Resp::new(404), + }) + .await; + let p = ImdsProvider::with_endpoint(base).unwrap(); + assert!(matches!( + p.credentials().await, + Err(StorageError::Http { status: 500, .. }) + )); + } + + #[test] + fn expires_within_honors_margin_and_static_keys() { + let soon = Credentials { + access_key: "a".into(), + secret_key: "b".into(), + session_token: Some("t".into()), + expires_at: Some(SystemTime::now() + Duration::from_secs(60)), + }; + assert!(soon.expires_within(REFRESH_MARGIN)); + let far = Credentials { + expires_at: Some(SystemTime::now() + Duration::from_secs(REFRESH_MARGIN.as_secs() * 4)), + ..soon.clone() + }; + assert!(!far.expires_within(REFRESH_MARGIN)); + // static keys never expire + let stat = Credentials { + expires_at: None, + ..soon + }; + assert!(!stat.expires_within(REFRESH_MARGIN)); + } + #[test] fn parse_creds_rejects_non_success_code() { // all key fields present so deserialization passes and the Code guard diff --git a/tests/backup_roundtrip.rs b/tests/backup_roundtrip.rs index 89d1686..2582ee2 100644 --- a/tests/backup_roundtrip.rs +++ b/tests/backup_roundtrip.rs @@ -1,6 +1,7 @@ //! Backup-list / backup-fetch end-to-end against fs storage with a synthetic //! sentinel + tar produced in wal-g format +use std::num::NonZeroU64; use std::sync::Arc; use chrono::Utc; @@ -30,9 +31,9 @@ fn test_settings() -> Settings { fn make_sentinel_v2(name_data_dir: &str) -> BackupSentinelDtoV2 { BackupSentinelDtoV2 { sentinel: BackupSentinelDto { - backup_start_lsn: Some(0x0300_0000), + backup_start_lsn: NonZeroU64::new(0x0300_0000), pg_version: 160003, - backup_finish_lsn: Some(0x0300_1000), + backup_finish_lsn: NonZeroU64::new(0x0300_1000), system_identifier: Some(7000000000000000000), uncompressed_size: 1024, compressed_size: 512, @@ -92,7 +93,7 @@ async fn list_finds_seeded_backup() { assert_eq!(summaries.len(), 1); let s = &summaries[0]; assert_eq!(s.name, backup_name); - assert_eq!(s.start_lsn, Some(0x0300_0000)); + assert_eq!(s.start_lsn, NonZeroU64::new(0x0300_0000)); assert_eq!(s.pg_version, 160003); assert_eq!(s.hostname.as_deref(), Some("testhost")); } @@ -476,7 +477,7 @@ async fn delta_parent_picks_latest_when_enabled() { // Seed two sentinels; the later one (higher LSN, later StartTime) wins let older_name = format_backup_name(1, 0x0100_0000, 16 * 1024 * 1024); let mut older = make_sentinel_v2("/var/lib/postgres/data"); - older.sentinel.backup_start_lsn = Some(0x0100_0000); + older.sentinel.backup_start_lsn = NonZeroU64::new(0x0100_0000); older.start_time = chrono::Utc::now() - chrono::Duration::hours(2); older.finish_time = older.start_time + chrono::Duration::minutes(1); put_bytes( @@ -488,7 +489,7 @@ async fn delta_parent_picks_latest_when_enabled() { let newer_name = format_backup_name(1, 0x0300_0000, 16 * 1024 * 1024); let mut newer = make_sentinel_v2("/var/lib/postgres/data"); - newer.sentinel.backup_start_lsn = Some(0x0300_0000); + newer.sentinel.backup_start_lsn = NonZeroU64::new(0x0300_0000); newer.start_time = chrono::Utc::now(); newer.finish_time = newer.start_time + chrono::Duration::minutes(1); put_bytes( @@ -693,10 +694,10 @@ async fn fetch_applies_delta_chain_wi1() { ); let mut delta_sentinel = make_sentinel_v2("/d"); delta_sentinel.sentinel.increment_from = Some(full_name.clone()); - delta_sentinel.sentinel.increment_from_lsn = Some(0x0100_0000); + delta_sentinel.sentinel.increment_from_lsn = NonZeroU64::new(0x0100_0000); delta_sentinel.sentinel.increment_full_name = Some(full_name.clone()); delta_sentinel.sentinel.increment_count = Some(1); - delta_sentinel.sentinel.backup_start_lsn = Some(0x0200_0000); + delta_sentinel.sentinel.backup_start_lsn = NonZeroU64::new(0x0200_0000); put_bytes( store.clone(), &sentinel_key(&delta_name), @@ -810,10 +811,10 @@ async fn fetch_applies_delta_chain_walg_leading_slash() { ); let mut delta_sentinel = make_sentinel_v2("/d"); delta_sentinel.sentinel.increment_from = Some(full_name.clone()); - delta_sentinel.sentinel.increment_from_lsn = Some(0x0100_0000); + delta_sentinel.sentinel.increment_from_lsn = NonZeroU64::new(0x0100_0000); delta_sentinel.sentinel.increment_full_name = Some(full_name.clone()); delta_sentinel.sentinel.increment_count = Some(1); - delta_sentinel.sentinel.backup_start_lsn = Some(0x0200_0000); + delta_sentinel.sentinel.backup_start_lsn = NonZeroU64::new(0x0200_0000); put_bytes( store.clone(), &sentinel_key(&delta_name), @@ -909,10 +910,10 @@ async fn fetch_walks_three_step_chain() { ); let mut s1 = make_sentinel_v2("/d"); s1.sentinel.increment_from = Some(full_name.clone()); - s1.sentinel.increment_from_lsn = Some(0x0100_0000); + s1.sentinel.increment_from_lsn = NonZeroU64::new(0x0100_0000); s1.sentinel.increment_full_name = Some(full_name.clone()); s1.sentinel.increment_count = Some(1); - s1.sentinel.backup_start_lsn = Some(0x0200_0000); + s1.sentinel.backup_start_lsn = NonZeroU64::new(0x0200_0000); put_bytes( store.clone(), &sentinel_key(&delta1_name), @@ -959,10 +960,10 @@ async fn fetch_walks_three_step_chain() { ); let mut s2 = make_sentinel_v2("/d"); s2.sentinel.increment_from = Some(delta1_name.clone()); - s2.sentinel.increment_from_lsn = Some(0x0200_0000); + s2.sentinel.increment_from_lsn = NonZeroU64::new(0x0200_0000); s2.sentinel.increment_full_name = Some(full_name.clone()); s2.sentinel.increment_count = Some(2); - s2.sentinel.backup_start_lsn = Some(0x0300_0000); + s2.sentinel.backup_start_lsn = NonZeroU64::new(0x0300_0000); put_bytes( store.clone(), &sentinel_key(&delta2_name), diff --git a/tests/cli_bin.rs b/tests/cli_bin.rs index e5844e0..e956c88 100644 --- a/tests/cli_bin.rs +++ b/tests/cli_bin.rs @@ -3,6 +3,7 @@ //! covered. cargo-llvm-cov merges coverage from spawned instrumented children //! via LLVM_PROFILE_FILE. +use std::num::NonZeroU64; use std::path::Path; use std::process::Command; @@ -26,8 +27,8 @@ const BACKUP: &str = "base_000000010000000000000002"; fn seed_store(dir: &Path) { let sentinel = BackupSentinelDtoV2 { sentinel: BackupSentinelDto { - backup_start_lsn: Some(0x0200_0000), - backup_finish_lsn: Some(0x0200_1000), + backup_start_lsn: NonZeroU64::new(0x0200_0000), + backup_finish_lsn: NonZeroU64::new(0x0200_1000), pg_version: 160003, uncompressed_size: 2048, compressed_size: 1024, diff --git a/tests/retention.rs b/tests/retention.rs index a45a16a..66185ce 100644 --- a/tests/retention.rs +++ b/tests/retention.rs @@ -4,6 +4,7 @@ //! impermanent backups, and WAL segments. Exercises each `delete` mode and //! the `copy` command end-to-end. Mirrors wal-g's `delete_test.go` shape +use std::num::NonZeroU64; use std::sync::Arc; use chrono::Utc; @@ -34,9 +35,9 @@ fn seg_size() -> u64 { fn make_sentinel(start_lsn: u64, is_permanent: bool) -> BackupSentinelDtoV2 { BackupSentinelDtoV2 { sentinel: BackupSentinelDto { - backup_start_lsn: Some(start_lsn), + backup_start_lsn: NonZeroU64::new(start_lsn), pg_version: 160003, - backup_finish_lsn: Some(start_lsn + seg_size()), + backup_finish_lsn: NonZeroU64::new(start_lsn + seg_size()), system_identifier: Some(7000000000000000000), uncompressed_size: 1024, compressed_size: 512, @@ -310,7 +311,7 @@ async fn delete_target_drops_delta_dependants() { let store = Arc::new(FsStorage::new(dir.path()).unwrap()); let full = backup_name(1, seg_size()); let mut full_s = make_sentinel(seg_size(), false); - full_s.sentinel.backup_start_lsn = Some(seg_size()); + full_s.sentinel.backup_start_lsn = NonZeroU64::new(seg_size()); put_bytes( &store, &sentinel_key(&full), diff --git a/tests/vm_live.rs b/tests/vm_live.rs index be62811..fb3e668 100644 --- a/tests/vm_live.rs +++ b/tests/vm_live.rs @@ -540,17 +540,28 @@ async fn delta_chain_against_live_pg() { psql(&format!("SELECT count(*) FROM {tbl}")); // settle hint bits on touched pages psql("CHECKPOINT"); - // self-archive WAL so the WAL-walk delta map sees the changed blocks + // Archive WAL as the fallback source for segments PG recycles; the delta + // push below reads the live pg_wal first psql("SELECT pg_switch_wal()"); - let data_dir = psql("SHOW data_directory"); - archive_pg_wal(&s, &store, &std::path::Path::new(&data_dir).join("pg_wal")).await; + let data_dir = std::path::PathBuf::from(psql("SHOW data_directory")); + archive_pg_wal(&s, &store, &data_dir.join("pg_wal")).await; - // (2) delta backup off the full (WALG_DELTA_MAX_STEPS=1) + // (2) delta backup off the full (WALG_DELTA_MAX_STEPS=1). Real deltas read a + // local PGDATA (filesystem source): only the changed blocks ship, and the + // WAL-walk delta map serves segments from the live pg_wal. BASE_BACKUP has no + // local WAL & streams every block, so it's a full-backup path only let mut s_delta = s.clone(); s_delta.delta.max_steps = 1; - backup::push::handle(&s_delta, store.clone(), default_push_args()) - .await - .expect("delta backup"); + backup::push::handle( + &s_delta, + store.clone(), + backup::push::PushArgs { + pgdata: Some(data_dir.clone()), + ..default_push_args() + }, + ) + .await + .expect("delta backup"); let delta_name = backup::fetch::resolve_name(&store, "LATEST").await.unwrap(); assert_ne!(delta_name, full_name, "delta should be the new LATEST"); assert!( @@ -860,7 +871,7 @@ async fn wal_summaries_parse_real_pg_files() { ))) .unwrap(); - let map = + let (map, _covered_start, _covered_end) = walrus::pg::wal_summaries::read_for_range(std::path::Path::new(&data_dir), tli, start, end) .expect("parse real PG WAL summaries"); assert!(!map.is_empty(), "summary map should carry changed blocks"); diff --git a/tests/wal_roundtrip.rs b/tests/wal_roundtrip.rs index d8ef25c..fba5930 100644 --- a/tests/wal_roundtrip.rs +++ b/tests/wal_roundtrip.rs @@ -1,5 +1,6 @@ //! End-to-end wal-push -> wal-fetch with fs backend; bytes must match +use std::num::NonZeroU64; use std::path::PathBuf; use std::sync::Arc; @@ -509,6 +510,86 @@ async fn wal_restore_fills_gap_into_local_dir() { // covers the unhappy path: restore tolerates missing-segment errors } +#[tokio::test] +async fn wal_restore_timeline_filter_skips_other_timelines() { + use walrus::pg::wal::restore; + use walrus::storage::Storage; + + let dir = tempfile::tempdir().unwrap(); + let storage_dir = dir.path().join("storage"); + let stage = dir.path().join("stage"); + let restore_dst = dir.path().join("restore"); + std::fs::create_dir_all(&stage).unwrap(); + let store = Arc::new(FsStorage::new(&storage_dir).unwrap()); + let s = settings_for(storage_dir.to_str().unwrap(), Method::None); + + // tli 1 hole at seg 3 (push 1,2,4); tli 2 hole at seg 6 (push 5,7) + for hex in [ + "000000010000000000000001", + "000000010000000000000002", + "000000010000000000000004", + "000000020000000000000005", + "000000020000000000000007", + ] { + let p = stage.join(hex); + std::fs::write(&p, hex.as_bytes()).unwrap(); + walrus::pg::wal::push::handle(&s, store.clone(), &p) + .await + .unwrap(); + } + + // Filter to tli 2: the tli-1 gap is skipped before its segments expand, so + // only tli-2's missing seg is attempted (and tolerated as unfetchable) + restore::handle(&s, store as Arc, &restore_dst, Some(2)) + .await + .unwrap(); + assert!( + !restore_dst.join("000000010000000000000003").exists(), + "filtered-out timeline must not be restored" + ); +} + +#[tokio::test] +async fn wal_restore_skips_segment_already_present() { + use walrus::pg::wal::restore; + use walrus::storage::Storage; + + let dir = tempfile::tempdir().unwrap(); + let storage_dir = dir.path().join("storage"); + let stage = dir.path().join("stage"); + let restore_dst = dir.path().join("restore"); + std::fs::create_dir_all(&stage).unwrap(); + std::fs::create_dir_all(&restore_dst).unwrap(); + let store = Arc::new(FsStorage::new(&storage_dir).unwrap()); + let s = settings_for(storage_dir.to_str().unwrap(), Method::None); + + // Hole at seg 3 (push 1,2,4) + for hex in [ + "000000010000000000000001", + "000000010000000000000002", + "000000010000000000000004", + ] { + let p = stage.join(hex); + std::fs::write(&p, hex.as_bytes()).unwrap(); + walrus::pg::wal::push::handle(&s, store.clone(), &p) + .await + .unwrap(); + } + + // Pre-place the missing segment in dst: restore must skip it (idempotent), + // leaving the sentinel bytes untouched + let present = restore_dst.join("000000010000000000000003"); + std::fs::write(&present, b"already-here").unwrap(); + restore::handle(&s, store as Arc, &restore_dst, None) + .await + .unwrap(); + assert_eq!( + std::fs::read(&present).unwrap(), + b"already-here", + "an already-present segment must not be overwritten" + ); +} + #[tokio::test] async fn wal_verify_integrity_detects_gap_after_backup() { use walrus::pg::backup::{format_backup_name, sentinel_key}; @@ -539,9 +620,9 @@ async fn wal_verify_integrity_detects_gap_after_backup() { let backup_name = format_backup_name(1, seg_size, seg_size); let v2 = walrus::pg::backup::BackupSentinelDtoV2 { sentinel: walrus::pg::backup::BackupSentinelDto { - backup_start_lsn: Some(seg_size), + backup_start_lsn: NonZeroU64::new(seg_size), pg_version: 160003, - backup_finish_lsn: Some(seg_size + 16), + backup_finish_lsn: NonZeroU64::new(seg_size + 16), system_identifier: Some(1), files_metadata_disabled: true, ..Default::default()