diff --git a/Cargo.lock b/Cargo.lock index 45c4f13..0981273 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -97,6 +97,22 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +[[package]] +name = "astral-tokio-tar" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb50a7aae84a03bf55b067832bc376f4961b790c97e64d3eacee97d389b90277" +dependencies = [ + "filetime", + "futures-core", + "libc", + "portable-atomic", + "rustc-hash", + "tokio", + "tokio-stream", + "xattr", +] + [[package]] name = "async-compression" version = "0.4.42" @@ -1297,6 +1313,12 @@ version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + [[package]] name = "postgres-protocol" version = "0.6.12" @@ -2048,6 +2070,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-util" version = "0.7.18" @@ -2253,6 +2286,7 @@ name = "wal-rus" version = "0.1.1" dependencies = [ "anyhow", + "astral-tokio-tar", "async-compression", "async-trait", "aws-lc-rs", @@ -2265,6 +2299,7 @@ dependencies = [ "fallible-iterator", "futures", "hex", + "libc", "percent-encoding", "postgres-protocol", "quick-xml", diff --git a/Cargo.toml b/Cargo.toml index 249e67c..16d0753 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,7 +41,7 @@ roaring = "0.11" aws-lc-rs = "1" quick-xml = "0.40" chrono = { version 
= "0.4", default-features = false, features = ["clock", "serde"] } -tar = "0.4" +astral-tokio-tar = "0.6" postgres-protocol = "0.6" fallible-iterator = "0.2" tokio-rustls = { version = "0.26", default-features = false, features = ["aws-lc-rs", "tls12"] } @@ -50,9 +50,11 @@ rustls-pki-types = "1" rustls-pemfile = "2" webpki-roots = "1" dryoc = { version = "0.8", default-features = false, features = ["u64_backend"] } +libc = "0.2" [dev-dependencies] tempfile = "3" +tar = "0.4" [features] # Enabled only on the VM test runner: hits a real PG cluster at PGPORT diff --git a/bench/README.md b/bench/README.md index 403d2c7..5ee4d1c 100644 --- a/bench/README.md +++ b/bench/README.md @@ -3,8 +3,8 @@ Reproducible single-host benchmark comparing three PostgreSQL 18 WAL archivers on **throughput** and **memory** under heavy write load: -- **walrus** (this repo, Rust) — serial wal-push daemon -- **wal-g** (Go) — fan-out daemon (`WALG_UPLOAD_CONCURRENCY`) +- **walrus** (this repo, Rust) — look-ahead fan-out daemon (`WALG_UPLOAD_CONCURRENCY`; pre-uploads `concurrency-1` segments, streaming per-upload, no full-segment buffer) +- **wal-g** (Go) — fan-out daemon (same `WALG_UPLOAD_CONCURRENCY`) - **pgbackrest** (C) — daemonless; PG forks `archive-push`, async `process-max` workers All three are driven identically: PG `archive_command` → the tool's own client → S3. @@ -118,7 +118,7 @@ daemon (~27 MB for walrus; wal-g's fan-out daemon adds more baseline). 
| OP | walrus / wal-g | pgbackrest | measures | |---|---|---|---| -| `backup-send` | `backup-push --full` | `backup --type=full` | full base backup → S3 | +| `backup-send` | `backup-push --full` | `backup --type=full` | full base backup → S3 | | `backup-fetch` | `backup-fetch LATEST` | `restore` | restore ← S3 | | `backup-delta` | `backup-push` (delta, `wi1`) | `backup --type=incr` | delta backup → S3 | | `backup-delta-summaries` | `backup-push --delta-from-wal-summaries` | — (walrus-only) | delta from PG17 WAL summaries → S3 | @@ -174,7 +174,9 @@ Notes: ## Config knobs See `config.env.example`. Common ones: `UPLOAD_CONCURRENCY` (wal-g concurrency / -pgbackrest `process-max`), `SCALE` (pgbench DB size), `CHURN_ROWS`, `BURST_SECONDS`, +pgbackrest `process-max`; also seeds `WALG_DOWNLOAD_CONCURRENCY` so `backup-fetch` +scales with the same knob — set `DOWNLOAD_CONCURRENCY` to decouple), `SCALE` +(pgbench DB size), `CHURN_ROWS`, `BURST_SECONDS`, `BURST_WORKERS`. `matrix.sh` honors `DAEMONS` (and `RUN_ID`). Operation benchmarks add `RESTORE_DIR`, `WAL_RECV_DIR`, `WAL_RECEIVE_SECONDS`, `DELTA_CHURN_SECONDS`, `DELTA_MAX_STEPS`, `DELTA_ORIGIN`; `op_matrix.sh` honors `OPS`, `TOOLS` (and diff --git a/bench/op_matrix.sh b/bench/op_matrix.sh index 77abfaa..b3190c4 100755 --- a/bench/op_matrix.sh +++ b/bench/op_matrix.sh @@ -13,6 +13,10 @@ # Skipped cells: pgbackrest has no wal-receive equivalent; backup-delta-summaries # is walrus-only (no wal-g / pgbackrest WAL-summary delta). Override OPS / TOOLS # via env. Counterpart of matrix.sh (archive path). +# +# backup-delta-chain (DELTA_MAX_STEPS-deep chain + leaf restore) is omitted from +# the default sweep — it churns once per step, so its cost scales with depth. Opt +# in with OPS="backup-send backup-delta-chain" (backup-send must precede it). 
set -euo pipefail SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" diff --git a/bench/run_op.sh b/bench/run_op.sh index 73aa769..2b51427 100755 --- a/bench/run_op.sh +++ b/bench/run_op.sh @@ -2,8 +2,8 @@ # # run_op.sh OP TOOL RUN_ID # -# OP - backup-send | backup-fetch | backup-delta | -# backup-delta-summaries | wal-receive (data-movement operation) +# OP - backup-send | backup-fetch | backup-delta | backup-delta-summaries | +# backup-delta-chain | wal-receive (data-movement operation) # TOOL - walrus | walg | pgbackrest (implementation) # RUN_ID - free-form label, e.g. r1 / 2026-06-22 # @@ -11,11 +11,14 @@ # local), cross-tool where an equivalent exists. Counterpart of run.sh, which # benches the archive_command (wal-push) path; this covers the rest of walrus: # -# backup-send base backup -> S3 walrus/wal-g backup-push --full | pgbackrest backup --type=full +# backup-send base backup -> S3 walrus/wal-g backup-push ... --full | +# pgbackrest backup --type=full # backup-fetch restore <- S3 walrus/wal-g backup-fetch | pgbackrest restore # backup-delta delta backup -> S3 walrus/wal-g backup-push (wi1) | pgbackrest backup --type=incr # backup-delta-summaries delta from WAL walrus backup-push | (walrus-only) # summaries -> S3 --delta-from-wal-summaries +# backup-delta-chain N-deep delta chain walrus/wal-g backup-push xN | pgbackrest backup --type=incr xN +# + restore of leaf (origin=LATEST), then backup-fetch LATEST # wal-receive stream WAL from PG walrus/wal-g wal-receive | (no pgbackrest peer) # # Delta cells need a parent full backup (backup-send must precede them) and a @@ -28,6 +31,14 @@ # anchor to chain root. Delta size is S3-inventory byte growth across the push, # not on-disk cluster size. 
# +# backup-delta-chain builds a real DELTA_MAX_STEPS-deep chain: each step churns, +# drains, then pushes a delta with WALG_DELTA_ORIGIN=LATEST so it extends the +# PREVIOUS delta (LATEST_FULL would re-anchor each to the root, leaving restore +# depth 2). Every step is timed + sized on its own (chain_metrics.txt), then a +# backup-fetch LATEST walks full + all N deltas to exercise restore-time replay. +# Its churn is per-step and INSIDE the sampler window, so the daemon's archiving +# during churn is sampled too; the per-step push timings isolate the push. +# # walrus's walsender (serving WAL via the replication protocol) has no CLI entry # point yet, so wal-send is intentionally absent. # @@ -55,7 +66,7 @@ LOG_TAG=op load_config if [[ $# -ne 3 ]]; then - echo "usage: $0 " >&2 + echo "usage: $0 " >&2 exit 2 fi OP="$1" @@ -63,8 +74,8 @@ TOOL="$2" RUN_ID="$3" case "${OP}" in - backup-send|backup-fetch|backup-delta|backup-delta-summaries|wal-receive) ;; - *) echo "error: OP must be backup-send|backup-fetch|backup-delta|backup-delta-summaries|wal-receive, got '${OP}'" >&2; exit 2 ;; + backup-send|backup-fetch|backup-delta|backup-delta-summaries|backup-delta-chain|wal-receive) ;; + *) echo "error: OP must be backup-send|backup-fetch|backup-delta|backup-delta-summaries|backup-delta-chain|wal-receive, got '${OP}'" >&2; exit 2 ;; esac case "${TOOL}" in walrus|walg|pgbackrest) ;; @@ -80,9 +91,12 @@ if [[ "${OP}" == "backup-delta-summaries" && "${TOOL}" != "walrus" ]]; then exit 2 fi -# Delta ops drive a churn phase, then a delta push; group them for branch tests. +# Single-delta ops drive one churn phase, then one delta push; group for branches. IS_DELTA=0 [[ "${OP}" == "backup-delta" || "${OP}" == "backup-delta-summaries" ]] && IS_DELTA=1 +# Chain op churns + pushes per step inside the timed loop (not the single step 1b). 
+IS_CHAIN=0 +[[ "${OP}" == "backup-delta-chain" ]] && IS_CHAIN=1 # Backup-push ops (full + delta) take a base backup, whose pg_backup_stop blocks # on BackupWaitWalArchive until the backup's WAL is archived. So the tool's @@ -90,7 +104,7 @@ IS_DELTA=0 # process plus the mostly-idle daemon; for walrus that baseline is ~27 MB). # backup-fetch (restore) and wal-receive need no archiver. NEEDS_ARCHIVE=0 -case "${OP}" in backup-send|backup-delta|backup-delta-summaries) NEEDS_ARCHIVE=1 ;; esac +case "${OP}" in backup-send|backup-delta|backup-delta-summaries|backup-delta-chain) NEEDS_ARCHIVE=1 ;; esac : "${BUCKET:?set BUCKET in config.env}" : "${PGUSER:?set PGUSER in config.env}" @@ -122,7 +136,7 @@ WAL_RECEIVE_SECONDS="${WAL_RECEIVE_SECONDS:-300}" # Delta cells: churn window that dirties pages between the parent full and the # delta push, and the delta-chain depth handed to walrus/wal-g (WALG_DELTA_MAX_STEPS). DELTA_CHURN_SECONDS="${DELTA_CHURN_SECONDS:-300}" -DELTA_MAX_STEPS="${DELTA_MAX_STEPS:-7}" +DELTA_MAX_STEPS="${DELTA_MAX_STEPS:-3}" DELTA_ORIGIN="${DELTA_ORIGIN:-LATEST_FULL}" case "${TOOL}" in @@ -161,6 +175,28 @@ inv_size() { | awk '/Total Size:/ {print $3}' | tail -1 } +# Fail fast if no parent backup exists for a delta to anchor to. Without one, +# backup-push silently emits a FULL (mislabeled as a delta) and inv-growth sizing +# reports a full's bytes. op_matrix runs backup-send first; this guards lone runs. 
+assert_delta_parent() { + local roots + if [[ "${TOOL}" == "pgbackrest" ]]; then + # full backup-set dirs end in 'F/'; incr (delta) dirs end in 'I/' + roots="$(sudo aws s3 ls "s3://${BUCKET}${PGBACKREST_REPO_PATH}/backup/${PGBACKREST_STANZA}/" \ + --region "${AWS_REGION}" 2>/dev/null | awk '/ PRE / && /F\/$/ {n++} END{print n+0}')" + else + # walrus/wal-g chain root = base_ without the _D_ delta suffix + roots="$(sudo aws s3 ls "${WALG_PREFIX}/basebackups_005/" \ + --region "${AWS_REGION}" 2>/dev/null | awk '/ PRE base_/ && !/_D_/ {n++} END{print n+0}')" + fi + if [[ "${roots:-0}" -eq 0 ]]; then + echo "error: no parent full backup under ${INV_PREFIX}; run backup-send ${TOOL} ${RUN_ID} first" >&2 + echo " (a delta with no parent silently becomes a full, corrupting the measurement)" >&2 + exit 1 + fi + log "parent check: ${roots} full backup(s) under ${INV_PREFIX}" +} + # --- pre-flight: DB seeded? (backup-send + wal-receive need a populated DB) --- [[ "${OP}" == "backup-fetch" ]] || require_seeded @@ -197,7 +233,7 @@ sudo -u postgres pgbackrest --stanza="${STANZA}" stanza-create || true # backup (full or incr) needs WAL archiving live (pgbackrest blocks on the # start-WAL archive), so point archive_command at pgbackrest and drain. restore # reads only the repo. backup-delta (incr) churns + drains in the delta-prep step. -if [[ "${OP}" == "backup-send" || "${OP}" == "backup-delta" ]]; then +if [[ "${OP}" == "backup-send" || "${OP}" == "backup-delta" || "${OP}" == "backup-delta-chain" ]]; then ARCHIVE_CMD="pgbackrest --stanza=${STANZA} archive-push %p" sudo -u postgres "${PGBIN}/psql" -p 5432 -tA \ -c "ALTER SYSTEM SET archive_library = '';" \ @@ -238,6 +274,9 @@ if [[ "${OP}" == "backup-send" || "${OP}" == "wal-receive" ]]; then CHECKPOINT_BEFORE_WORKLOAD=1 fi +# Delta ops must extend an existing full; bail before churning if none exists. 
+[[ "${IS_DELTA}" -eq 1 || "${IS_CHAIN}" -eq 1 ]] && assert_delta_parent + # --- step 1b: delta prep — churn between the parent full and the delta push --- # The default delta map walks ARCHIVED WAL, so the churn WAL must reach the repo # before the push. The tool's archiver is already live (step 1, NEEDS_ARCHIVE) @@ -272,7 +311,7 @@ case "${OP}" in backup-send) log "base backup -> ${INV_PREFIX} (full)" case "${TOOL}" in - walrus) run_tool "${WALRUS_BIN}" backup-push --full ;; + walrus) run_tool "${WALRUS_BIN}" backup-push "${PGDATA_DIR}" --full ;; walg) run_tool "${WALG_BIN}" backup-push "${PGDATA_DIR}" --full ;; pgbackrest) sudo -u postgres pgbackrest --stanza="${PGBACKREST_STANZA}" backup --type=full ;; esac @@ -285,7 +324,7 @@ case "${OP}" in case "${TOOL}" in walrus) run_tool env WALG_DELTA_MAX_STEPS="${DELTA_MAX_STEPS}" \ WALG_DELTA_ORIGIN="${DELTA_ORIGIN}" \ - "${WALRUS_BIN}" backup-push --pgdata "${PGDATA_DIR}" ;; + "${WALRUS_BIN}" backup-push "${PGDATA_DIR}" ;; walg) run_tool env WALG_DELTA_MAX_STEPS="${DELTA_MAX_STEPS}" \ WALG_DELTA_ORIGIN="${DELTA_ORIGIN}" \ "${WALG_BIN}" backup-push "${PGDATA_DIR}" ;; @@ -300,10 +339,93 @@ case "${OP}" in log "delta-from-wal-summaries backup -> ${INV_PREFIX} (origin=${DELTA_ORIGIN}; parent inventory ${inv_before} B)" run_tool env WALG_DELTA_MAX_STEPS="${DELTA_MAX_STEPS}" \ WALG_DELTA_ORIGIN="${DELTA_ORIGIN}" \ - "${WALRUS_BIN}" backup-push --pgdata "${PGDATA_DIR}" --delta-from-wal-summaries + "${WALRUS_BIN}" backup-push "${PGDATA_DIR}" --delta-from-wal-summaries inv_after="$(inv_size)"; inv_after="${inv_after:-0}" BYTES=$(( inv_after - inv_before )); (( BYTES < 0 )) && BYTES=0 ;; + backup-delta-chain) + # Build a DELTA_MAX_STEPS-deep chain (origin=LATEST: each delta extends the + # prior one). Per step: churn, drain, then time + size the push alone. BYTES + # accumulates per-step delta payloads (not END-START inventory: that would + # also count the inter-step churn WAL). chain_metrics.txt holds the breakdown. 
+ DELTA_ORIGIN=LATEST + CHAIN_METRICS="${RESULT_DIR}/chain_metrics.txt" + push_s_total=0 + chain_rows="" + log "delta chain: ${DELTA_MAX_STEPS} steps (origin=LATEST, cap WALG_DELTA_MAX_STEPS=${DELTA_MAX_STEPS}) -> ${INV_PREFIX}" + for ((i=1; i<=DELTA_MAX_STEPS; i++)); do + log "chain step ${i}/${DELTA_MAX_STEPS}: checkpoint + churn ${DELTA_CHURN_SECONDS}s" + checkpoint_pg + CHECKPOINT_BEFORE_WORKLOAD=1 + CH_ENV=(PGHOST="${PGHOST_DRIVER}" PGUSER="${PGUSER}" PGPASSWORD="${PGPASSWORD}" + DURATION="${DELTA_CHURN_SECONDS}" CHURN_ROWS="${CHURN_ROWS:-2000000}") + [[ -n "${BURST_WORKERS:-}" ]] && CH_ENV+=("WORKERS=${BURST_WORKERS}") + if ! env "${CH_ENV[@]}" bash "${SCRIPT_DIR}/scripts/driver/workload_burst.sh"; then + mark_invalid "chain step ${i} churn degraded (non-comparable delta)" + fi + drain_backlog 5 600 + step_before="$(inv_size)"; step_before="${step_before:-0}" + step_t0="$(date +%s.%N)" + case "${TOOL}" in + walrus) run_tool env WALG_DELTA_MAX_STEPS="${DELTA_MAX_STEPS}" WALG_DELTA_ORIGIN=LATEST \ + "${WALRUS_BIN}" backup-push "${PGDATA_DIR}" ;; + walg) run_tool env WALG_DELTA_MAX_STEPS="${DELTA_MAX_STEPS}" WALG_DELTA_ORIGIN=LATEST \ + "${WALG_BIN}" backup-push "${PGDATA_DIR}" ;; + pgbackrest) sudo -u postgres pgbackrest --stanza="${PGBACKREST_STANZA}" backup --type=incr ;; + esac + step_t1="$(date +%s.%N)" + step_after="$(inv_size)"; step_after="${step_after:-0}" + step_bytes=$(( step_after - step_before )); (( step_bytes < 0 )) && step_bytes=0 + step_s="$(awk -v a="${step_t0}" -v b="${step_t1}" 'BEGIN{printf "%.3f", b-a}')" + step_mbps="$(awk -v by="${step_bytes}" -v s="${step_s}" 'BEGIN{printf "%.2f",(s>0)?by/1e6/s:0}')" + push_s_total="$(awk -v a="${push_s_total}" -v b="${step_s}" 'BEGIN{printf "%.3f", a+b}')" + BYTES=$(( BYTES + step_bytes )) + log "chain step ${i}: elapsed=${step_s}s delta=${step_bytes} B (${step_mbps} MB/s)" + chain_rows+="step=${i} elapsed_s=${step_s} bytes=${step_bytes} mb_s=${step_mbps}"$'\n' + done + + log "chain restore: 
backup-fetch LATEST (walks full + ${DELTA_MAX_STEPS} deltas) -> ${RESTORE_DIR}" + run_root "${RESTORE_DIR}" <<'REMOTE' +set -euo pipefail +RESTORE_DIR="$1" +rm -rf "${RESTORE_DIR}" +install -d -o postgres -g postgres "${RESTORE_DIR}" +REMOTE + restore_t0="$(date +%s.%N)" + case "${TOOL}" in + walrus) run_tool "${WALRUS_BIN}" backup-fetch "${RESTORE_DIR}" LATEST ;; + walg) run_tool "${WALG_BIN}" backup-fetch "${RESTORE_DIR}" LATEST ;; + pgbackrest) + sudo -u postgres pgbackrest --stanza="${PGBACKREST_STANZA}" \ + --pg1-path="${RESTORE_DIR}" --type=none restore ;; + esac + restore_t1="$(date +%s.%N)" + restore_s="$(awk -v a="${restore_t0}" -v b="${restore_t1}" 'BEGIN{printf "%.3f", b-a}')" + restore_bytes="$(sudo du -sb "${RESTORE_DIR}" | awk '{print $1}')" + log "chain restore: elapsed=${restore_s}s restored=${restore_bytes} B" + sudo rm -rf "${RESTORE_DIR}" + + run_root "${CHAIN_METRICS}" "${TOOL}" "${RUN_ID}" "${DELTA_MAX_STEPS}" \ + "${push_s_total}" "${BYTES}" "${restore_s}" "${restore_bytes}" "${chain_rows}" <<'REMOTE' +set -euo pipefail +CHAIN_METRICS="$1"; TOOL="$2"; RUN_ID="$3"; STEPS="$4"; PUSH_S_TOTAL="$5" +TOTAL_BYTES="$6"; RESTORE_S="$7"; RESTORE_BYTES="$8"; ROWS="$9" +{ + echo "op=backup-delta-chain" + echo "tool=${TOOL}" + echo "run_id=${RUN_ID}" + echo "delta_origin=LATEST" + echo "chain_steps=${STEPS}" + printf '%s' "${ROWS}" + echo "push_s_total=${PUSH_S_TOTAL}" + echo "chain_delta_bytes=${TOTAL_BYTES}" + echo "restore_s=${RESTORE_S}" + echo "restore_bytes=${RESTORE_BYTES}" +} >"${CHAIN_METRICS}" +chown postgres:postgres "${CHAIN_METRICS}" 2>/dev/null || true +cat "${CHAIN_METRICS}" +REMOTE + ;; backup-fetch) log "restore LATEST -> ${RESTORE_DIR}" run_root "${RESTORE_DIR}" <<'REMOTE' diff --git a/bench/scripts/sut/05_install_pgbackrest.sh b/bench/scripts/sut/05_install_pgbackrest.sh index cb130bb..f09302e 100755 --- a/bench/scripts/sut/05_install_pgbackrest.sh +++ b/bench/scripts/sut/05_install_pgbackrest.sh @@ -19,7 +19,7 @@ set -euo pipefail 
BUCKET="${BUCKET:-${1:-}}" -UPLOAD_CONCURRENCY="${UPLOAD_CONCURRENCY:-${2:-16}}" +UPLOAD_CONCURRENCY="${UPLOAD_CONCURRENCY:-${2:-4}}" AWS_REGION="${AWS_REGION:-us-east-1}" STANZA="${PGBACKREST_STANZA:-walbench}" REPO_PATH="${PGBACKREST_REPO_PATH:-/pgbackrest-bench}" diff --git a/bench/scripts/sut/11_write_walg_env.sh b/bench/scripts/sut/11_write_walg_env.sh index ea18c35..94edb74 100755 --- a/bench/scripts/sut/11_write_walg_env.sh +++ b/bench/scripts/sut/11_write_walg_env.sh @@ -14,7 +14,11 @@ set -euo pipefail BUCKET="${BUCKET:-${1:-}}" -UPLOAD_CONCURRENCY="${UPLOAD_CONCURRENCY:-${2:-16}}" +UPLOAD_CONCURRENCY="${UPLOAD_CONCURRENCY:-${2:-4}}" +# backup-fetch / wal-fetch download fan-out; defaults to upload concurrency so a +# single concurrency sweep tunes both directions (override DOWNLOAD_CONCURRENCY +# to decouple). +DOWNLOAD_CONCURRENCY="${DOWNLOAD_CONCURRENCY:-${UPLOAD_CONCURRENCY}}" ENV_FILE="${ENV_FILE:-/etc/postgresql/wal-g.env}" AWS_REGION="${AWS_REGION:-us-east-1}" COMPRESSION_METHOD="${WALG_COMPRESSION_METHOD:-lz4}" @@ -81,7 +85,7 @@ if [[ -z "${ACCESS_KEY}" || -z "${SECRET_KEY}" ]]; then exit 1 fi -echo "=== Writing ${ENV_FILE} (UPLOAD_CONCURRENCY=${UPLOAD_CONCURRENCY}) ===" +echo "=== Writing ${ENV_FILE} (UPLOAD_CONCURRENCY=${UPLOAD_CONCURRENCY} DOWNLOAD_CONCURRENCY=${DOWNLOAD_CONCURRENCY}) ===" install -d -o postgres -g postgres -m 0755 "$(dirname "${ENV_FILE}")" umask 077 tmp="$(mktemp)" @@ -90,6 +94,7 @@ WALG_S3_PREFIX=${WALG_S3_PREFIX} AWS_REGION=${AWS_REGION} WALG_COMPRESSION_METHOD=${COMPRESSION_METHOD} WALG_UPLOAD_CONCURRENCY=${UPLOAD_CONCURRENCY} +WALG_DOWNLOAD_CONCURRENCY=${DOWNLOAD_CONCURRENCY} PGHOST=/var/run/postgresql PGDATA=/dat/18/data AWS_ACCESS_KEY_ID=${ACCESS_KEY} diff --git a/docs/DESIGN.md b/docs/DESIGN.md index a13b503..cbbb256 100644 --- a/docs/DESIGN.md +++ b/docs/DESIGN.md @@ -1,31 +1,25 @@ ## Goal Functional parity with wal-g's Postgres surface so an on-prem shop can -swap binaries without touching `archive_command`, sentinels, 
bucket -layout, or operator runbooks. North star: a backup written by either -tool restorable by the other. +swap binaries without touching `archive_command`, sentinels, or bucket +layout. Backups written by either tool are restorable by the other. -Optimized for no-overcommit hosts: every pipeline stage is streaming, -no full-segment or full-file buffering. +Optimized for no-overcommit hosts (https://www.postgresql.org/docs/current/kernel-resources.html#LINUX-MEMORY-OVERCOMMIT): every pipeline stage is streaming, with no full-segment or full-file buffering. ## Runtime -Runtime flavor is picked per command before construction -(`Cli::worker_threads`), overridable via `--threads` / `WALG_THREADS`; -1 builds current-thread, >1 multi-thread with that many workers. +Runtime flavor is picked per command (`Cli::worker_threads`), +overridable via `--threads` / `WALG_THREADS`: 1 builds a current-thread +runtime, >1 multi-thread with that many workers. -Default 1 for most commands: `wal-push` as `archive_command` runs once -per 16 MB segment; multi-thread runtime would spawn worker threads + -per-thread malloc arenas for nothing. Daemon mode stays at 1 since I/O -is the bottleneck. - -Commands whose fan-out does real CPU work per task (compress, encrypt, -checksum, TLS) default to multi-thread capped by the matching -concurrency knob, otherwise `WALG_UPLOAD_CONCURRENCY` tasks timeshare -one core and uploads overlap only on network: `backup-push` +Default 1 for most commands. `wal-push` runs once per 16 MB segment as +`archive_command`, so extra worker threads would only add per-thread +malloc arenas; daemon mode stays at 1 (I/O bound). Commands with real +per-task CPU work (compress, encrypt, checksum, TLS) default to +multi-thread capped by the matching concurrency knob: `backup-push` min(cores, upload concurrency); `backup-fetch` / `wal-prefetch` / -`wal-restore` min(cores, download concurrency). Worker count stays -bounded so arenas + stacks don't balloon and postgres keeps its cores. +`wal-restore` min(cores, download concurrency). 
Bounded so arenas + +stacks stay small and postgres keeps its cores. ## Storage trait @@ -35,116 +29,119 @@ async fn get(&self, key: &str) -> Result; ``` `AsyncReader = Pin>`. Compression and -encryption are also `AsyncReader`s, so push pipelines as -`File → compress → encrypt → storage.put` without materializing -anything. `size_hint` lets s3 pick single-PUT vs multipart, left unset -under compression/encryption since variable-length output makes the -hint lie, then the unknown-size path takes over (see S3). +encryption are also `AsyncReader`s, so a push pipeline is +`File → compress → encrypt → storage.put` with nothing materialized. +`size_hint` lets S3 pick single-PUT vs multipart, left unset under +compression/encryption (variable-length output makes the hint lie) so +the unknown-size path takes over. Pipeline order matches wal-g: push `raw → compress → encrypt → storage`, -fetch inverse. Sentinel / metadata JSON bypass compress+encrypt entirely -(wal-g `UploadDto` behavior), so `backup-list` and `delete` work against -an encrypted bucket without the key. +fetch inverse. Sentinel / metadata JSON bypass compress+encrypt (wal-g +`UploadDto` behavior), so `backup-list` and `delete` work against an +encrypted bucket without the key. ### S3 Hand-rolled SigV4 instead of `aws-sdk-rust` (multi-MB dependency -footprint) or `object_store` (arrow deps, abstracts away streaming -control). UNSIGNED-PAYLOAD over HTTPS streams bodies without hashing up -front, TLS covers integrity. Multipart parts buffer in memory so a -transient retry replays identical bytes, the safety net since -UNSIGNED-PAYLOAD leaves the signature off the body. Unknown-size bodies -buffer up to the single-PUT cap and skip multipart's -create/upload/complete trio when they fit, so a compressed 16 MiB -segment lands in one PUT. 
- -Credentials resolve as a small chain (`storage/creds.rs`): explicit -static keys (`AWS_ACCESS_KEY_ID`/`_SECRET_ACCESS_KEY`, optional -`AWS_SESSION_TOKEN`), else the EC2 metadata service (IMDS). IMDS uses -IMDSv2 (token PUT then authenticated GET, falling back to unauthenticated -v1 if the token is refused), caching temporary creds and refetching 5 min -before expiry; the lock spans the fetch so concurrent signers single-flight. -Set `AWS_EC2_METADATA_DISABLED` to force the static-only path, -`AWS_EC2_METADATA_SERVICE_ENDPOINT` to override the link-local address. -Rotating IMDS keys would break the key-based server-side-copy identity, so -IMDS folds to a constant identity. Profile/shared-credentials files and STS -web-identity (`AWS_WEB_IDENTITY_TOKEN_FILE`) are not implemented. +footprint) or `object_store` (arrow deps, hides streaming control). +UNSIGNED-PAYLOAD over HTTPS streams bodies without hashing up front, TLS +covers integrity. Multipart parts buffer in memory so a transient retry +replays identical bytes, the safety net since UNSIGNED-PAYLOAD leaves +the body unsigned. Unknown-size bodies buffer up to the single-PUT cap +and skip the multipart create/upload/complete trio when they fit, so a +compressed 16 MiB segment lands in one PUT. + +Credentials resolve as a chain (`storage/creds.rs`): static keys +(`AWS_ACCESS_KEY_ID`/`_SECRET_ACCESS_KEY`, optional `AWS_SESSION_TOKEN`), +else IMDSv2 (token PUT then authenticated GET, falling back to +unauthenticated v1 if the token is refused), caching temporary creds and +refetching 5 min before expiry; the lock spans the fetch so concurrent +signers single-flight. `AWS_EC2_METADATA_DISABLED` forces the +static-only path, `AWS_EC2_METADATA_SERVICE_ENDPOINT` overrides the +link-local address. Rotating IMDS keys would break the key-based +server-side-copy identity, so IMDS folds to a constant identity. +Profile/shared-credentials files and STS web-identity +(`AWS_WEB_IDENTITY_TOKEN_FILE`) are not implemented. 
### GCS -Service-account JWT (RS256 via aws-lc-rs) exchanged for OAuth bearer, -cached until 60 s before expiry. Uploads stream via `uploadType=media` -chunked transfer. Resumable uploads and metadata-server auth not -implemented (see PLAN.md). +Service-account JWT (RS256 via aws-lc-rs) exchanged for an OAuth bearer, +cached until 60 s before expiry. Uploads stream via `uploadType=media`. +Resumable uploads and metadata-server auth not implemented. ### Retry classification -`StorageError::Http { status, body }` + `Transport` let `is_transient()` -classify retryable failures. Reads retry unconditionally on transient. -The `RetryingStorage` wrapper retries small bounded-size `put`s -(sentinels, manifests, history files) by buffering the body once; -larger or unknown-size streams pass through to S3's own in-place retry, -which replays its per-PUT / per-part buffer. `fs` skips the wrapper, no -transient classes worth wrapping. +`is_transient()` classifies `StorageError::Http { status, body }` + +`Transport`. Reads retry unconditionally on transient. `RetryingStorage` +retries small bounded-size `put`s (sentinels, manifests, history files) +by buffering the body once; larger or unknown-size streams pass through +to S3's own in-place retry, which replays its per-PUT / per-part buffer. +`fs` skips the wrapper. ## Compression `async_compression` bufread encoders chain as -`File → BufReader → Encoder → put`, no thread per stream. First -iteration used `spawn_blocking` + mpsc around sync zstd: worked, but -143 MB VmPeak vs 7.3 MB after the switch. - -`wal-fetch` probes the configured extension first, then `.zst`, then -bare, then remaining codec extensions, handling buckets with +`File → BufReader → Encoder → put`: no thread per stream, resident +memory stays tiny. `wal-fetch` probes the configured extension first, +then the other codec extensions and bare, handling buckets with mixed-method writes across a compression-setting migration. 
## Replication client Speaks the PG replication wire protocol directly, no `pg_basebackup` subprocess, no disk spool. PG14- and PG15+ BASE_BACKUP wire forms both -handled. Auth: trust, cleartext, SCRAM-SHA-256; MD5 rejected. Without -`--pgdata`, `backup-push` is purely network-driven (sidecar host needs -no filesystem access, `data_dir` filled from `SHOW data_directory`). -`PGHOST` starting with `/` dials a Unix socket per libpq convention, -skipping TLS. +handled. Auth: trust, cleartext, SCRAM-SHA-256; MD5 rejected. `PGHOST` +starting with `/` dials a Unix socket per libpq convention, skipping TLS. A tokio task owns the connection and emits `BackupEvent`s over mpsc; each archive carries an mpsc of `Bytes` chunks wrapped as `ChannelReader`. -Backpressure flows naturally: upload stalls → channel fills → pump's -send blocks → TCP window closes. `ChannelReader` loops on empty chunks, -a real PG 13 stream contains empty CopyData frames mid-stream and an -empty poll-fill reads as EOF per the AsyncRead contract. +Backpressure flows naturally: upload stall → channel fills → pump's send +blocks → TCP window closes. `ChannelReader` loops on empty chunks, since +a real PG 13 stream carries empty CopyData frames mid-stream and an empty +poll-fill would otherwise read as EOF. ### TLS `sslmode` mirrors libpq exactly: `disable | allow | prefer (default) | require | verify-ca | verify-full`. `prefer`/`require` encrypt without -authenticating (matches libpq, same operator surprise). `verify-ca` -delegates to `WebPkiServerVerifier`, suppressing only -`NotValidForName{,Context}`. +authenticating (matches libpq). `verify-ca` delegates to +`WebPkiServerVerifier`, suppressing only `NotValidForName{,Context}`. -Client certificate auth (mutual TLS): set `PGSSLCERT` and `PGSSLKEY` to a -PEM cert chain and unencrypted private key (PKCS#8 / PKCS#1 / SEC1) and -walrus presents them in every TLS mode. 
Both must be set together; -encrypted keys (`PGSSLPASSWORD`) and libpq's `~/.postgresql/postgresql.{crt,key}` -default location aren't honored, matching the env-only `PGSSLROOTCERT` handling. +Client certificate auth (mTLS): set `PGSSLCERT` and `PGSSLKEY` to a PEM +cert chain and unencrypted private key (PKCS#8 / PKCS#1 / SEC1), presented +in every TLS mode; both required together. Encrypted keys +(`PGSSLPASSWORD`) and libpq's `~/.postgresql/postgresql.{crt,key}` default +location aren't honored, matching the env-only `PGSSLROOTCERT` handling. ## Tar streamer -One `spawn_blocking` task per archive bridges async→sync via -`SyncIoBridge`, re-tars with tablespace path remap, rotates parts at -`WALG_TAR_SIZE_THRESHOLD`, tees `global/pg_control` into its own part -uploaded last, collects per-file metadata. +The BASE_BACKUP path uses `astral-tokio-tar` async archive and builder +APIs. One task per archive re-tars with tablespace path remap, rotates +parts at `WALG_TAR_SIZE_THRESHOLD`, tees `global/pg_control` into its own +part uploaded last, and collects per-file metadata. Part bytes flow +through bounded mpsc chunks into upload workers, overlapping +compression/encryption/storage with re-tarring without a sync bridge +thread. + +With a positional `PGDATA`, `backup-push` reads the local data directory +instead of BASE_BACKUP: it brackets the copy with `pg_backup_start` / +`pg_backup_stop`, walks `$PGDATA` plus tablespace symlink targets, and +runs `WALG_UPLOAD_CONCURRENCY` pack workers each streaming one +size-bounded tar part. This is the throughput path for local full and +delta backups; the replication path remains a single source stream +bounded by the BASE_BACKUP protocol. Without `PGDATA` the push is purely +network-driven (`data_dir` from `SHOW data_directory`), so a sidecar host +needs no filesystem access. 
`backup-fetch` extracts manually rather than via `Archive::unpack`: the tar crate's canonicalize guard refuses writes through `pg_tblspc/` -symlinks, which legitimate PG restores require. `..`-traversal still -blocked. Tablespace symlinks created before extraction so the first -entry under `pg_tblspc/<oid>/` can't materialize a real directory there. +symlinks that legitimate restores require. `..`-traversal stays blocked. +Tablespace symlinks are created before extraction so the first entry +under `pg_tblspc/<oid>/` can't materialize a real directory there. -Uploads drain through a `JoinSet` bounded by -`Semaphore(WALG_UPLOAD_CONCURRENCY)`, JoinSet over `FuturesUnordered` -so the bail path aborts in-flight tasks instead of detaching them. +BASE_BACKUP uploads drain through `BoundedTasks`, filesystem-source +workers through a `JoinSet`; both are bounded by `WALG_UPLOAD_CONCURRENCY`, +and bail paths abort in-flight work instead of detaching it. ## Delta backups @@ -154,86 +151,76 @@ Two per-file payload formats, magic-dispatched on apply: - PG17 native INCREMENTAL (magic `0xd3ae1f0d`), built from `pg_wal/summaries/*.summary` via `--delta-from-wal-summaries` -`IncrementBodyReader` streams header + dirty pages with one BLCKSZ -scratch page, no file-sized buffer regardless of dirty density (naive -buffering worst case: 1 GiB resident per concurrent paged file). Three -outcomes per paged file: incremented, skipped (entry omitted, metadata -record kept), passthrough. Dirty blocks past EOF filtered, apply-side +`IncrementBodyReader` streams header + dirty pages with one BLCKSZ scratch +page, so resident memory is independent of dirty density. Three outcomes +per paged file: incremented, skipped (entry omitted, metadata record +kept), passthrough. Dirty blocks past EOF are filtered, since apply-side `read_exact` would underflow otherwise. -Map build fails closed: on any WAL-walk error, warn + fall back to full -*and* leave `increment_from` unset. 
The sentinel never claims a delta -the bucket can't deliver. Fetch walks `increment_from` root→leaf, -capped at 64 steps + visited-set against cyclic sentinels; only the -leaf's tablespace `Spec` is applied (it's a property of pgdata, not -LSN). - -In-memory delta map is `BTreeMap`, matching -wal-g's `map[RelFileNode]*roaring.Bitmap`. A `BTreeSet` costs a flat -~13 B/block regardless of density, so a large-rewrite delta (VACUUM FULL, -CREATE INDEX, bulk load: 100 GiB rel ≈ 13 M blocks) balloons to ~160 MB -resident; roaring run/bitmap-compresses dense rewrites to ~1.6 MB and -keeps sparse OLTP deltas comparable. The on-disk format is a flat tuple -list either way, so it costs nothing in interop. - -The sidecar (`_delta`) is never materialized as a struct: the -running working file accumulates location tuples append-only across the -group's 16 segments, then completion appends the boundary-record tuples, -terminator, and parser seed and streams the file to the bucket. The map -build folds each sidecar's tuples back in one at a time. So neither the -sidecar write nor the map read holds a whole group's locations in memory. - -Walparser operates on byte slices rather than wal-g's reader-of-reader -chains; one segment is 16 MiB and already in memory. wal summaries -parsing cross-referenced field-by-field against postgres -`src/common/blkreftable.c` (see WALG_COMPAT.md). +Map build fails closed: any WAL-walk error warns, falls back to full, and +leaves `increment_from` unset, so the sentinel never claims a delta the +bucket can't deliver. Fetch walks `increment_from` root→leaf, capped at +64 steps with a visited-set against cyclic sentinels; only the leaf's +tablespace `Spec` is applied (a property of pgdata, not LSN). + +The in-memory map is `BTreeMap`, matching +wal-g's `map[RelFileNode]*roaring.Bitmap`; roaring keeps dense rewrites +(VACUUM FULL, CREATE INDEX, bulk load) from ballooning resident memory +while staying comparable on sparse OLTP deltas. 
The on-disk format is a +flat tuple list either way, so it costs nothing in interop. + +The sidecar (`_delta`) is never materialized as a struct: a working +file accumulates location tuples append-only across the group's 16 +segments, then completion appends the boundary-record tuples, terminator, +and parser seed and streams the file out. Map build folds each sidecar's +tuples back in one at a time, so neither the write nor the read holds a +whole group's locations in memory. + +Walparser operates on byte slices (one segment is 16 MiB, already in +memory). WAL-summary and native INCREMENTAL parsing cross-referenced +field-by-field against postgres `src/common/blkreftable.c`, +`src/backend/backup/basebackup.c`, and +`src/bin/pg_combinebackup/reconstruct.c`. ## Encryption -libsodium `crypto_secretstream_xchacha20poly1305` via `dryoc` -(pure Rust, no C toolchain). Key transforms `none | hex | base64` -mirror wal-g, `none` requires ≥ 25 bytes so low-entropy keys can't -sneak through the legacy path. - -OpenPGP intentionally unsupported. rPGP pulls dozens of transitives and -its async wrapper buffers whole payloads, breaking the streaming -contract; symmetric AEAD already covers the single-tenant on-prem -threat model; a migrating PGP bucket re-encrypts once. To prevent -silent plaintext regressions, any `WALG_PGP_*` env var is a hard error -at startup. +libsodium `crypto_secretstream_xchacha20poly1305` via `dryoc` (pure Rust, +no C toolchain). Key transforms `none | hex | base64` mirror wal-g; +`none` requires ≥ 25 bytes so low-entropy keys can't sneak through. -Buckets don't tag objects encrypted-or-not (matches wal-g), so the key -must stay consistently configured per prefix; mismatch fails loudly on -first read. +OpenPGP intentionally unsupported: rPGP pulls dozens of transitives and +its async wrapper buffers whole payloads, breaking the streaming contract; +symmetric AEAD already covers the single-tenant on-prem threat model. 
Any +`WALG_PGP_*` env var is a hard startup error to prevent silent plaintext +regressions. Buckets don't tag objects encrypted-or-not (matches wal-g), +so the key must stay consistently configured per prefix; a mismatch fails +loudly on first read. ## Retention & copy -Objects ordered by `(timeline, global_seg_no)` extracted from the -24-hex segment substring, wal-g's `timelineAndSegmentNoLess`. Permanent -backups reserve WAL `[(start_lsn-1)/seg_size, (finish_lsn-1)/seg_size]` -inclusive. `delete` is dry-run by default, `--confirm` executes; the -plan struct is returned so tests assert without parsing logs. -`delete target` BFS-walks the increment graph for dependants. +Objects ordered by `(timeline, global_seg_no)` from the 24-hex segment +substring (wal-g's `timelineAndSegmentNoLess`). Permanent backups reserve +WAL `[(start_lsn-1)/seg_size, (finish_lsn-1)/seg_size]` inclusive. +`delete` is dry-run by default, `--confirm` executes; the plan struct is +returned so tests assert without parsing logs. `delete target` BFS-walks +the increment graph for dependants. `copy` reuses source credentials for the destination URI, stream-through -for cross-backend; WAL window `[start_seg, finish_seg]` copied with a -single backup, `--with-history` extends to all WAL ≤ finish_lsn. +for cross-backend. A single backup's WAL window is `[start_seg, +finish_seg]`; `--with-history` extends to all WAL ≤ finish_lsn. ## Daemon Byte-compatible with wal-g's Unix-socket protocol (`[type][u16 BE len][body]`), so `archive_command` can point at either -tool's daemon-client unchanged. Implemented ops: Check, WalPush, -WalFetch. - -PG's archiver is serial, so a standing `Uploader` -(`src/daemon/uploader.rs`) keeps a look-ahead pool saturated across -invocations. Foreground `WalPush(N)` acks only once `N` is durable -(no early ack), but `N+1..` pre-upload concurrently -(`lookahead = WALG_UPLOAD_CONCURRENCY - 1`, serial and byte-identical -at 1). 
Replaces wal-g's per-invocation `BgUploader` + on-disk marker -dir with an in-memory inflight/done map deduping foreground pushes -against look-ahead. See PLAN.md. +tool's daemon-client unchanged. Implemented ops: Check, WalPush, WalFetch. + +PG's archiver is serial, so a standing `Uploader` keeps a look-ahead pool +saturated across invocations. Foreground `WalPush(N)` acks only once `N` +is durable; `N+1..` pre-upload concurrently +(`lookahead = WALG_UPLOAD_CONCURRENCY - 1`, serial and byte-identical at +1). Replaces wal-g's per-invocation `BgUploader` + on-disk marker dir with +an in-memory inflight/done map. ## wal-receive @@ -244,15 +231,3 @@ stay consistent with archive_command pushes. Shutdown finalizes the in-flight segment as `.partial` locally, never uploaded, matching `pg_receivewal`. Status updates on a 10 s cadence, immediate on server-requested-reply keepalives. - -## Dependency budget - -Recurring theme: prefer hand-rolling small fixed formats over pulling -crates. No `regex` (summary filenames + tablespace prefixes are trivial -decodes), no aws-sdk. `roaring` is the one earned exception (+`bytemuck`, -both pure-Rust leaves): a stdlib `BTreeSet` can't compress dense deltas, -so it broke the no-overcommit budget by ~100x on large rewrites (see -Delta backups). `quick-xml` parses S3 list + multipart responses -(pull-parser does charset decode + entity unescape, replacing earlier -hand-rolled string extraction). Single crypto stack on aws-lc-rs -(rustls provider + GCS RS256), no transitive ring. diff --git a/docs/WALG_COMPAT.md b/docs/WALG_COMPAT.md index 370f8be..a68bee5 100644 --- a/docs/WALG_COMPAT.md +++ b/docs/WALG_COMPAT.md @@ -10,34 +10,44 @@ the bump PR, not master). 
## Shared on-bucket format -- Key layout version `005`: `wal_005/[.]`, - `basebackups_005//tar_partitions/part_NNN.tar.`, - `pg_control.tar.` tee, sentinel at - `basebackups_005/_backup_stop_sentinel.json` (one level above - the per-backup dir, same asymmetry as wal-g) -- Sentinel mirrors `BackupSentinelDtoV2` field-for-field, PascalCase - keys, `Spec` for tablespaces; every Option field tolerant-deserializes - so sentinels from either tool parse -- `files_metadata.json` schema (`Files`, `TarFileSets`) -- Delta naming `base_<24hex>_D_`; chain discovered via - sentinel `IncrementFrom`, format detected per-file by magic byte, no - sentinel format flag (wal-g convention) -- `wi1` increment format and PG17 native INCREMENTAL format - (magic `0xd3ae1f0d`); native layout verified field-by-field against - postgres source (`src/common/blkreftable.c`, - `src/backend/backup/basebackup.c`, - `src/bin/pg_combinebackup/reconstruct.c`) -- libsodium framing: 24-byte secretstream header, 8 KiB plaintext - chunks, 17-byte per-chunk overhead, explicit FINAL chunk on close; - a wire-format pin test fails on any drift -- Prefetch dir layout `pg_wal/.wal-g/prefetch/{running/,}` so a sidecar - can run either tool against the same pg_wal -- Daemon Unix-socket protocol byte format (Check / WalPush / WalFetch) -- `delete` mode + modifier vocabulary (`before` / `retain` / - `everything` / `target` / `garbage`; `FULL`, `FIND_FULL`, `FORCE`, - `ARCHIVES`, `BACKUPS`, `--after`), permanent-backup WAL reservation, - `--confirm` gate -- Env vars follow `WALG_*` / `PG*` / `AWS_*` / `GOOGLE_*` naming +The on-bucket format is wal-g's verbatim, so this doc covers only the +gaps. 
Matched without further note: key layout `005`, the +`BackupSentinelDtoV2` sentinel (PascalCase, tolerant-deserialized so +either tool's sentinels parse), `files_metadata.json`, delta naming +(`base_<24hex>_D_`, chain via sentinel `IncrementFrom`, +format magic-detected per file), `wi1` and PG17 native INCREMENTAL +payloads, libsodium secretstream framing, prefetch dir layout, the +daemon Unix-socket protocol, the `delete` mode + modifier vocabulary, +and `WALG_*` / `PG*` / `AWS_*` / `GOOGLE_*` env naming. + +## Delta page selection + +Both tools emit byte-identical `wi1` / native increments (see above), so a +delta produced by either restores under either. They diverge only in how the +producer decides *which* blocks an increment carries. + +wal-g defaults to a full scan (`WALG_USE_WAL_DELTA` is false by default): it +reads every page of every paged relation and ships a page only if the page is +new (`pd_upper == 0`) or its header LSN is at or past the increment-base LSN +(`incremental_page_reader.go:SelectNewValidPage`, a predicate lifted from +PostgreSQL's own page-validity checks and refined on pgsql-hackers). This is +self-validating: it needs no WAL and re-derives "changed" from each page's own +header, so a gap in the archived WAL cannot silently drop a changed block. +Setting `WALG_USE_WAL_DELTA=true` switches wal-g to instead trust a WAL-derived +changed-block bitmap (file-size gated, no per-page LSN recheck), warning and +falling back to the full scan when the bitmap can't be loaded +(`WALG_FORCE_WAL_DELTA` forbids that fallback). + +walrus implements only the map-trusting path. `classify_for_delta` ships +exactly the blocks the changed-block map reports, filtered to blocks within the +current file size, with no page-LSN recheck. 
The map is built from WAL +`_delta` sidecars (raw-WAL walk when a sidecar is missing) or from +`pg_walsummary` under `--delta-from-wal-summaries`; if it can't be built, walrus +produces a full backup rather than a scan-based delta. So for walrus +`WALG_USE_WAL_DELTA` only governs sidecar recording during wal-push, not +selection — backup-push always selects blocks from a WAL/summary map regardless +— and a walrus delta is correct only if that map is complete, whereas wal-g's +default would still catch a missed block by its page LSN. ## Deliberate divergences @@ -82,7 +92,8 @@ which accepts more connection variables: Partial support: - `PGDATA`: walrus uses it only for daemon path resolution, not as - backup-push data directory config + backup-push data directory config. `backup-push <PGDATA>` positional + syntax matches wal-g CLI behavior - `PGHOST`, `PGPORT`: walrus supports single host/port only, not pgx multihost semantics @@ -209,78 +220,11 @@ GCE/GKE metadata-server auth is not implemented.
### Storage backends not implemented -Azure: - -- `WALG_AZ_PREFIX` -- `WALE_AZ_PREFIX` -- `AZURE_STORAGE_ACCOUNT` -- `AZURE_STORAGE_ACCESS_KEY` -- `AZURE_STORAGE_SAS_TOKEN` -- `AZURE_CLIENT_ID` -- `AZURE_TENANT_ID` -- `AZURE_CLIENT_SECRET` -- `AZURE_ENVIRONMENT_NAME` -- `AZURE_ENDPOINT_SUFFIX` -- `AZURE_BUFFER_SIZE` -- `WALG_AZURE_BUFFER_SIZE` -- `AZURE_MAX_BUFFERS` -- `WALG_AZURE_MAX_BUFFERS` -- `AZURE_TRY_TIMEOUT` -- `AZURE_BLOB_STORE_API_VERSION` - -Alicloud OSS: - -- `WALG_OSS_PREFIX` -- `WALE_OSS_PREFIX` -- `OSS_ACCESS_KEY_ID` -- `OSS_ACCESS_KEY_SECRET` -- `OSS_SESSION_TOKEN` -- `OSS_ENDPOINT` -- `OSS_REGION` -- `OSS_ROLE_ARN` -- `OSS_ROLE_SESSION_NAME` -- `OSS_SKIP_VALIDATION` -- `OSS_MAX_RETRIES` -- `OSS_CONNECT_TIMEOUT` -- `OSS_UPLOAD_PART_SIZE` -- `OSS_COPY_PART_SIZE` - -Swift: - -- `WALG_SWIFT_PREFIX` -- `WALE_SWIFT_PREFIX` -- `OS_AUTH_URL` -- `OS_USERNAME` -- `OS_PASSWORD` -- `OS_TENANT_NAME` -- `OS_REGION_NAME` - -SSH storage: - -- `WALG_SSH_PREFIX` -- `WALE_SSH_PREFIX` -- `SSH_PORT` -- `SSH_USERNAME` -- `SSH_PASSWORD` -- `SSH_PRIVATE_KEY_PATH` - -File storage alias: - -- `WALE_FILE_PREFIX` - -### Failover storage - -- `WALG_FAILOVER_STORAGES` -- `WALG_FAILOVER_STORAGES_CHECK` -- `WALG_FAILOVER_STORAGES_CHECK_TIMEOUT` -- `WALG_FAILOVER_STORAGES_CHECK_SIZE` -- `WALG_FAILOVER_STORAGES_CACHE_LIFETIME` -- `WALG_FAILOVER_STORAGES_CACHE_EMA_ALIVE_LIMIT` -- `WALG_FAILOVER_STORAGES_CACHE_EMA_DEAD_LIMIT` -- `WALG_FAILOVER_STORAGES_CACHE_EMA_ALPHA_ALIVE_MAX` -- `WALG_FAILOVER_STORAGES_CACHE_EMA_ALPHA_ALIVE_MIN` -- `WALG_FAILOVER_STORAGES_CACHE_EMA_ALPHA_DEAD_MAX` -- `WALG_FAILOVER_STORAGES_CACHE_EMA_ALPHA_DEAD_MIN` +Azure, Alicloud OSS, Swift, and SSH backends are absent (see the +divergence table), so all their env vars (`WALG_AZ_PREFIX` / `AZURE_*`, +`WALG_OSS_PREFIX` / `OSS_*`, `WALG_SWIFT_PREFIX` / `OS_*`, +`WALG_SSH_PREFIX` / `SSH_*`), the `WALE_FILE_PREFIX` file alias, and +failover storages (`WALG_FAILOVER_STORAGES*`) are unsupported. 
### Storage aliases diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 77ee5b4..7224aba 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -106,13 +106,14 @@ pub enum Cmd { #[arg(long)] target_user_data: Option, }, - /// Take a streaming base backup via the replication BASE_BACKUP protocol + /// Take a base backup /// /// Uses libpq env vars (PGHOST/PGPORT/PGUSER/PGPASSWORD/PGDATABASE). - /// Without --pgdata, the sentinel records the server-reported data_directory. + /// With PGDATA, reads local filesystem like wal-g. Without PGDATA, streams + /// through replication BASE_BACKUP and records server-reported data_directory. BackupPush { - /// Optional path to local PostgreSQL data directory (sentinel only) - #[arg(long)] + /// Optional path to local PostgreSQL data directory + #[arg(value_name = "PGDATA")] pgdata: Option, /// Mark this backup as permanent #[arg(long)] @@ -535,6 +536,18 @@ mod tests { assert_eq!(Format::from(IncrementFormatArg::Native), Format::Native); } + #[test] + fn backup_push_accepts_positional_pgdata() { + let cli = Cli::parse_from(["walrus", "backup-push", "/dat/18/data", "--full"]); + match cli.cmd { + Cmd::BackupPush { pgdata, full, .. 
} => { + assert_eq!(pgdata, Some(PathBuf::from("/dat/18/data"))); + assert!(full); + } + _ => panic!("expected backup-push"), + } + } + fn worker_threads_of(args: &[&str]) -> usize { Cli::parse_from(args).worker_threads().unwrap() } diff --git a/src/compression/mod.rs b/src/compression/mod.rs index 3ea4cc9..1d0447d 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -6,16 +6,18 @@ use std::pin::Pin; use async_compression::Level; +use async_compression::lz4::{BlockSize, EncoderParams}; use async_compression::tokio::bufread::{ BrotliDecoder, BrotliEncoder, GzipDecoder, GzipEncoder, Lz4Decoder, Lz4Encoder, LzmaDecoder, LzmaEncoder, ZstdDecoder, ZstdEncoder, }; use thiserror::Error; -use tokio::io::{AsyncRead, BufReader}; +use tokio::io::{AsyncBufRead, AsyncRead, BufReader}; -const BUF_CAPACITY: usize = 64 * 1024; +const BUF_CAPACITY: usize = 256 * 1024; pub type AsyncReader = Pin>; +pub type AsyncBufReader = Pin>; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Method { @@ -73,35 +75,35 @@ pub enum CompressionError { pub fn encode(method: Method, input: AsyncReader, level: i32) -> AsyncReader { match method { Method::None => input, - Method::Zstd => { - let buffered = BufReader::with_capacity(BUF_CAPACITY, input); - Box::pin(ZstdEncoder::with_quality(buffered, Level::Precise(level))) - } - Method::Brotli => { - let buffered = BufReader::with_capacity(BUF_CAPACITY, input); - Box::pin(BrotliEncoder::with_quality( - buffered, - Level::Precise(brotli_quality(level)), - )) - } - Method::Lz4 => { - let buffered = BufReader::with_capacity(BUF_CAPACITY, input); - Box::pin(Lz4Encoder::new(buffered)) - } - Method::Lzma => { - let buffered = BufReader::with_capacity(BUF_CAPACITY, input); - Box::pin(LzmaEncoder::with_quality( - buffered, - Level::Precise(lzma_preset(level)), - )) - } - Method::Gz => { - let buffered = BufReader::with_capacity(BUF_CAPACITY, input); - Box::pin(GzipEncoder::with_quality( - buffered, - Level::Precise(gzip_level(level)), - )) - } 
+ _ => encode_buffered( + method, + Box::pin(BufReader::with_capacity(BUF_CAPACITY, input)), + level, + ), + } +} + +pub fn encode_buffered(method: Method, input: AsyncBufReader, level: i32) -> AsyncReader { + match method { + Method::None => Box::pin(input), + Method::Zstd => Box::pin(ZstdEncoder::with_quality(input, Level::Precise(level))), + Method::Brotli => Box::pin(BrotliEncoder::with_quality( + input, + Level::Precise(brotli_quality(level)), + )), + Method::Lz4 => Box::pin(Lz4Encoder::with_quality_and_params( + input, + Level::Precise(level), + EncoderParams::default().block_size(BlockSize::Max256KB), + )), + Method::Lzma => Box::pin(LzmaEncoder::with_quality( + input, + Level::Precise(lzma_preset(level)), + )), + Method::Gz => Box::pin(GzipEncoder::with_quality( + input, + Level::Precise(gzip_level(level)), + )), } } @@ -194,6 +196,19 @@ mod tests { roundtrip(Method::Gz).await; } + #[tokio::test] + async fn encode_buffered_matches_encode() { + // encode_buffered feeds the codec an AsyncBufRead directly (no internal + // BufReader); output must still decode back to the original + let original = payload(); + let buffered: AsyncBufReader = Box::pin(Cursor::new(original.clone())); + let enc = encode_buffered(Method::Lz4, buffered, 3); + let mut dec = decode(Method::Lz4, enc); + let mut out = Vec::new(); + dec.read_to_end(&mut out).await.unwrap(); + assert_eq!(out, original); + } + #[tokio::test] async fn none_passthrough() { let mut r = encode(Method::None, reader(b"hello"), 3); diff --git a/src/config/mod.rs b/src/config/mod.rs index 26840a3..bbcd23a 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -60,7 +60,7 @@ pub enum StorageSettings { } impl Default for Settings { - /// Convenience defaults: single-worker fs pipeline at zstd-3, no throttling + /// Convenience defaults: single-worker fs pipeline at lz4, no throttling /// or encryption. 
Production constructs via [`Settings::from_env`]; this /// lets tests vary only the fields they exercise via `..Default::default()` fn default() -> Self { @@ -68,7 +68,7 @@ impl Default for Settings { storage: StorageSettings::Fs { path: String::new(), }, - compression: compression::Method::Zstd, + compression: compression::Method::Lz4, compression_level: 3, upload_concurrency: 1, upload_queue: 1, @@ -88,11 +88,11 @@ impl Settings { pub fn from_env() -> Result { let storage = detect_storage()?; let compression = match std::env::var("WALG_COMPRESSION_METHOD").ok().as_deref() { - None => compression::Method::Zstd, + None => compression::Method::Lz4, Some(s) => compression::Method::from_name(s) .ok_or_else(|| anyhow!("unsupported WALG_COMPRESSION_METHOD={s}"))?, }; - let compression_level = parse_env_int("WALG_COMPRESSION_LEVEL", 3)? as i32; + let compression_level = parse_env_int("WALG_COMPRESSION_LEVEL", 1)? as i32; let upload_concurrency = upload_concurrency_from_env()?; let upload_queue = parse_env_int("WALG_UPLOAD_QUEUE", 2)?.max(1) as usize; let download_concurrency = download_concurrency_from_env()?; diff --git a/src/daemon/uploader.rs b/src/daemon/uploader.rs index e8efb43..315863e 100644 --- a/src/daemon/uploader.rs +++ b/src/daemon/uploader.rs @@ -1,11 +1,14 @@ //! Standing background WAL uploader for the daemon archive path. //! //! PG's archiver is serial — it runs `archive_command` for one segment, waits -//! for success, then the next — so walrus's per-connection `wal-push` is -//! serial too and `WALG_UPLOAD_CONCURRENCY` is a no-op here: the archiver -//! falls behind a high WAL rate. wal-g closes the gap with a per-invocation -//! `BgUploader` (wal-g `internal/databases/postgres/bguploader.go`) that scans -//! `archive_status/` and uploads look-ahead segments concurrently. +//! for success, then the next — so a per-connection `wal-push` (one CLI +//! invocation per segment) is serial too, with no opening for +//! 
`WALG_UPLOAD_CONCURRENCY`, and the archiver falls behind a high WAL rate. +//! wal-g closes the gap with a per-invocation `BgUploader` (wal-g +//! `internal/databases/postgres/bguploader.go`) that scans `archive_status/` +//! and uploads look-ahead segments concurrently. This module is walrus's +//! standing-daemon equivalent, where `WALG_UPLOAD_CONCURRENCY` does take +//! effect via the look-ahead below. //! //! Because the walrus daemon is one long-lived process, bookkeeping stays //! in-memory: a shared `inflight` map dedups foreground pushes against diff --git a/src/main.rs b/src/main.rs index 37f8b0d..bd7e004 100644 --- a/src/main.rs +++ b/src/main.rs @@ -17,6 +17,7 @@ fn main() -> ExitCode { fn run(cli: walrus::cli::Cli) -> anyhow::Result<()> { let threads = cli.worker_threads()?; + cap_malloc_arenas(threads); // current_thread when 1: no worker threads, single glibc malloc arena // (see docs/DESIGN.md Runtime) let mut builder = if threads > 1 { @@ -37,3 +38,19 @@ fn run(cli: walrus::cli::Cli) -> anyhow::Result<()> { .build()? .block_on(cli.run()) } + +/// Cap glibc malloc arenas to the CPU count. glibc otherwise grows to 8*ncpu +/// arenas, each reserving a 64 MiB heap by mmap; once the multi-thread runtime +/// drives concurrent allocation that inflates virtual memory far past the +/// resident set. One arena per core keeps VSZ bounded without measurably +/// hurting allocator throughput. 
Must run before any worker thread spawns +#[cfg(all(target_os = "linux", target_env = "gnu"))] +fn cap_malloc_arenas(n: usize) { + // SAFETY: mallopt is thread-safe; called once on the main thread pre-runtime + unsafe { + libc::mallopt(libc::M_ARENA_MAX, n as libc::c_int); + } +} + +#[cfg(not(all(target_os = "linux", target_env = "gnu")))] +fn cap_malloc_arenas(_n: usize) {} diff --git a/src/pg/backup/fetch.rs b/src/pg/backup/fetch.rs index 12b0c0e..7f2a197 100644 --- a/src/pg/backup/fetch.rs +++ b/src/pg/backup/fetch.rs @@ -11,6 +11,7 @@ use std::sync::Arc; use anyhow::{Context, Result, anyhow, bail}; use futures::StreamExt; +use tokio_tar::Archive; use tokio_util::io::SyncIoBridge; use crate::compression; @@ -331,108 +332,133 @@ async fn unpack_part( let throttled = settings.throttle_network(body); let decrypted = settings.decrypt(throttled); let decoded = compression::decode(method, decrypted); - let dst: PathBuf = dst.to_path_buf(); - let res: std::io::Result<()> = tokio::task::spawn_blocking(move || { - let sync_r = SyncIoBridge::new(decoded); - let mut archive = tar::Archive::new(sync_r); - unpack_manual(&mut archive, &dst, &incremented) - }) - .await - .context("tar unpack join")?; - res.with_context(|| format!("unpack {key}"))?; + let mut archive = Archive::new(decoded); + let mut entries = archive.entries().context("open tar entries")?; + while let Some(entry) = entries.next().await { + let entry = entry.context("read tar entry")?; + unpack_entry(entry, dst, &incremented) + .await + .with_context(|| format!("unpack {key}"))?; + } tracing::info!(target = "backup_fetch", "unpacked {key}"); Ok(()) } -/// Manual tar extraction without the `tar` crate's "stays inside dst" -/// canonicalization check. PG restores legitimately need to follow -/// `pg_tblspc/` symlinks that point outside `dst` — the safe-extract -/// behavior in `tar::Archive::unpack` refuses that -fn unpack_manual( - archive: &mut tar::Archive, +/// Restore one tar entry.
PG restores legitimately follow `pg_tblspc/` +/// symlinks pointing outside `dst`, so we skip the tar crate's "stays inside +/// dst" canonicalization. File bodies bridge to a `spawn_blocking` apply path +/// because `apply_increment_in_place` needs `Seek` +async fn unpack_entry( + entry: tokio_tar::Entry, dst: &Path, incremented: &HashSet, -) -> std::io::Result<()> { - use std::io::Write; - - for entry in archive.entries()? { - let mut entry = entry?; - let path = entry.path()?.into_owned(); - // Skip absolute / parent-dir traversals - let rel = strip_to_relative(&path); - if rel.as_os_str().is_empty() { - continue; +) -> Result<()> +where + R: tokio::io::AsyncRead + Unpin + Send + 'static, +{ + let path = entry.path().context("entry path")?.into_owned(); + // Skip absolute / parent-dir traversals + let rel = strip_to_relative(&path); + if rel.as_os_str().is_empty() { + return Ok(()); + } + let target = dst.join(&rel); + let header = entry.header().clone(); + let etype = header.entry_type(); + if let Some(parent) = target.parent() { + tokio::fs::create_dir_all(parent).await?; + } + if etype.is_dir() { + return tokio::fs::create_dir(&target).await.or_else(|e| { + if e.kind() == std::io::ErrorKind::AlreadyExists { + Ok(()) + } else { + Err(e.into()) + } + }); + } + if etype.is_symlink() { + // pg_tblspc/ links are restored up-front from the sentinel + // TablespaceSpec (mapping-aware) before any part unpacks. Recreating + // them from a part entry would race the concurrent data fan-out — its + // remove+recreate window vs another part materializing the link's + // pg_tblspc//... contents — and would clobber a + // --tablespace-mapping relocation with the archived (backup-time) + // target. 
PG basebackup emits symlinks only under pg_tblspc, so the + // sentinel link is authoritative; skip the entry + if rel.parent() == Some(Path::new("pg_tblspc")) { + return Ok(()); } - let target = dst.join(&rel); - let header = entry.header().clone(); - let etype = header.entry_type(); - if let Some(parent) = target.parent() { - std::fs::create_dir_all(parent)?; + #[cfg(unix)] + { + let link = header + .link_name() + .context("symlink target")? + .ok_or_else(|| anyhow!("symlink without target"))?; + // best-effort overwrite + let _ = tokio::fs::remove_file(&target).await; + tokio::fs::symlink(link.as_ref(), &target).await?; } - if etype.is_dir() { - match std::fs::create_dir(&target) { - Ok(()) => {} - Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {} - Err(e) => return Err(e), - } - } else if etype.is_symlink() { - #[cfg(unix)] - { - let link = header.link_name()?.ok_or_else(|| { - std::io::Error::new(std::io::ErrorKind::InvalidData, "symlink without target") + return Ok(()); + } + // ignore fifo, char/block devices — none appear in a PG basebackup. Hard + // links are treated like regular files (basebackup emits none) + if !(etype.is_file() || etype.is_hard_link()) { + return Ok(()); + } + + let path_key = rel.to_string_lossy().into_owned(); + let is_increment = incremented.contains(&path_key); + let target = target.clone(); + let mode = header.mode().ok(); + let bridge = SyncIoBridge::new(entry); + tokio::task::spawn_blocking(move || -> std::io::Result<()> { + use std::io::Write; + let mut bridge = bridge; + if is_increment { + // Increment path: apply onto whatever the earlier chain step left + // in place. The target must already exist (chain root wrote the + // full file). 
open() in r+w (not truncate) + let mut f = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open(&target) + .map_err(|e| { + std::io::Error::new( + e.kind(), + format!("apply increment {path_key}: open target: {e}"), + ) })?; - // best-effort overwrite - let _ = std::fs::remove_file(&target); - std::os::unix::fs::symlink(link.as_ref(), &target)?; - } - } else if etype.is_file() || etype.is_hard_link() { - // ignore hard links to keep this simple; PG basebackup doesn't emit any - let path_key = rel.to_string_lossy().into_owned(); - if incremented.contains(&path_key) { - // Increment path: apply onto whatever the earlier chain step - // left in place. The target must already exist (chain root - // wrote the full file). open() in r+w (not truncate) - let mut f = std::fs::OpenOptions::new() - .read(true) - .write(true) - .open(&target) - .map_err(|e| { - std::io::Error::new( - e.kind(), - format!("apply increment {path_key}: open target: {e}"), - ) - })?; - let (final_size, _, _) = - apply_increment_in_place(&mut entry, &mut f).map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::InvalidData, - format!("apply increment {path_key}: {e}"), - ) - })?; - f.set_len(final_size)?; - f.flush()?; - } else { - let mut f = std::fs::OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(&target)?; - std::io::copy(&mut entry, &mut f)?; - f.flush()?; - } - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - if let Ok(mode) = header.mode() { - let _ = - std::fs::set_permissions(&target, std::fs::Permissions::from_mode(mode)); - } + let (final_size, _, _) = + apply_increment_in_place(&mut bridge, &mut f).map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("apply increment {path_key}: {e}"), + ) + })?; + f.set_len(final_size)?; + f.flush()?; + } else { + let mut f = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(&target)?; + std::io::copy(&mut bridge, &mut f)?; + 
f.flush()?; + } + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + if let Some(mode) = mode { + let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(mode)); } } - // entry types we don't restore: hard links, fifo, char/block devices — - // none appear in a PG basebackup - } + Ok(()) + }) + .await + .context("unpack file join")??; Ok(()) } diff --git a/src/pg/backup/fs_push.rs b/src/pg/backup/fs_push.rs new file mode 100644 index 0000000..816713c --- /dev/null +++ b/src/pg/backup/fs_push.rs @@ -0,0 +1,1341 @@ +//! backup-push from a local data directory (wal-g-style filesystem source) +//! +//! Walks `$PGDATA`, packs files into tar parts across N concurrent workers — +//! each worker streams one part through compression to S3 — and brackets the +//! copy with `pg_backup_start` / `pg_backup_stop` over a non-replication SQL +//! session. Output layout matches the BASE_BACKUP path (`tar_partitions/ +//! part_NNN.tar.`, `pg_control.tar.`, files_metadata.json, sentinel, +//! metadata) so backup-fetch is identical +//! +//! Concurrency is the throughput win over the single-stream BASE_BACKUP path: +//! `WALG_UPLOAD_CONCURRENCY` parts pack + compress + upload simultaneously, so +//! 
several S3 connections and CPU cores run at once instead of one + +use std::collections::HashMap; +use std::os::unix::fs::PermissionsExt; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; +use std::time::UNIX_EPOCH; + +use anyhow::{Context, Result, anyhow, bail}; +use bytes::Bytes; +use chrono::Utc; +use tokio::io::{AsyncWriteExt, BufReader}; +use tokio::sync::{Mutex, mpsc}; +use tokio_tar::{Builder, EntryType, Header}; + +use crate::compression::{self, AsyncBufReader, AsyncReader}; +use crate::config::Settings; +use crate::pg::backup::delta; +use crate::pg::backup::increment::Format as IncrementFormat; +use crate::pg::backup::push::{self, Finalize, PushArgs}; +use crate::pg::backup::tar_streamer::{ + DeltaClass, DeltaContext, IncrementBodyReader, PartWriter, classify_for_delta, +}; +use crate::pg::backup::{ + BACKUP_NAME_PREFIX, FileDescription, TablespaceSpec, format_backup_name, format_pg_lsn, + parse_pg_lsn, tar_part_key, +}; +use crate::pg::replication::PgConfig; +use crate::pg::replication::base_backup::ChannelReader; +use crate::pg::replication::conn::ReplicationConn; +use crate::storage::DynStorage; + +const PG_CONTROL_ENTRY: &str = "global/pg_control"; + +/// Coalesce file-body reads. tokio_tar copies each body through io::copy's 8 KB +/// buffer, and every tokio::fs::File read is a blocking-pool dispatch; reading a +/// multi-GB relation in 8 KB units floods the pool and bounds single-stream +/// throughput. A BufReader turns ~CAP/8KB dispatches into one. 256 KB is the knee +/// (matches CHUNK_BYTES); peak resident is CAP × upload_concurrency (one open +/// file per packer) +const FILE_READ_BUF: usize = 256 * 1024; + +/// Filenames dropped from the copy, matched by basename anywhere in the tree. +/// Mirrors wal-g's `ExcludedFilenames` plus `pg_internal.init` / `recovery.signal` +/// (which pgbackrest also drops). 
/// Reports whether `path` looks like a local PostgreSQL data directory, in
/// which case backup-push reads the filesystem instead of streaming BASE_BACKUP.
///
/// Either marker is sufficient, and each must be a regular file:
/// `PG_VERSION` directly under `path`, or `global/pg_control`. The markers are
/// probed in that order and the first hit short-circuits the check
pub fn is_pgdata_dir(path: &Path) -> bool {
    ["PG_VERSION", "global/pg_control"]
        .iter()
        .any(|marker| path.join(marker).is_file())
}
+/// Runs inside `spawn_blocking`, so `blocking_send` backpressures the walk when +/// the packers fall behind, capping resident entries instead of materializing +/// the whole tree +struct Batcher { + tar_size: u64, + tx: mpsc::Sender>, + cur: Vec, + cur_size: u64, + count: usize, +} + +impl Batcher { + fn new(tar_size: u64, tx: mpsc::Sender>) -> Self { + Self { + tar_size, + tx, + cur: Vec::new(), + cur_size: 0, + count: 0, + } + } + + fn push(&mut self, e: WalkEntry) -> Result<()> { + if !self.cur.is_empty() && self.cur_size.saturating_add(e.size) > self.tar_size { + self.flush()?; + } + self.cur_size = self.cur_size.saturating_add(e.size); + self.count += 1; + self.cur.push(e); + if self.cur_size >= self.tar_size { + self.flush()?; + } + Ok(()) + } + + fn flush(&mut self) -> Result<()> { + if self.cur.is_empty() { + return Ok(()); + } + self.cur_size = 0; + let batch = std::mem::take(&mut self.cur); + self.tx + .blocking_send(batch) + .map_err(|_| anyhow!("pack workers dropped before walk completed")) + } +} + +/// Sink threaded through the recursive walk: batches entries, records +/// tablespaces and the pg_control path +struct WalkSink { + batcher: Batcher, + tablespaces: Vec<(u32, String)>, + pg_control: Option, +} + +#[derive(Default)] +struct WorkerResult { + files: HashMap, + tar_file_sets: HashMap>, + compressed: i64, + uncompressed: i64, + max_file_no: u32, +} + +pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> Result<()> { + let start_time = Utc::now(); + let pgdata = args + .pgdata + .clone() + .ok_or_else(|| anyhow!("filesystem backup-push requires local PGDATA"))?; + + // Resolve a delta parent unless --full (matches BASE_BACKUP path) + let parent = if args.full { + None + } else { + delta::configure_delta_parent(&storage, &settings.delta, args.is_permanent).await? 
+ }; + let increment_format = args.increment_format; + if let Some(p) = parent.as_ref() + && let Some(parent_fmt) = p.parent_increment_format + && parent_fmt != increment_format + { + bail!( + "increment format mismatch: delta parent {} uses {parent_fmt:?} but \ + --increment-format requests {increment_format:?}; a chain must use one \ + format end-to-end (match the parent, or pass --full for a new chain)", + p.name, + ); + } + + let cfg = PgConfig::from_env()?; + tracing::info!( + target = "backup_push", + "filesystem backup-push from {} (connecting to {}:{} as {})", + pgdata.display(), + cfg.host, + cfg.port, + cfg.user, + ); + let mut conn = ReplicationConn::connect_with(&cfg, false).await?; + let pg_version = conn.server_pg_version(); + let system_identifier = query_u64( + &mut conn, + "SELECT system_identifier FROM pg_control_system()", + ) + .await + .context("read system_identifier")?; + let timeline = + query_u64(&mut conn, "SELECT timeline_id FROM pg_control_checkpoint()").await? 
as u32; + let data_directory = pgdata + .canonicalize() + .unwrap_or_else(|_| pgdata.clone()) + .display() + .to_string(); + + if args.delta_from_wal_summaries { + if pg_version < 170000 { + bail!( + "--delta-from-wal-summaries requires PostgreSQL 17 or newer (server reports {pg_version})" + ); + } + let on = show_setting(&mut conn, "summarize_wal").await?; + if on.trim() != "on" { + bail!("--delta-from-wal-summaries requires summarize_wal=on on the server"); + } + } + + // pg_backup_start brackets the copy; the session must stay open until stop + let label = format!("walrus {}", Utc::now().format("%Y%m%dT%H%M%SZ")); + let start_lsn = backup_start(&mut conn, pg_version, &label, args.fast_checkpoint).await?; + tracing::info!( + target = "backup_push", + "pg_backup_start: lsn={} timeline={}", + format_pg_lsn(start_lsn), + timeline, + ); + + let seg_size = crate::pg::wal::segment::wal_segment_size(); + let base_name = format_backup_name(timeline, start_lsn, seg_size); + let backup_name = match parent.as_ref() { + Some(p) => format!( + "{base_name}_D_{}", + p.name.strip_prefix(BACKUP_NAME_PREFIX).unwrap_or(&p.name), + ), + None => base_name.clone(), + }; + + // Build the delta map now that the upper LSN bound is known. Failure drops + // to a full backup (wal-g semantics: a partial delta is worse than a full) + let delta_context = build_delta_context( + settings, + &storage, + parent.as_ref(), + &args, + increment_format, + pgdata.as_path(), + timeline, + start_lsn, + ) + .await; + + let tar_size = if args.tar_size_threshold == 0 { + crate::pg::backup::tar_streamer::DEFAULT_TAR_SIZE_THRESHOLD + } else { + args.tar_size_threshold + }; + + let n_workers = settings.upload_concurrency.max(1); + tracing::info!( + target = "backup_push", + "packing with upload_concurrency={}", + settings.upload_concurrency, + ); + + // Stream the walk into a bounded batch channel instead of materializing every + // WalkEntry resident first. 
The metadata-only walk far outruns packing, so an + // unbounded handoff would hold the whole entry list in memory; channel depth = + // worker count, so blocking_send backpressures the walk and packing overlaps it + let (batch_tx, batch_rx) = mpsc::channel::>(n_workers); + let walk_pgdata = pgdata.clone(); + let walk_task = + tokio::task::spawn_blocking(move || walk_data_dir(&walk_pgdata, tar_size, batch_tx)); + + // Concurrent packing: N workers steal batches off the shared receiver, each + // streaming one part through compression to S3. A JoinSet ensures that if + // one worker fails, dropping the set aborts the rest (and each aborted + // worker aborts its in-flight upload via AbortOnDrop) — nothing keeps + // touching PGDATA / S3 after this returns and the backup session closes. + // Dropping every receiver clone also unblocks the walk's blocking_send, + // ending the producer + let batch_rx = Arc::new(Mutex::new(batch_rx)); + let counter = Arc::new(AtomicU32::new(0)); + let mut set: tokio::task::JoinSet> = tokio::task::JoinSet::new(); + for _ in 0..n_workers { + let batch_rx = batch_rx.clone(); + let counter = counter.clone(); + let settings = settings.clone(); + let storage = storage.clone(); + let backup_name = backup_name.clone(); + let delta_context = delta_context.clone(); + set.spawn(async move { + pack_worker( + batch_rx, + counter, + settings, + storage, + backup_name, + delta_context, + ) + .await + }); + } + + let mut all_files: HashMap = HashMap::new(); + let mut tar_file_sets: HashMap> = HashMap::new(); + let mut compressed_size: i64 = 0; + let mut uncompressed_size: i64 = 0; + let mut max_file_no: u32 = 0; + while let Some(joined) = set.join_next().await { + let r = joined.context("pack worker join")??; + all_files.extend(r.files); + for (k, v) in r.tar_file_sets { + tar_file_sets.entry(k).or_default().extend(v); + } + compressed_size += r.compressed; + uncompressed_size += r.uncompressed; + max_file_no = max_file_no.max(r.max_file_no); + } + + 
// Producer closed the channel once the walk finished, so every worker has + // drained and exited by here; its tablespace list & pg_control path are final + let walk = walk_task.await.context("walk join")??; + let pg_control = walk.pg_control; + let tablespaces = walk.tablespaces; + tracing::info!( + target = "backup_push", + "walked {} entries, {} tablespace(s)", + walk.entry_count, + tablespaces.len(), + ); + + // pg_control tee → pg_control.tar (applied last on restore). BASE_BACKUP + // counts pg_control inline in its archive stream; here it never enters a + // data part, so add the tee tar bytes to keep uncompressed_size consistent + let pg_control_tee = match pg_control { + Some(abs) => Some(build_pg_control_tar(&abs).await?), + None => None, + }; + if let Some(tee) = pg_control_tee.as_ref() { + uncompressed_size += tee.len() as i64; + } + + // pg_backup_stop: end LSN + non-exclusive backup_label / tablespace_map + let (end_lsn, labelfile, spcmapfile) = backup_stop(&mut conn, pg_version).await?; + tracing::info!( + target = "backup_push", + "pg_backup_stop at {}", + format_pg_lsn(end_lsn) + ); + + // backup_label (+ tablespace_map) ship as a final part so restore writes + // them into the data dir; they don't exist on disk in non-exclusive backup + let label_file_no = counter.fetch_add(1, Ordering::SeqCst) + 1; + max_file_no = max_file_no.max(label_file_no); + let part_name = format!("part_{label_file_no:03}.tar"); + let mut label_entries: Vec<(&str, &str)> = vec![("backup_label", labelfile.as_str())]; + if !spcmapfile.trim().is_empty() { + label_entries.push(("tablespace_map", spcmapfile.as_str())); + } + let label_tar = build_small_tar(&label_entries).await?; + let key = tar_part_key( + &backup_name, + label_file_no, + settings.compression.extension(), + ); + uncompressed_size += label_tar.len() as i64; + compressed_size += upload_bytes(settings, &storage, &key, label_tar).await? 
as i64; + let now = Utc::now(); + for (name, _) in &label_entries { + all_files.insert( + (*name).to_string(), + FileDescription { + is_incremented: false, + is_skipped: false, + mtime: now, + updates_count: 0, + }, + ); + tar_file_sets + .entry(part_name.clone()) + .or_default() + .push((*name).to_string()); + } + + let tablespace_spec = if tablespaces.is_empty() { + None + } else { + let mut spec = TablespaceSpec::new(&data_directory); + for (oid, location) in &tablespaces { + spec.add(*oid, location); + } + Some(spec) + }; + + push::finalize_backup(Finalize { + settings, + storage: &storage, + backup_name, + start_lsn, + end_lsn, + pg_version, + system_identifier, + uncompressed_size, + compressed_size, + data_directory, + tablespace_spec, + tablespace_count: tablespaces.len(), + all_files, + tar_file_sets, + pg_control_tee, + parent: parent.as_ref(), + delta_context: delta_context.as_ref(), + args: &args, + start_time, + part_count: max_file_no, + }) + .await +} + +/// One packing worker: repeatedly steals a pre-batched part off the shared +/// receiver and packs it into a single part streamed to S3, until the producer +/// closes the channel +async fn pack_worker( + batch_rx: Arc>>>, + counter: Arc, + settings: Settings, + storage: DynStorage, + backup_name: String, + delta_context: Option, +) -> Result { + let mut res = WorkerResult::default(); + loop { + // recv() only awaits while the producer is mid-walk with nothing + // buffered; a closed channel (walk done) yields None and ends the worker + let batch = { + let mut rx = batch_rx.lock().await; + rx.recv().await + }; + let Some(batch) = batch else { break }; + if batch.is_empty() { + continue; + } + let file_no = counter.fetch_add(1, Ordering::SeqCst) + 1; + res.max_file_no = res.max_file_no.max(file_no); + let part_name = format!("part_{file_no:03}.tar"); + let key = tar_part_key(&backup_name, file_no, settings.compression.extension()); + + // part bytes stream through the channel to a concurrent upload 
task + let (byte_tx, byte_rx) = mpsc::channel::>(4); + let reader = ChannelReader::new(byte_rx); + let upload = tokio::spawn(upload_part(reader, key, settings.clone(), storage.clone())); + + let counter_bytes = Arc::new(AtomicU64::new(0)); + let mut builder = Builder::new(PartWriter::new(byte_tx, counter_bytes.clone())); + // Abort the upload if this worker errors or is cancelled before the part + // is fully written, so it can't keep reading PGDATA / uploading after + // backup-push has returned. Declared after `builder` so on drop it aborts + // before the part channel closes (no finalize of a partial object) + let upload = AbortOnDrop::new(upload); + for e in &batch { + let written = append_entry(&mut builder, e, &delta_context, &mut res).await?; + if written { + res.tar_file_sets + .entry(part_name.clone()) + .or_default() + .push(e.tar_path.clone()); + } + } + builder.finish().await.context("finish part")?; + let mut writer = builder.into_inner().await.context("into_inner part")?; + writer.shutdown().await.context("flush part")?; + // Drop the writer (and its PollSender) to close the channel so the + // upload's ChannelReader sees EOF; shutdown only flushes, it doesn't + // close. Without this the upload never completes and the worker hangs + drop(writer); + + // Count real tar bytes (headers, padding, dir entries), matching the + // BASE_BACKUP path which counts its whole input archive stream rather + // than logical file bodies + res.uncompressed += counter_bytes.load(Ordering::Relaxed) as i64; + res.compressed += upload.disarm().await.context("upload join")?? as i64; + } + Ok(res) +} + +/// Append one walked entry to `builder`, recording per-file metadata. 
Returns +/// whether anything was written to the tar (delta-skipped files write nothing) +async fn append_entry( + builder: &mut Builder, + e: &WalkEntry, + delta_context: &Option, + res: &mut WorkerResult, +) -> Result { + if matches!(e.kind, EntryKind::Dir) { + let mut h = header(e, EntryType::Directory, 0); + builder + .append_data(&mut h, &e.tar_path, tokio::io::empty()) + .await + .with_context(|| format!("append dir {}", e.tar_path))?; + return Ok(true); + } + + match classify_for_delta(delta_context, &e.tar_path, e.size) { + DeltaClass::Skip => { + res.files.insert( + e.tar_path.clone(), + FileDescription { + is_incremented: false, + is_skipped: true, + mtime: mtime_dt(e.mtime), + updates_count: 0, + }, + ); + Ok(false) + } + DeltaClass::Increment { + header_bytes, + blocks, + total_size, + } => { + let Some(mut file) = open_walked(&e.abs).await? else { + return Ok(false); + }; + let mut h = header(e, EntryType::Regular, total_size); + let body = IncrementBodyReader::new(header_bytes, &mut file, blocks, e.size); + builder + .append_data(&mut h, &e.tar_path, body) + .await + .with_context(|| format!("append increment {}", e.tar_path))?; + res.files.insert( + e.tar_path.clone(), + FileDescription { + is_incremented: true, + is_skipped: false, + mtime: mtime_dt(e.mtime), + updates_count: 0, + }, + ); + Ok(true) + } + DeltaClass::Passthrough => { + let Some(file) = open_walked(&e.abs).await? 
else { + return Ok(false); + }; + let body = FixedSizeReader::new(file, e.size); + let mut h = header(e, EntryType::Regular, e.size); + builder + .append_data(&mut h, &e.tar_path, body) + .await + .with_context(|| format!("append {}", e.tar_path))?; + res.files.insert( + e.tar_path.clone(), + FileDescription { + is_incremented: false, + is_skipped: false, + mtime: mtime_dt(e.mtime), + updates_count: 0, + }, + ); + Ok(true) + } + } +} + +/// Open a walked file, tolerating it vanishing between the walk and the pack: +/// DROP TABLE unlinks a relation, pg_internal.init is recreated, etc. Returns +/// None on ENOENT so the caller omits it — matching wal-g, which skips a file +/// removed mid-backup; the unlink is in the WAL and replays on restore +async fn open_walked(abs: &Path) -> Result>> { + match tokio::fs::File::open(abs).await { + Ok(f) => Ok(Some(BufReader::with_capacity(FILE_READ_BUF, f))), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + tracing::warn!( + target = "backup_push", + "{} vanished during backup; skipping", + abs.display(), + ); + Ok(None) + } + Err(e) => Err(e).with_context(|| format!("open {}", abs.display())), + } +} + +fn header(e: &WalkEntry, kind: EntryType, size: u64) -> Header { + let mut h = Header::new_gnu(); + h.set_size(size); + h.set_mode(e.mode); + h.set_mtime(e.mtime.max(0) as u64); + h.set_entry_type(kind); + h +} + +fn mtime_dt(secs: i64) -> chrono::DateTime { + chrono::DateTime::::from_timestamp(secs, 0) + .unwrap_or_else(|| chrono::DateTime::::from_timestamp(0, 0).unwrap()) +} + +/// Owns a spawned task handle and aborts it on drop unless `disarm`ed. 
Ensures +/// a per-part upload can't outlive its worker (on error or cancellation), which +/// would otherwise keep uploading after backup-push returned +struct AbortOnDrop(Option>); + +impl AbortOnDrop { + fn new(handle: tokio::task::JoinHandle) -> Self { + Self(Some(handle)) + } + + /// Take the handle back; the guard no longer aborts (caller awaits it) + fn disarm(mut self) -> tokio::task::JoinHandle { + self.0.take().expect("disarm called once") + } +} + +impl Drop for AbortOnDrop { + fn drop(&mut self) { + if let Some(h) = self.0.take() { + h.abort(); + } + } +} + +async fn upload_part( + reader: ChannelReader, + key: String, + settings: Settings, + storage: DynStorage, +) -> Result { + let reader: AsyncBufReader = Box::pin(reader); + let compressed = + compression::encode_buffered(settings.compression, reader, settings.compression_level); + let encrypted = settings.encrypt(compressed); + let counter = Arc::new(AtomicU64::new(0)); + let counting = push::wrap_counted_reader(encrypted, counter.clone()); + let throttled = settings.throttle_network(counting); + storage + .put(&key, throttled, None) + .await + .with_context(|| format!("put {key}"))?; + Ok(counter.load(Ordering::Relaxed)) +} + +/// Compress+encrypt a small in-memory tar and PUT it; returns compressed bytes +async fn upload_bytes( + settings: &Settings, + storage: &DynStorage, + key: &str, + bytes: Bytes, +) -> Result { + let raw: AsyncReader = Box::pin(std::io::Cursor::new(bytes.to_vec())); + let compressed = compression::encode(settings.compression, raw, settings.compression_level); + let encrypted = settings.encrypt(compressed); + let counter = Arc::new(AtomicU64::new(0)); + let counting = push::wrap_counted_reader(encrypted, counter.clone()); + let throttled = settings.throttle_network(counting); + storage + .put(key, throttled, None) + .await + .with_context(|| format!("put {key}"))?; + Ok(counter.load(Ordering::Relaxed)) +} + +async fn build_pg_control_tar(abs: &Path) -> Result { + let data = 
tokio::fs::read(abs) + .await + .with_context(|| format!("read {}", abs.display()))?; + let mut b = Builder::new(Vec::new()); + let mut h = Header::new_gnu(); + h.set_size(data.len() as u64); + h.set_mode(0o600); + h.set_mtime(0); + h.set_entry_type(EntryType::Regular); + b.append_data(&mut h, PG_CONTROL_ENTRY, &data[..]) + .await + .context("append pg_control tee")?; + b.finish().await.context("finish pg_control tar")?; + let buf = b.into_inner().await.context("into_inner pg_control tar")?; + Ok(Bytes::from(buf)) +} + +async fn build_small_tar(entries: &[(&str, &str)]) -> Result { + let mut b = Builder::new(Vec::new()); + for (name, content) in entries { + let mut h = Header::new_gnu(); + h.set_size(content.len() as u64); + h.set_mode(0o600); + h.set_mtime(0); + h.set_entry_type(EntryType::Regular); + b.append_data(&mut h, name, content.as_bytes()) + .await + .with_context(|| format!("append {name}"))?; + } + b.finish().await.context("finish tar")?; + let buf = b.into_inner().await.context("into_inner tar")?; + Ok(Bytes::from(buf)) +} + +// ─── filesystem walk ──────────────────────────────────────────────────────── + +fn walk_data_dir( + pgdata: &Path, + tar_size: u64, + tx: mpsc::Sender>, +) -> Result { + let mut out = WalkSink { + batcher: Batcher::new(tar_size, tx), + tablespaces: Vec::new(), + pg_control: None, + }; + walk_dir(pgdata, "", &mut out)?; + out.batcher.flush()?; + Ok(WalkMeta { + tablespaces: out.tablespaces, + pg_control: out.pg_control, + entry_count: out.batcher.count, + }) +} + +fn walk_dir(dir: &Path, rel_prefix: &str, out: &mut WalkSink) -> Result<()> { + let read = std::fs::read_dir(dir).with_context(|| format!("read_dir {}", dir.display()))?; + for entry in read { + let entry = entry?; + let name = entry.file_name().to_string_lossy().into_owned(); + let ft = entry.file_type()?; + let rel = if rel_prefix.is_empty() { + name.clone() + } else { + format!("{rel_prefix}/{name}") + }; + let abs = entry.path(); + + if ft.is_symlink() { + // Only 
pg_tblspc/ symlinks matter: record the tablespace and + // walk its target remapped under pg_tblspc// + if rel_prefix == "pg_tblspc" + && let Ok(oid) = name.parse::() + { + let target = std::fs::read_link(&abs) + .with_context(|| format!("readlink {}", abs.display()))?; + out.tablespaces.push((oid, target.display().to_string())); + walk_dir(&target, &rel, out)?; + } + continue; + } + + let excluded = EXCLUDED.contains(&name.as_str()); + + // Resolve file drops before stat: an excluded file (eg pg_internal.init) + // can vanish between readdir and stat, so stat'ing it would fail the + // walk for a file we discard anyway. pg_control rides only in + // pg_control.tar (applied last on restore), never a regular entry + if ft.is_file() { + if excluded { + continue; + } + if rel == PG_CONTROL_ENTRY { + out.pg_control = Some(abs); + continue; + } + } + + let meta = match entry.metadata() { + Ok(m) => m, + // vanished between readdir and stat (eg DROP TABLE); the removal is + // in the WAL and replays on restore, so dropping it stays consistent + Err(e) if e.kind() == std::io::ErrorKind::NotFound => continue, + Err(e) => return Err(e).with_context(|| format!("stat {}", abs.display())), + }; + let mode = meta.permissions().mode(); + let mtime = mtime_secs(&meta); + + if ft.is_dir() { + // Emit the dir entry even when excluded so it exists on restore, + // but don't recurse into excluded dirs + out.batcher.push(WalkEntry { + kind: EntryKind::Dir, + tar_path: rel.clone(), + abs: abs.clone(), + size: 0, + mode, + mtime, + })?; + if !excluded { + walk_dir(&abs, &rel, out)?; + } + } else if ft.is_file() { + out.batcher.push(WalkEntry { + kind: EntryKind::File, + tar_path: rel, + abs, + size: meta.len(), + mode, + mtime, + })?; + } + } + Ok(()) +} + +fn mtime_secs(meta: &std::fs::Metadata) -> i64 { + meta.modified() + .ok() + .and_then(|t| t.duration_since(UNIX_EPOCH).ok()) + .map(|d| d.as_secs() as i64) + .unwrap_or(0) +} + +// ─── pg_backup_start / pg_backup_stop 
─────────────────────────────────────── + +async fn backup_start( + conn: &mut ReplicationConn, + pg_version: i32, + label: &str, + fast: bool, +) -> Result { + // Non-exclusive backup (session-scoped). PG15+ renamed the functions + let sql = if pg_version >= 150000 { + format!("SELECT pg_backup_start('{}', {fast})", sql_lit(label)) + } else { + format!( + "SELECT pg_start_backup('{}', {fast}, false)", + sql_lit(label) + ) + }; + let rows = conn.query_rows(&sql).await.context("pg_backup_start")?; + let lsn = first_col(&rows).ok_or_else(|| anyhow!("pg_backup_start returned no LSN"))?; + parse_pg_lsn(&lsn).context("parse start LSN") +} + +/// Returns (end_lsn, backup_label, tablespace_map) +async fn backup_stop(conn: &mut ReplicationConn, pg_version: i32) -> Result<(u64, String, String)> { + // wait_for_archive=false: walrus ships WAL separately, and waiting can hang + // when no archiver is running + let sql = if pg_version >= 150000 { + "SELECT lsn::text, labelfile, spcmapfile FROM pg_backup_stop(false)" + } else { + "SELECT lsn::text, labelfile, spcmapfile FROM pg_stop_backup(false, false)" + }; + let rows = conn.query_rows(sql).await.context("pg_backup_stop")?; + let row = rows + .first() + .ok_or_else(|| anyhow!("pg_backup_stop returned no row"))?; + let lsn = row + .first() + .and_then(|c| c.clone()) + .ok_or_else(|| anyhow!("pg_backup_stop returned no LSN"))?; + let labelfile = row.get(1).and_then(|c| c.clone()).unwrap_or_default(); + let spcmapfile = row.get(2).and_then(|c| c.clone()).unwrap_or_default(); + Ok(( + parse_pg_lsn(&lsn).context("parse end LSN")?, + labelfile, + spcmapfile, + )) +} + +async fn query_u64(conn: &mut ReplicationConn, sql: &str) -> Result { + let rows = conn.query_rows(sql).await?; + first_col(&rows) + .ok_or_else(|| anyhow!("`{sql}` returned no value"))? 
/// Escapes `s` for use inside a single-quoted SQL string literal by doubling
/// every `'`. All other characters pass through unchanged; the caller supplies
/// the surrounding quotes
fn sql_lit(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch == '\'' {
            escaped.push_str("''");
        } else {
            escaped.push(ch);
        }
    }
    escaped
}
Keeps the tar body length matching the header size +struct FixedSizeReader { + inner: R, + remaining: u64, + inner_eof: bool, +} + +impl FixedSizeReader { + fn new(inner: R, size: u64) -> Self { + Self { + inner, + remaining: size, + inner_eof: false, + } + } +} + +impl tokio::io::AsyncRead for FixedSizeReader { + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> std::task::Poll> { + use std::task::Poll; + let me = self.get_mut(); + if me.remaining == 0 { + return Poll::Ready(Ok(())); + } + let want = (buf.remaining() as u64).min(me.remaining) as usize; + if want == 0 { + return Poll::Ready(Ok(())); + } + if me.inner_eof { + // initialize_unfilled_to zeroes the region; emit padding + buf.initialize_unfilled_to(want); + buf.advance(want); + me.remaining -= want as u64; + return Poll::Ready(Ok(())); + } + let n; + { + let dst = buf.initialize_unfilled_to(want); + let mut tmp = tokio::io::ReadBuf::new(dst); + match std::pin::Pin::new(&mut me.inner).poll_read(cx, &mut tmp) { + Poll::Pending => return Poll::Pending, + Poll::Ready(Err(e)) => return Poll::Ready(Err(e)), + Poll::Ready(Ok(())) => n = tmp.filled().len(), + } + } + if n == 0 { + // file shorter than recorded size: pad the rest with zeros + me.inner_eof = true; + buf.initialize_unfilled_to(want); + buf.advance(want); + me.remaining -= want as u64; + } else { + buf.advance(n); + me.remaining -= n as u64; + } + Poll::Ready(Ok(())) + } +} + +#[cfg(test)] +mod tests { + use std::io::Read as _; + + use super::*; + use crate::compression::Method; + use crate::storage::fs::FsStorage; + use tokio::io::AsyncReadExt; + + fn write_file(root: &Path, rel: &str, content: &[u8]) { + let p = root.join(rel); + std::fs::create_dir_all(p.parent().unwrap()).unwrap(); + std::fs::write(p, content).unwrap(); + } + + /// Run the streaming walk to completion and flatten every batch back into one + /// entry list, for tests that inspect the walk's output rather 
than pack it + async fn walk_collect(root: &Path, tar_size: u64) -> (Vec, WalkMeta) { + let (tx, mut rx) = mpsc::channel::>(1024); + let root = root.to_path_buf(); + let handle = tokio::task::spawn_blocking(move || walk_data_dir(&root, tar_size, tx)); + let mut entries = Vec::new(); + while let Some(batch) = rx.recv().await { + entries.extend(batch); + } + let meta = handle.await.unwrap().unwrap(); + (entries, meta) + } + + /// Walk into a shared receiver for driving `pack_worker`. Buffers every batch + /// (test inputs are tiny), then drops the sender so the worker sees EOF + async fn walk_batches( + root: &Path, + tar_size: u64, + ) -> Arc>>> { + let (tx, rx) = mpsc::channel::>(1024); + let root = root.to_path_buf(); + tokio::task::spawn_blocking(move || walk_data_dir(&root, tar_size, tx)) + .await + .unwrap() + .unwrap(); + Arc::new(Mutex::new(rx)) + } + + #[test] + fn is_pgdata_dir_detects_marker() { + let dir = tempfile::tempdir().unwrap(); + assert!(!is_pgdata_dir(dir.path())); + std::fs::write(dir.path().join("PG_VERSION"), b"16").unwrap(); + assert!(is_pgdata_dir(dir.path())); + } + + #[tokio::test] + async fn walk_excludes_dirs_files_and_tees_pg_control() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + write_file(root, "PG_VERSION", b"16"); + write_file(root, "base/1/1234", b"relation"); + write_file(root, "global/pg_control", b"control"); + write_file(root, "global/pg_internal.init", b"relcache"); + write_file(root, "base/1/pg_internal.init", b"relcache"); + write_file(root, "pg_wal/000000010000000000000001", b"walseg"); + write_file(root, "postmaster.pid", b"123"); + write_file(root, "standby.signal", b""); + write_file(root, "recovery.signal", b""); + + let (entries, meta) = walk_collect(root, u64::MAX).await; + let paths: std::collections::HashSet<&str> = + entries.iter().map(|e| e.tar_path.as_str()).collect(); + + assert!(paths.contains("PG_VERSION")); + assert!(paths.contains("base/1/1234")); + // excluded dir present as an 
(empty) entry, its contents are not + assert!(paths.contains("pg_wal")); + assert!(!paths.iter().any(|p| p.starts_with("pg_wal/"))); + // excluded file dropped entirely + assert!(!paths.contains("postmaster.pid")); + // pg_internal.init churns under relcache invalidation; dropped in every + // directory (global + per-database) so a stat→open can't race a vanish + assert!(!paths.iter().any(|p| p.ends_with("pg_internal.init"))); + // signal files dropped so a restore controls its own recovery state + assert!(!paths.contains("standby.signal")); + assert!(!paths.contains("recovery.signal")); + // pg_control rides only in the tee, never a regular entry + assert!(!paths.contains("global/pg_control")); + assert_eq!(meta.pg_control, Some(root.join("global/pg_control"))); + + let pg_wal = entries.iter().find(|e| e.tar_path == "pg_wal").unwrap(); + assert!(matches!(pg_wal.kind, EntryKind::Dir)); + } + + /// pg_tblspc/ symlinks: record (oid, on-disk target) and remap the + /// target's contents under pg_tblspc// in the tar + #[tokio::test] + async fn walk_remaps_tablespace_symlink() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path().join("pgdata"); + write_file(&root, "PG_VERSION", b"16"); + write_file(&root, "global/pg_control", b"control"); + + // external tablespace location holding a relation file + let ts = dir.path().join("tblspc_a"); + write_file(&ts, "PG_16_202307071/16400/12345", &[9u8; 100]); + std::fs::create_dir_all(root.join("pg_tblspc")).unwrap(); + std::os::unix::fs::symlink(&ts, root.join("pg_tblspc/16384")).unwrap(); + + let (entries, meta) = walk_collect(&root, u64::MAX).await; + let paths: std::collections::HashSet<&str> = + entries.iter().map(|e| e.tar_path.as_str()).collect(); + + // tablespace recorded by oid → on-disk target + assert_eq!(meta.tablespaces, vec![(16384u32, ts.display().to_string())]); + // pg_tblspc dir emitted; target contents remapped beneath the oid + assert!(paths.contains("pg_tblspc")); + 
assert!(paths.contains("pg_tblspc/16384/PG_16_202307071/16400/12345")); + // symlinked relation file points back at its real on-disk location + let rel = entries + .iter() + .find(|e| e.tar_path == "pg_tblspc/16384/PG_16_202307071/16400/12345") + .unwrap(); + assert!(matches!(rel.kind, EntryKind::File)); + assert_eq!(rel.size, 100); + assert_eq!(rel.abs, ts.join("PG_16_202307071/16400/12345")); + } + + fn file_entry(path: &str, size: u64) -> WalkEntry { + WalkEntry { + kind: EntryKind::File, + tar_path: path.into(), + abs: PathBuf::new(), + size, + mode: 0o644, + mtime: 0, + } + } + + #[tokio::test] + async fn batcher_rotation() { + // threshold 100: [40, 40] fits one part; next 40 alone; oversize 500 alone + let (tx, mut rx) = mpsc::channel::>(64); + // blocking_send must run off the runtime; flush on drop is via explicit flush + tokio::task::spawn_blocking(move || { + let mut b = Batcher::new(100, tx); + for e in [ + file_entry("a", 40), + file_entry("b", 40), + file_entry("c", 40), + file_entry("big", 500), + file_entry("d", 10), + ] { + b.push(e).unwrap(); + } + b.flush().unwrap(); + }) + .await + .unwrap(); + + let mut batches: Vec> = Vec::new(); + while let Some(batch) = rx.recv().await { + batches.push(batch.iter().map(|e| e.tar_path.clone()).collect()); + } + let got: Vec> = batches + .iter() + .map(|b| b.iter().map(String::as_str).collect()) + .collect(); + assert_eq!(got, vec![vec!["a", "b"], vec!["c"], vec!["big"], vec!["d"]]); + } + + #[tokio::test] + async fn fixed_size_reader_truncates_and_pads() { + // truncate: 6 bytes available, want 4 + let mut r = FixedSizeReader::new(std::io::Cursor::new(b"abcdef".to_vec()), 4); + let mut out = Vec::new(); + r.read_to_end(&mut out).await.unwrap(); + assert_eq!(out, b"abcd"); + + // pad: 3 bytes available, want 6 → zero-filled tail + let mut r = FixedSizeReader::new(std::io::Cursor::new(b"abc".to_vec()), 6); + let mut out = Vec::new(); + r.read_to_end(&mut out).await.unwrap(); + assert_eq!(out, b"abc\0\0\0"); + } 
+ + /// walk → concurrent pack → read parts back: every file & dir survives + /// byte-clean through the async packer (uncompressed for a simple check) + #[tokio::test] + async fn pack_roundtrip_to_storage() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path().join("pgdata"); + write_file(&root, "PG_VERSION", b"16"); + write_file(&root, "base/1/1234", &vec![7u8; 5000]); + write_file(&root, "base/1/5678", b"small"); + write_file(&root, "global/123", &vec![3u8; 9000]); + write_file(&root, "pg_wal/seg", b"excluded"); + + // expected file bodies (pg_wal/seg is excluded by the walk) + let expect: std::collections::HashMap> = [ + ("PG_VERSION".to_string(), b"16".to_vec()), + ("base/1/1234".to_string(), vec![7u8; 5000]), + ("base/1/5678".to_string(), b"small".to_vec()), + ("global/123".to_string(), vec![3u8; 9000]), + ] + .into_iter() + .collect(); + + let store_dir = tempfile::tempdir().unwrap(); + let storage: DynStorage = Arc::new(FsStorage::new(store_dir.path()).unwrap()); + let settings = Settings { + compression: Method::None, + ..Default::default() + }; + + let batch_rx = walk_batches(&root, 4096).await; + let counter = Arc::new(AtomicU32::new(0)); + let name = "base_test"; + let res = pack_worker( + batch_rx, + counter, + settings, + storage.clone(), + name.to_string(), + None, + ) + .await + .unwrap(); + assert!(res.max_file_no >= 1); + + // read every emitted part & collect file bodies + let mut got: std::collections::HashMap> = std::collections::HashMap::new(); + let mut part_bytes_total: u64 = 0; + for file_no in 1..=res.max_file_no { + let key = tar_part_key(name, file_no, ""); + let mut body = storage.get(&key).await.unwrap(); + let mut bytes = Vec::new(); + body.read_to_end(&mut bytes).await.unwrap(); + part_bytes_total += bytes.len() as u64; + let mut ar = tar::Archive::new(&bytes[..]); + for e in ar.entries().unwrap() { + let mut e = e.unwrap(); + let p = e.path().unwrap().to_string_lossy().into_owned(); + if 
e.header().entry_type().is_dir() { + continue; + } + let mut c = Vec::new(); + e.read_to_end(&mut c).unwrap(); + got.insert(p, c); + } + } + + assert_eq!(got.len(), expect.len(), "file count mismatch: {got:?}"); + for (path, content) in &expect { + assert_eq!(got.get(path), Some(content), "mismatch for {path}"); + } + // excluded file never made it into a part + assert!(!got.contains_key("pg_wal/seg")); + // uncompressed_size counts real tar bytes (headers, padding, dir + // entries), not just logical file bodies: with Method::None the stored + // part bytes equal the tar bytes the PartWriter counted + assert_eq!( + res.uncompressed as u64, part_bytes_total, + "uncompressed must equal actual tar part bytes" + ); + } + + #[tokio::test] + async fn open_walked_tolerates_missing() { + let dir = tempfile::tempdir().unwrap(); + let present = dir.path().join("here"); + std::fs::write(&present, b"x").unwrap(); + assert!(open_walked(&present).await.unwrap().is_some()); + assert!( + open_walked(&dir.path().join("gone")) + .await + .unwrap() + .is_none() + ); + } + + /// A relation unlinked between walk and pack (DROP TABLE) is dropped from the + /// backup without failing the part, matching wal-g + #[tokio::test] + async fn pack_skips_file_removed_after_walk() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path().join("pgdata"); + write_file(&root, "PG_VERSION", b"16"); + write_file(&root, "base/1/1234", b"relation"); + write_file(&root, "base/1/5678", b"dropme"); + + // walk records the file, then simulate DROP TABLE before the pack opens it + let batch_rx = walk_batches(&root, 4096).await; + std::fs::remove_file(root.join("base/1/5678")).unwrap(); + + let store_dir = tempfile::tempdir().unwrap(); + let storage: DynStorage = Arc::new(FsStorage::new(store_dir.path()).unwrap()); + let settings = Settings { + compression: Method::None, + ..Default::default() + }; + let res = pack_worker( + batch_rx, + Arc::new(AtomicU32::new(0)), + settings, + storage, + 
"base_drop".to_string(), + None, + ) + .await + .unwrap(); + + assert!(res.files.contains_key("base/1/1234")); + assert!(!res.files.contains_key("base/1/5678")); + } +} diff --git a/src/pg/backup/increment.rs b/src/pg/backup/increment.rs index 2d6ba59..92de56d 100644 --- a/src/pg/backup/increment.rs +++ b/src/pg/backup/increment.rs @@ -274,7 +274,7 @@ where R: Read, W: io::Write + io::Seek, { - let mut page = vec![0u8; PG_PAGE_SIZE as usize]; + let mut page = [0u8; PG_PAGE_SIZE as usize]; for &block_no in blocks { increment.read_exact(&mut page)?; target.seek(io::SeekFrom::Start(block_no as u64 * PG_PAGE_SIZE))?; @@ -328,7 +328,7 @@ mod tests { #[test] fn wi1_apply_writes_at_block_offsets() { - let mut target = Cursor::new(vec![0u8; PG_PAGE_SIZE as usize * 3]); + let mut target = Cursor::new([0u8; PG_PAGE_SIZE as usize * 3]); let mut inc = Vec::new(); write_increment_header(&mut inc, PG_PAGE_SIZE * 3, &[1]).unwrap(); inc.extend(std::iter::repeat_n(0xAA, PG_PAGE_SIZE as usize)); @@ -340,7 +340,7 @@ mod tests { assert_eq!(fmt, Format::Wi1); target.seek(SeekFrom::Start(0)).unwrap(); - let mut b = vec![0u8; PG_PAGE_SIZE as usize]; + let mut b = [0u8; PG_PAGE_SIZE as usize]; target.read_exact(&mut b).unwrap(); assert!(b.iter().all(|&x| x == 0)); target.read_exact(&mut b).unwrap(); @@ -351,7 +351,7 @@ mod tests { #[test] fn wi1_trailing_data_rejected() { - let mut target = Cursor::new(vec![0u8; PG_PAGE_SIZE as usize * 2]); + let mut target = Cursor::new([0u8; PG_PAGE_SIZE as usize * 2]); let mut inc = Vec::new(); write_increment_header(&mut inc, PG_PAGE_SIZE * 2, &[0]).unwrap(); inc.extend(std::iter::repeat_n(0xCC, PG_PAGE_SIZE as usize)); @@ -395,14 +395,14 @@ mod tests { // block body for block 1 inc.extend(std::iter::repeat_n(0xBB, PG_PAGE_SIZE as usize)); - let mut target = Cursor::new(vec![0xAA; PG_PAGE_SIZE as usize * 5]); + let mut target = Cursor::new([0xAA; PG_PAGE_SIZE as usize * 5]); let (size, n, fmt) = apply_increment_in_place(&mut Cursor::new(inc), &mut 
target).unwrap(); assert_eq!(size, PG_PAGE_SIZE * 5); assert_eq!(n, 1); assert_eq!(fmt, Format::Native); target.seek(SeekFrom::Start(PG_PAGE_SIZE)).unwrap(); - let mut buf = vec![0u8; PG_PAGE_SIZE as usize]; + let mut buf = [0u8; PG_PAGE_SIZE as usize]; target.read_exact(&mut buf).unwrap(); assert!(buf.iter().all(|&b| b == 0xBB)); } @@ -416,7 +416,7 @@ mod tests { inc.extend(std::iter::repeat_n(0x11, PG_PAGE_SIZE as usize)); inc.extend(std::iter::repeat_n(0x22, PG_PAGE_SIZE as usize)); - let mut target = Cursor::new(vec![0u8; PG_PAGE_SIZE as usize * 4]); + let mut target = Cursor::new([0u8; PG_PAGE_SIZE as usize * 4]); let (size, _, _) = apply_increment_in_place(&mut Cursor::new(inc), &mut target).unwrap(); assert_eq!(size, PG_PAGE_SIZE * 3); } @@ -428,7 +428,7 @@ mod tests { write_native_increment_header(&mut inc, 10, &[]).unwrap(); assert_eq!(inc.len(), 12); - let mut target = Cursor::new(vec![0u8; PG_PAGE_SIZE as usize * 10]); + let mut target = Cursor::new([0u8; PG_PAGE_SIZE as usize * 10]); let (size, n, fmt) = apply_increment_in_place(&mut Cursor::new(inc), &mut target).unwrap(); assert_eq!(size, PG_PAGE_SIZE * 10); assert_eq!(n, 0); @@ -442,14 +442,14 @@ mod tests { write_native_increment_header(&mut inc, 1, &blocks).unwrap(); inc.extend(std::iter::repeat_n(0xCC, PG_PAGE_SIZE as usize)); inc.push(0x42); - let mut target = Cursor::new(vec![0u8; PG_PAGE_SIZE as usize]); + let mut target = Cursor::new([0u8; PG_PAGE_SIZE as usize]); let err = apply_increment_in_place(&mut Cursor::new(inc), &mut target).unwrap_err(); assert!(matches!(err, IncrementError::UnexpectedTrailing)); } #[test] fn apply_rejects_unknown_magic() { - let mut target = Cursor::new(vec![0u8; PG_PAGE_SIZE as usize]); + let mut target = Cursor::new([0u8; PG_PAGE_SIZE as usize]); let buf = vec![0xDE, 0xAD, 0xBE, 0xEF, 0x00]; let err = apply_increment_in_place(&mut Cursor::new(buf), &mut target).unwrap_err(); assert!(matches!(err, IncrementError::BadMagic(_))); diff --git a/src/pg/backup/mod.rs 
b/src/pg/backup/mod.rs index a7c1a80..ecf4d48 100644 --- a/src/pg/backup/mod.rs +++ b/src/pg/backup/mod.rs @@ -12,6 +12,7 @@ pub mod copy; pub mod delete; pub mod delta; pub mod fetch; +pub mod fs_push; pub mod increment; pub mod list; pub mod push; diff --git a/src/pg/backup/push.rs b/src/pg/backup/push.rs index ea7238c..81372ae 100644 --- a/src/pg/backup/push.rs +++ b/src/pg/backup/push.rs @@ -8,7 +8,7 @@ //! The data dir's `global/pg_control` is teed into a separate `pg_control.tar` //! so `backup-fetch` can apply it last (matches wal-g's restore ordering) //! -//! `--pgdata` is optional; absent it, the sentinel records the PG-reported +//! Local PGDATA is optional; absent it, the sentinel records the PG-reported //! `data_directory` and we never touch the local filesystem use std::path::PathBuf; @@ -21,7 +21,7 @@ use bytes::{Bytes, BytesMut}; use tokio::io::{AsyncRead, ReadBuf}; use tokio::sync::mpsc; -use crate::compression::{self, AsyncReader}; +use crate::compression::{self, AsyncBufReader, AsyncReader}; use crate::concurrency::BoundedTasks; use crate::config::Settings; use crate::pg::backup::delta::{self, PrevBackupInfo}; @@ -35,7 +35,8 @@ use crate::pg::backup::{ }; use crate::pg::replication::PgConfig; use crate::pg::replication::base_backup::{ - BackupEvent, BaseBackupOpts, ChannelReader, Tablespace, run_base_backup, + BackupEvent, BaseBackupOpts, ChannelReader, Tablespace, max_rate_kib_from_bytes, + run_base_backup, }; use crate::pg::replication::conn::ReplicationConn; use crate::storage::DynStorage; @@ -66,6 +67,15 @@ pub struct PushArgs { } pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> Result<()> { + // A local PGDATA directory selects the filesystem source (wal-g + // semantics): walks the data dir & packs parts concurrently. 
Without a + // readable local pgdata, fall through to the single-stream BASE_BACKUP path + if let Some(pgdata) = args.pgdata.as_deref() + && super::fs_push::is_pgdata_dir(pgdata) + { + return super::fs_push::handle(settings, storage, args).await; + } + let start_time = chrono::Utc::now(); // Resolve a delta parent if WALG_DELTA_MAX_STEPS > 0 (or --delta-from- @@ -149,18 +159,25 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> } if parent.is_some() && args.pgdata.is_none() { bail!( - "--delta-from-wal-summaries requires --pgdata: WAL summaries live on \ + "--delta-from-wal-summaries requires local PGDATA: WAL summaries live on \ the PG host filesystem & cannot be read remotely" ); } } let label = format!("walrus {}", chrono::Utc::now().format("%Y%m%dT%H%M%SZ")); + let max_rate_kib = max_rate_kib_from_bytes(settings.disk_rate_limit); + if let Some(rate) = max_rate_kib { + tracing::info!( + target = "backup_push", + "BASE_BACKUP rate limited to {rate} kB/s (WALG_DISK_RATE_LIMIT)", + ); + } let opts = BaseBackupOpts { label: label.clone(), fast_checkpoint: args.fast_checkpoint, no_verify_checksums: args.no_verify_checksums, - max_rate_kib: None, + max_rate_kib, // wal-g push uploads tablespaces separately and ships WAL via // `wal-push`; inlining the segments would duplicate them wal: false, @@ -352,9 +369,12 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> let cfg = settings.clone(); uploads .spawn(async move { - let reader: AsyncReader = Box::pin(part.reader); - let compressed = - compression::encode(cfg.compression, reader, cfg.compression_level); + let reader: AsyncBufReader = Box::pin(part.reader); + let compressed = compression::encode_buffered( + cfg.compression, + reader, + cfg.compression_level, + ); let encrypted = cfg.encrypt(compressed); let counter = Arc::new(AtomicU64::new(0)); let counting = wrap_counted_reader(encrypted, counter.clone()); @@ -411,6 +431,94 @@ pub async fn 
handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> let start_lsn = start_lsn.ok_or_else(|| anyhow!("no start LSN received"))?; let end_lsn = end_lsn.ok_or_else(|| anyhow!("no end LSN received"))?; + // Build TablespaceSpec from non-default tablespaces. Mirrors wal-g + let user_tablespaces: Vec<&Tablespace> = + tablespace_list.iter().filter(|t| !t.is_default()).collect(); + let tablespace_spec = if user_tablespaces.is_empty() { + None + } else { + let mut spec = TablespaceSpec::new(&data_directory); + for t in &user_tablespaces { + spec.add(t.oid, &t.location); + } + Some(spec) + }; + + finalize_backup(Finalize { + settings, + storage: &storage, + backup_name, + start_lsn, + end_lsn, + pg_version, + system_identifier, + uncompressed_size, + compressed_size, + data_directory, + tablespace_spec, + tablespace_count: tablespace_list.len(), + all_files, + tar_file_sets, + pg_control_tee, + parent: parent.as_ref(), + delta_context: delta_context.as_ref(), + args: &args, + start_time, + part_count: file_no, + }) + .await +} + +/// Inputs to [`finalize_backup`], shared by the BASE_BACKUP & filesystem paths +pub(crate) struct Finalize<'a> { + pub settings: &'a Settings, + pub storage: &'a DynStorage, + pub backup_name: String, + pub start_lsn: u64, + pub end_lsn: u64, + pub pg_version: i32, + pub system_identifier: u64, + pub uncompressed_size: i64, + pub compressed_size: i64, + pub data_directory: String, + pub tablespace_spec: Option, + pub tablespace_count: usize, + pub all_files: std::collections::HashMap, + pub tar_file_sets: std::collections::HashMap>, + pub pg_control_tee: Option, + pub parent: Option<&'a PrevBackupInfo>, + pub delta_context: Option<&'a DeltaContext>, + pub args: &'a PushArgs, + pub start_time: chrono::DateTime, + pub part_count: u32, +} + +/// Upload pg_control tee, files_metadata.json, sentinel & metadata. Prints the +/// backup name on success. 
Common tail for both backup-push source paths +pub(crate) async fn finalize_backup(f: Finalize<'_>) -> Result<()> { + let Finalize { + settings, + storage, + backup_name, + start_lsn, + end_lsn, + pg_version, + system_identifier, + uncompressed_size, + mut compressed_size, + data_directory, + tablespace_spec, + tablespace_count, + all_files, + tar_file_sets, + pg_control_tee, + parent, + delta_context, + args, + start_time, + part_count, + } = f; + // Upload pg_control.tar as a tee so restore can apply it last if let Some(bytes) = pg_control_tee { let ext = settings.compression.extension(); @@ -438,26 +546,13 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> compressed_size += put_counter.load(Ordering::Relaxed) as i64; } - // Build TablespaceSpec from non-default tablespaces. Mirrors wal-g - let user_tablespaces: Vec<&Tablespace> = - tablespace_list.iter().filter(|t| !t.is_default()).collect(); - let tablespace_spec = if user_tablespaces.is_empty() { - None - } else { - let mut spec = TablespaceSpec::new(&data_directory); - for t in &user_tablespaces { - spec.add(t.oid, &t.location); - } - Some(spec) - }; - // Emit files_metadata.json sidecar let files_meta = FilesMetadataDto { files: all_files, tar_file_sets, databases_by_names: Default::default(), }; - upload_json(&storage, &files_metadata_key(&backup_name), &files_meta).await?; + upload_json(storage, &files_metadata_key(&backup_name), &files_meta).await?; let hostname = hostname().unwrap_or_default(); let finish_time = chrono::Utc::now(); @@ -468,7 +563,7 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> // must claim FULL — otherwise restore would walk a chain whose // increments don't exist let (incr_from_lsn, incr_from_name, incr_full_name, incr_count, incr_format) = - match (parent.as_ref(), delta_context.as_ref()) { + match (parent, delta_context) { (Some(p), Some(ctx)) => ( Some(p.start_lsn), Some(p.name.clone()), @@ -523,14 +618,14 @@ 
pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> user_data: args.user_data.clone(), }; - upload_json(&storage, &metadata_key(&backup_name), &meta).await?; - upload_json(&storage, &sentinel_key(&backup_name), &v2).await?; + upload_json(storage, &metadata_key(&backup_name), &meta).await?; + upload_json(storage, &sentinel_key(&backup_name), &v2).await?; tracing::info!( target = "backup_push", "wrote {backup_name} ({} parts, {} tablespace(s), {} bytes uncompressed, {} bytes compressed)", - file_no, - tablespace_list.len(), + part_count, + tablespace_count, uncompressed_size, compressed_size, ); @@ -619,7 +714,7 @@ fn wrap_with_counter(input: AsyncReader) -> (CounterHandle, AsyncReader) { (CounterHandle(counter), Box::pin(r)) } -fn wrap_counted_reader(input: AsyncReader, counter: Arc) -> AsyncReader { +pub(crate) fn wrap_counted_reader(input: AsyncReader, counter: Arc) -> AsyncReader { Box::pin(CountingReader { inner: input, counter, @@ -642,15 +737,16 @@ fn resolve_increment_full_name(p: &PrevBackupInfo) -> String { } } -/// PG17 wal-summaries → delta map. Returns an error if --pgdata is absent +/// PG17 wal-summaries → delta map. 
Returns an error if local PGDATA is absent /// since the summaries live on the server's filesystem -fn build_delta_map_from_summaries( +pub(crate) fn build_delta_map_from_summaries( pgdata: Option<&std::path::Path>, timeline: u32, first_used_lsn: u64, first_not_used_lsn: u64, ) -> Result { - let pgdata = pgdata.ok_or_else(|| anyhow!("--delta-from-wal-summaries requires --pgdata"))?; + let pgdata = + pgdata.ok_or_else(|| anyhow!("--delta-from-wal-summaries requires local PGDATA"))?; let map = crate::pg::wal_summaries::read_for_range( pgdata, timeline, @@ -671,9 +767,9 @@ mod tests { #[test] fn delta_map_from_summaries_requires_pgdata() { - // Summaries live on the PG host filesystem; without --pgdata the map + // Summaries live on the PG host filesystem; without local PGDATA the map // can't be read, so the wrapper must bail before touching disk let err = build_delta_map_from_summaries(None, 1, 0x100, 0x200).unwrap_err(); - assert!(format!("{err:#}").contains("--pgdata"), "{err:#}"); + assert!(format!("{err:#}").contains("PGDATA"), "{err:#}"); } } diff --git a/src/pg/backup/tar_streamer.rs b/src/pg/backup/tar_streamer.rs index 00093d9..9b3f857 100644 --- a/src/pg/backup/tar_streamer.rs +++ b/src/pg/backup/tar_streamer.rs @@ -10,22 +10,25 @@ //! into its own part (wal-g matches this behavior; mirrors a real PG tar //! that occasionally carries multi-GB segment files) //! -//! The streamer runs as `spawn_blocking` because `tar::Archive` / -//! `tar::Builder` are sync. Async input is bridged via `SyncIoBridge`; -//! per-part output flows over an mpsc of `Bytes` that the caller reads as -//! an `AsyncRead` (see `ChannelReader`) +//! The streamer runs as a `tokio::spawn` task over `astral-tokio-tar`'s async +//! `Archive` / `Builder`; per-part output flows over an mpsc of `Bytes` that +//! 
the caller reads as an `AsyncRead` (see `ChannelReader`) use std::collections::{BTreeSet, HashMap, HashSet}; -use std::io::{Read, Write}; +use std::pin::Pin; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::task::{Context as TaskContext, Poll}; use anyhow::{Context, Result, anyhow}; use bytes::Bytes; use chrono::{DateTime, Utc}; -use tokio::io::AsyncRead; +use futures::StreamExt; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, ReadBuf}; use tokio::sync::mpsc; use tokio::task::JoinHandle; -use tokio_util::io::SyncIoBridge; +use tokio_tar::{Archive, Builder, Header}; +use tokio_util::sync::PollSender; use crate::pg::backup::delta::{self as delta_mod, PG_PAGE_SIZE, PagedFileDeltaMap}; use crate::pg::backup::increment::{ @@ -125,32 +128,29 @@ where R: AsyncRead + Send + Unpin + 'static, { let (parts_tx, parts_rx) = mpsc::channel::>(opts.queue_depth.max(1)); - let handle = tokio::task::spawn_blocking(move || -> Result { - let sync_input = SyncIoBridge::new(input); - run_blocking(sync_input, opts, parts_tx) - }); + let handle = tokio::spawn(run_async(input, opts, parts_tx)); (parts_rx, handle) } -fn run_blocking( +async fn run_async( input: R, opts: StreamerOpts, parts_tx: mpsc::Sender>, ) -> Result { - let mut archive = tar::Archive::new(input); + let mut archive = Archive::new(input); let mut entries = archive.entries().context("open tar entries")?; let mut result = StreamerResult::default(); let mut file_no = opts.starting_file_no; - let mut tee_builder: Option>> = if opts.tee_names.is_empty() { + let mut tee_builder: Option>> = if opts.tee_names.is_empty() { None } else { - Some(tar::Builder::new(Vec::new())) + Some(Builder::new(Vec::new())) }; let mut current: Option = None; - for entry in entries.by_ref() { + while let Some(entry) = entries.next().await { let mut entry = entry.context("read tar entry")?; let header = entry.header().clone(); let orig_path = entry @@ -199,11 +199,11 @@ fn run_blocking( && 
ctx.bytes_written() > 0 && ctx.bytes_written().saturating_add(out_body_size) > opts.max_tar_size { - finalize_part(current.take().unwrap())?; + finalize_part(current.take().unwrap()).await?; } if current.is_none() { file_no += 1; - current = Some(start_part(file_no, &parts_tx)?); + current = Some(start_part(file_no, &parts_tx).await?); } let ctx = current.as_mut().unwrap(); @@ -230,6 +230,7 @@ fn run_blocking( let body = IncrementBodyReader::new(header_bytes, &mut entry, blocks, entry_size); ctx.builder .append_data(&mut new_hdr, &mapped, body) + .await .context("append increment to current part")?; (true, false) } @@ -237,18 +238,24 @@ fn run_blocking( if tee_match { // Tee path: buffer in memory (only used for small files like pg_control) let mut buf = Vec::with_capacity(entry_size as usize); - entry.read_to_end(&mut buf).context("read tee entry")?; + entry + .read_to_end(&mut buf) + .await + .context("read tee entry")?; ctx.builder - .append_data(&mut new_hdr, &mapped, std::io::Cursor::new(&buf)) + .append_data(&mut new_hdr, &mapped, &buf[..]) + .await .context("append to current part")?; if let Some(tb) = tee_builder.as_mut() { let mut tee_hdr = header.clone(); - tb.append_data(&mut tee_hdr, &mapped, std::io::Cursor::new(&buf)) + tb.append_data(&mut tee_hdr, &mapped, &buf[..]) + .await .context("append to tee tar")?; } } else { ctx.builder .append_data(&mut new_hdr, &mapped, &mut entry) + .await .context("append to current part")?; } (false, false) @@ -274,10 +281,11 @@ fn run_blocking( } if let Some(ctx) = current.take() { - finalize_part(ctx)?; + finalize_part(ctx).await?; } - if let Some(tb) = tee_builder.take() { - let buf = tb.into_inner().context("finish tee tar")?; + if let Some(mut tb) = tee_builder.take() { + tb.finish().await.context("finish tee tar")?; + let buf = tb.into_inner().await.context("into_inner tee tar")?; if !buf.is_empty() { result.tee_bytes = Some(Bytes::from(buf)); } @@ -288,7 +296,7 @@ fn run_blocking( } /// Outcome of the delta-mode 
lookup for one entry -enum DeltaClass { +pub(crate) enum DeltaClass { /// Not a paged file (or no delta map): pass body through unchanged Passthrough, /// Paged file whose changed-block set intersects the file: emit increment @@ -301,7 +309,11 @@ enum DeltaClass { Skip, } -fn classify_for_delta(ctx: &Option, path: &str, entry_size: u64) -> DeltaClass { +pub(crate) fn classify_for_delta( + ctx: &Option, + path: &str, + entry_size: u64, +) -> DeltaClass { let Some(ctx) = ctx.as_ref() else { return DeltaClass::Passthrough; }; @@ -357,25 +369,41 @@ fn classify_for_delta(ctx: &Option, path: &str, entry_size: u64) - } } -/// `Read` impl that emits a pre-encoded increment header followed by the +/// `AsyncRead` impl that emits a pre-encoded increment header followed by the /// subset of input pages whose block numbers appear in `blocks`. Reads the -/// input strictly forward — for each emitted page, skips intervening pages -/// by `read_exact` into a scratch buffer -struct IncrementBodyReader<'a, R: Read> { +/// input strictly forward — pages before each target are read & discarded +enum IncrementPhase { + Header, + /// load the next target page (skipping intervening pages first) + Load, + /// emit the page currently buffered in `page_buf` + Emit, + Done, +} + +pub(crate) struct IncrementBodyReader<'a, R> { header: Vec, header_pos: usize, input: &'a mut R, blocks: Vec, next_idx: usize, + /// next block index still to be read off the input cur_block: u32, - page_buf: Vec, - page_pos: usize, - page_filled: bool, - _entry_size: u64, + page_buf: [u8; PG_PAGE_SIZE as usize], + /// bytes filled into `page_buf` while loading the current page + fill: usize, + /// emit cursor into `page_buf` + emit_pos: usize, + phase: IncrementPhase, } -impl<'a, R: Read> IncrementBodyReader<'a, R> { - fn new(header: Vec, input: &'a mut R, blocks: Vec, entry_size: u64) -> Self { +impl<'a, R: AsyncRead + Unpin> IncrementBodyReader<'a, R> { + pub(crate) fn new( + header: Vec, + input: &'a mut R, + 
blocks: Vec, + _entry_size: u64, + ) -> Self { Self { header, header_pos: 0, @@ -383,54 +411,86 @@ impl<'a, R: Read> IncrementBodyReader<'a, R> { blocks, next_idx: 0, cur_block: 0, - page_buf: vec![0u8; PG_PAGE_SIZE as usize], - page_pos: 0, - page_filled: false, - _entry_size: entry_size, + page_buf: [0u8; PG_PAGE_SIZE as usize], + fill: 0, + emit_pos: 0, + phase: IncrementPhase::Header, } } } -impl<'a, R: Read> Read for IncrementBodyReader<'a, R> { - fn read(&mut self, out: &mut [u8]) -> std::io::Result { - if out.is_empty() { - return Ok(0); - } - // Phase 1: emit header bytes - if self.header_pos < self.header.len() { - let n = (self.header.len() - self.header_pos).min(out.len()); - out[..n].copy_from_slice(&self.header[self.header_pos..self.header_pos + n]); - self.header_pos += n; - return Ok(n); - } - // Phase 2: emit current page - if self.page_filled { - let blcksz = PG_PAGE_SIZE as usize; - let n = (blcksz - self.page_pos).min(out.len()); - out[..n].copy_from_slice(&self.page_buf[self.page_pos..self.page_pos + n]); - self.page_pos += n; - if self.page_pos == blcksz { - self.page_filled = false; - self.next_idx += 1; +impl<'a, R: AsyncRead + Unpin> AsyncRead for IncrementBodyReader<'a, R> { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut TaskContext<'_>, + out: &mut ReadBuf<'_>, + ) -> Poll> { + let me = self.get_mut(); + let page = PG_PAGE_SIZE as usize; + loop { + match me.phase { + IncrementPhase::Header => { + if me.header_pos < me.header.len() { + let n = (me.header.len() - me.header_pos).min(out.remaining()); + if n == 0 { + return Poll::Ready(Ok(())); + } + out.put_slice(&me.header[me.header_pos..me.header_pos + n]); + me.header_pos += n; + return Poll::Ready(Ok(())); + } + me.phase = IncrementPhase::Load; + } + IncrementPhase::Load => { + if me.next_idx >= me.blocks.len() { + me.phase = IncrementPhase::Done; + continue; + } + let target = me.blocks[me.next_idx]; + // fill page_buf with one full page from the input + while me.fill < page { + let 
mut rb = ReadBuf::new(&mut me.page_buf[me.fill..]); + match Pin::new(&mut *me.input).poll_read(cx, &mut rb) { + Poll::Pending => return Poll::Pending, + Poll::Ready(Err(e)) => return Poll::Ready(Err(e)), + Poll::Ready(Ok(())) => { + let got = rb.filled().len(); + if got == 0 { + return Poll::Ready(Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "increment input ended mid-page", + ))); + } + me.fill += got; + } + } + } + me.fill = 0; + if me.cur_block < target { + // intervening page: discard & advance + me.cur_block += 1; + continue; + } + me.emit_pos = 0; + me.phase = IncrementPhase::Emit; + } + IncrementPhase::Emit => { + let n = (page - me.emit_pos).min(out.remaining()); + if n == 0 { + return Poll::Ready(Ok(())); + } + out.put_slice(&me.page_buf[me.emit_pos..me.emit_pos + n]); + me.emit_pos += n; + if me.emit_pos == page { + me.next_idx += 1; + me.cur_block += 1; + me.phase = IncrementPhase::Load; + } + return Poll::Ready(Ok(())); + } + IncrementPhase::Done => return Poll::Ready(Ok(())), } - return Ok(n); - } - // Phase 3: load the next target page - if self.next_idx >= self.blocks.len() { - return Ok(0); - } - let target = self.blocks[self.next_idx]; - // Skip pages before target by reading & discarding - while self.cur_block < target { - self.input.read_exact(&mut self.page_buf)?; - self.cur_block += 1; } - self.input.read_exact(&mut self.page_buf)?; - self.cur_block += 1; - self.page_filled = true; - self.page_pos = 0; - // Tail-recurse via loop semantics: the next read() call will pump out - Read::read(self, out) } } @@ -441,45 +501,39 @@ fn _bind_increment(_: increment::IncrementHeader) {} struct PartCtx { file_no: u32, - builder: tar::Builder>, - bytes_counter: std::sync::Arc, + builder: Builder, + bytes_counter: Arc, } impl PartCtx { fn bytes_written(&self) -> u64 { - self.bytes_counter - .load(std::sync::atomic::Ordering::Relaxed) + self.bytes_counter.load(Ordering::Relaxed) } } -fn start_part(file_no: u32, parts_tx: &mpsc::Sender>) -> 
Result { +async fn start_part(file_no: u32, parts_tx: &mpsc::Sender>) -> Result { let (byte_tx, byte_rx) = mpsc::channel::>(4); let reader = ChannelReader::new(byte_rx); parts_tx - .blocking_send(Ok(Part { file_no, reader })) + .send(Ok(Part { file_no, reader })) + .await .map_err(|_| anyhow!("parts consumer dropped"))?; - let counter = std::sync::Arc::new(std::sync::atomic::AtomicU64::new(0)); - let writer = CountingWriter { - inner: BlockingSender { - tx: byte_tx, - scratch: Vec::with_capacity(CHUNK_BYTES), - }, - counter: counter.clone(), - }; + let counter = Arc::new(AtomicU64::new(0)); + let writer = PartWriter::new(byte_tx, counter.clone()); Ok(PartCtx { file_no, - builder: tar::Builder::new(writer), + builder: Builder::new(writer), bytes_counter: counter, }) } -fn finalize_part(ctx: PartCtx) -> Result<()> { - // tar::Builder::into_inner writes the two trailing zero blocks then - // returns the inner writer - let writer = ctx.builder.into_inner().context("finish tar part")?; - let CountingWriter { mut inner, .. } = writer; - inner.flush().context("flush part")?; - drop(inner); // drop sender → ChannelReader sees EOF +async fn finalize_part(ctx: PartCtx) -> Result<()> { + // finish writes the two trailing zero blocks; shutdown flushes the tail + // chunk, then dropping the writer closes the channel → ChannelReader EOF + let mut builder = ctx.builder; + builder.finish().await.context("finish tar part")?; + let mut writer = builder.into_inner().await.context("into_inner tar part")?; + writer.shutdown().await.context("flush part")?; Ok(()) } @@ -487,75 +541,85 @@ fn strip_dotslash(s: &str) -> &str { s.strip_prefix("./").unwrap_or(s) } -fn header_mtime(h: &tar::Header) -> DateTime { +fn header_mtime(h: &Header) -> DateTime { let secs = h.mtime().unwrap_or(0) as i64; DateTime::::from_timestamp(secs, 0) .unwrap_or_else(|| DateTime::::from_timestamp(0, 0).unwrap()) } -/// Sync writer that pushes its writes through a tokio mpsc as `Bytes`. 
-/// `blocking_send` parks the writer thread when the channel is full — -/// that's the backpressure -struct BlockingSender { - tx: mpsc::Sender>, +fn broken_pipe() -> std::io::Error { + std::io::Error::new(std::io::ErrorKind::BrokenPipe, "part consumer dropped") +} + +/// Async writer that pushes coalesced chunks through a tokio mpsc as `Bytes`. +/// `PollSender::poll_reserve` parks the task when the channel is full — that's +/// the backpressure. `counter` tracks total tar bytes for rotation budgeting +pub(crate) struct PartWriter { + sink: PollSender>, scratch: Vec, + counter: Arc, } -impl Write for BlockingSender { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - // Coalesce small writes into a single channel send to avoid per-512-byte - // tar block traffic across the channel - self.scratch.extend_from_slice(buf); - if self.scratch.len() >= CHUNK_BYTES { - self.flush_scratch()?; +impl PartWriter { + pub(crate) fn new(tx: mpsc::Sender>, counter: Arc) -> Self { + Self { + sink: PollSender::new(tx), + scratch: Vec::with_capacity(CHUNK_BYTES), + counter, } - Ok(buf.len()) - } - - fn flush(&mut self) -> std::io::Result<()> { - self.flush_scratch() } -} -impl BlockingSender { - fn flush_scratch(&mut self) -> std::io::Result<()> { + /// Send the buffered scratch as one `Bytes` chunk, swapping in a fresh + /// buffer. Avoids the per-CHUNK_BYTES memcpy that `Bytes::copy_from_slice` + /// would do + fn flush_chunk(&mut self, cx: &mut TaskContext<'_>) -> Poll> { if self.scratch.is_empty() { - return Ok(()); + return Poll::Ready(Ok(())); + } + match self.sink.poll_reserve(cx) { + Poll::Pending => return Poll::Pending, + Poll::Ready(Err(_)) => return Poll::Ready(Err(broken_pipe())), + Poll::Ready(Ok(())) => {} } - // Move the scratch Vec into a Bytes owner, swap in a fresh - // buffer for the next chunk. 
Avoids the per-CHUNK_BYTES memcpy - // that `Bytes::copy_from_slice` does let chunk = Bytes::from(std::mem::replace( &mut self.scratch, Vec::with_capacity(CHUNK_BYTES), )); - self.tx.blocking_send(Ok(chunk)).map_err(|_| { - std::io::Error::new(std::io::ErrorKind::BrokenPipe, "part consumer dropped") - }) + self.sink.send_item(Ok(chunk)).map_err(|_| broken_pipe())?; + Poll::Ready(Ok(())) } } -impl Drop for BlockingSender { - fn drop(&mut self) { - // Best-effort flush of any tail bytes before EOF - let _ = self.flush_scratch(); +impl AsyncWrite for PartWriter { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut TaskContext<'_>, + buf: &[u8], + ) -> Poll> { + let me = self.get_mut(); + // Flush the pending chunk before growing scratch past the threshold. + // Re-poll re-enters here without re-buffering `buf` (extend happens once, + // after the flush completes Ready) + if me.scratch.len() >= CHUNK_BYTES { + match me.flush_chunk(cx) { + Poll::Pending => return Poll::Pending, + Poll::Ready(Err(e)) => return Poll::Ready(Err(e)), + Poll::Ready(Ok(())) => {} + } + } + me.scratch.extend_from_slice(buf); + me.counter.fetch_add(buf.len() as u64, Ordering::Relaxed); + Poll::Ready(Ok(buf.len())) } -} -struct CountingWriter { - inner: W, - counter: std::sync::Arc, -} - -impl Write for CountingWriter { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - let n = self.inner.write(buf)?; - self.counter - .fetch_add(n as u64, std::sync::atomic::Ordering::Relaxed); - Ok(n) + fn poll_flush(self: Pin<&mut Self>, cx: &mut TaskContext<'_>) -> Poll> { + self.get_mut().flush_chunk(cx) } - fn flush(&mut self) -> std::io::Result<()> { - self.inner.flush() + + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut TaskContext<'_>) -> Poll> { + // Flush tail bytes; dropping the writer (and its PollSender) closes the + // channel so the ChannelReader sees EOF + self.get_mut().flush_chunk(cx) } } @@ -567,6 +631,10 @@ pub fn tablespace_prefix(oid: u32) -> String { #[cfg(test)] mod tests { + // 
Test fixtures build & inspect archives with the sync `tar` crate; the + // `Read` import drives `read_to_end` on those sync entries + use std::io::Read as _; + use super::*; use tokio::io::AsyncReadExt; diff --git a/src/pg/replication/base_backup.rs b/src/pg/replication/base_backup.rs index 9cc8525..24fa54a 100644 --- a/src/pg/replication/base_backup.rs +++ b/src/pg/replication/base_backup.rs @@ -9,11 +9,11 @@ //! as `AsyncReader` for `Storage::put` use anyhow::{Context, Result, anyhow, bail}; -use bytes::Bytes; +use bytes::{Buf, Bytes}; use postgres_protocol::message::backend::Message; use std::pin::Pin; use std::task::{Context as TaskCtx, Poll}; -use tokio::io::AsyncRead; +use tokio::io::{AsyncBufRead, AsyncRead}; use tokio::sync::mpsc; use crate::pg::backup::parse_pg_lsn; @@ -33,6 +33,22 @@ pub struct BaseBackupOpts { pub wal: bool, } +/// PG `BASE_BACKUP ... MAX_RATE` accepts kB/s within these bounds +/// (src/include/backup/basebackup.h). Out-of-range is a protocol error, not a +/// clamp, so callers must pre-clamp +const MAX_RATE_LOWER_KIB: i64 = 32; +const MAX_RATE_UPPER_KIB: i64 = 1_048_576; + +/// Convert a bytes/sec budget (WALG_DISK_RATE_LIMIT) into a `MAX_RATE` argument +/// in kB/s. 
None when unset (0) +pub fn max_rate_kib_from_bytes(bytes_per_sec: u64) -> Option { + if bytes_per_sec == 0 { + return None; + } + let kib = (bytes_per_sec / 1024) as i64; + Some(kib.clamp(MAX_RATE_LOWER_KIB, MAX_RATE_UPPER_KIB) as i32) +} + #[derive(Debug, Clone)] pub struct ArchiveMeta { pub name: String, @@ -138,6 +154,34 @@ impl AsyncRead for ChannelReader { } } +/// Feed leftover `Bytes` slice directly, skipping a per-read memcpy +impl AsyncBufRead for ChannelReader { + fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut TaskCtx<'_>) -> Poll> { + let this = self.get_mut(); + // Same empty-payload guard as poll_read: an empty CopyData frame must + // not be reported as EOF (empty slice ≡ EOF for AsyncBufRead callers) + while this.leftover.is_empty() { + if this.closed { + return Poll::Ready(Ok(&[])); + } + match this.rx.poll_recv(cx) { + Poll::Pending => return Poll::Pending, + Poll::Ready(None) => { + this.closed = true; + return Poll::Ready(Ok(&[])); + } + Poll::Ready(Some(Err(e))) => return Poll::Ready(Err(e)), + Poll::Ready(Some(Ok(b))) => this.leftover = b, + } + } + Poll::Ready(Ok(&this.leftover)) + } + + fn consume(self: Pin<&mut Self>, amt: usize) { + self.get_mut().leftover.advance(amt); + } +} + /// Drive a BASE_BACKUP session, emitting events on `events`. /// Returns when the session is fully drained or an error occurs pub async fn run_base_backup( @@ -486,6 +530,7 @@ async fn expect_ready_for_query(conn: &mut ReplicationConn) -> Result<()> { #[cfg(test)] mod tests { use super::*; + use tokio::io::AsyncBufReadExt as _BufReadExt; use tokio::io::AsyncReadExt as _ReadExt; /// Regression: PG can send empty CopyData frames (eg sparse-file padding @@ -511,6 +556,33 @@ mod tests { assert_eq!(&out, b"hello world!"); } + /// Same empty-payload guard, but on the AsyncBufRead path the codec uses. + /// Drives partial `consume` so leftover tail survives across fill_buf. 
+ #[tokio::test] + async fn channel_reader_bufread_skips_empty_payloads() { + let (tx, rx) = mpsc::channel::>(16); + let mut reader = ChannelReader::new(rx); + + tokio::spawn(async move { + tx.send(Ok(Bytes::from_static(b"hello "))).await.unwrap(); + tx.send(Ok(Bytes::new())).await.unwrap(); + tx.send(Ok(Bytes::from_static(b"world"))).await.unwrap(); + tx.send(Ok(Bytes::from_static(b"!"))).await.unwrap(); + }); + + let mut out = Vec::new(); + loop { + let chunk = reader.fill_buf().await.unwrap(); + if chunk.is_empty() { + break; + } + // consume one byte at a time to exercise the leftover tail + out.push(chunk[0]); + reader.consume(1); + } + assert_eq!(&out, b"hello world!"); + } + #[test] fn parses_archive_header_data_dir() { // base.tar\0\0 @@ -655,6 +727,23 @@ mod tests { assert_eq!(quote_pg_str("it's"), "'it''s'"); } + #[test] + fn max_rate_kib_conversion() { + // unset → no MAX_RATE + assert_eq!(max_rate_kib_from_bytes(0), None); + // 8 MiB/s → 8192 kB/s (wal-g divides bytes by 1024) + assert_eq!(max_rate_kib_from_bytes(8 * 1024 * 1024), Some(8192)); + // below PG's 32 kB/s floor clamps up rather than degrading to unlimited + assert_eq!(max_rate_kib_from_bytes(1), Some(32)); + assert_eq!(max_rate_kib_from_bytes(31 * 1024), Some(32)); + assert_eq!(max_rate_kib_from_bytes(32 * 1024), Some(32)); + // above PG's 1 GiB/s ceiling clamps down (effectively unlimited anyway) + assert_eq!( + max_rate_kib_from_bytes(2 * 1024 * 1024 * 1024), + Some(1_048_576) + ); + } + use bytes::{BufMut, BytesMut}; use std::time::Duration; use tokio::io::AsyncWriteExt; diff --git a/src/pg/walparser/state.rs b/src/pg/walparser/state.rs index 5b0fe98..fd70795 100644 --- a/src/pg/walparser/state.rs +++ b/src/pg/walparser/state.rs @@ -298,7 +298,7 @@ pub fn extract_locations_from_wal_file( mut r: R, ) -> Result, ExtractError> { let mut out = Vec::new(); - let mut page_buf = vec![0u8; WAL_PAGE_SIZE as usize]; + let mut page_buf = [0u8; WAL_PAGE_SIZE as usize]; loop { match 
read_exact_or_eof(&mut r, &mut page_buf)? { ReadStatus::Eof => return Ok(out), diff --git a/src/storage/s3.rs b/src/storage/s3.rs index f6a44cf..03ebf62 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -8,6 +8,7 @@ //! EC2 metadata service (see [`super::creds`]) use std::io::Cursor; +use std::sync::Arc; use std::time::{Duration, SystemTime}; use async_trait::async_trait; @@ -20,6 +21,8 @@ use quick_xml::Reader; use quick_xml::events::Event; use reqwest::Client; use tokio::io::AsyncReadExt; +use tokio::sync::Semaphore; +use tokio::task::JoinSet; use tokio_util::io::StreamReader; use url::Url; @@ -30,6 +33,13 @@ use crate::retry::{RetryPolicy, with_retry}; const MULTIPART_THRESHOLD: u64 = 32 * 1024 * 1024; const PART_SIZE: usize = 8 * 1024 * 1024; +/// Parts kept in flight across every concurrent multipart upload, capped by a +/// per-backend permit pool. Shared so a single stream and an N-way fan-out +/// converge on the same aggregate in-flight budget: deep enough to hide per-part +/// network RTT behind compression, bounded so resident part buffers stay at +/// MAX_INFLIGHT_PARTS × PART_SIZE regardless of stream count +const MAX_INFLIGHT_PARTS: usize = 8; + /// Path component encoding per SigV4 spec /// Same set as URL path-segment, but '/' kept literal const PATH_ENCODE: &AsciiSet = &NON_ALPHANUMERIC @@ -60,6 +70,8 @@ pub struct S3Storage { client: Client, base: String, retry_policy: RetryPolicy, + /// In-flight part budget shared across all concurrent multipart uploads + part_permits: Arc, } impl S3Storage { @@ -79,6 +91,7 @@ impl S3Storage { client, base, retry_policy, + part_permits: Arc::new(Semaphore::new(MAX_INFLIGHT_PARTS)), }) } @@ -86,6 +99,19 @@ impl S3Storage { super::join_prefix_key(&self.cfg.prefix, key) } + /// Cheap clone of the request context (shared `Client`, copied config, same + /// permit pool) so a per-part PUT can run as its own `'static` task instead + /// of borrowing `&self` + fn worker(&self) -> S3Storage { + S3Storage { + cfg: 
self.cfg.clone(), + client: self.client.clone(), + base: self.base.clone(), + retry_policy: self.retry_policy, + part_permits: self.part_permits.clone(), + } + } + /// Server-side copy identity: same endpoint/region + same credential. /// Conservative: AWS allows cross-region CopyObject, but mismatched /// region ids fall back to stream-through rather than risk custom @@ -188,6 +214,42 @@ impl S3Storage { .await } + /// PUT one already-buffered part, retrying transients in place (the buffer + /// is owned so the body replays without re-reading source). Returns the + /// part's ETag for the completion manifest + async fn put_one_part( + &self, + key_full: &str, + part_no: u32, + upload_id: &str, + chunk: Bytes, + ) -> Result { + let part_no_str = part_no.to_string(); + with_retry(&self.retry_policy, StorageError::is_transient, || async { + let resp = self + .signed_request( + "PUT", + key_full, + &[ + ("partNumber", part_no_str.as_str()), + ("uploadId", upload_id), + ], + chunk.clone(), + &[], + ) + .await?; + let resp = check_status(resp).await?; + let etag = resp + .headers() + .get("etag") + .and_then(|v| v.to_str().ok()) + .ok_or_else(|| StorageError::InvalidResponse("missing ETag".into()))? + .to_string(); + Ok::(etag) + }) + .await + } + async fn put_multipart(&self, key: &str, mut body: AsyncReader) -> Result<()> { // initiate let init_resp = self @@ -205,14 +267,29 @@ impl S3Storage { StorageError::InvalidResponse("missing UploadId in CreateMultipartUpload".into()) })?; - let mut parts: Vec<(u32, String)> = Vec::new(); + // Pipeline parts: fill one PART_SIZE buffer (reads stay sequential, so + // byte boundaries match the serial path), spawn its PUT under a shared + // in-flight permit, then read the next part while prior PUTs run. The + // permit pool overlaps compression with several concurrent PUTs and + // bounds aggregate in-flight parts across streams. 
PUTs finish out of + // order, so collected ETags are sorted by partNumber before completion + let ctx = Arc::new(self.worker()); + let key_full = Arc::new(self.full_key(key)); + let upload_id = Arc::new(upload_id); + let mut join: JoinSet> = JoinSet::new(); let mut part_no: u32 = 0; + let mut read_result: Result<()> = Ok(()); - loop { + 'read: loop { let mut buf = BytesMut::with_capacity(PART_SIZE); while buf.len() < PART_SIZE { - if body.read_buf(&mut buf).await? == 0 { - break; + match body.read_buf(&mut buf).await { + Ok(0) => break, + Ok(_) => {} + Err(e) => { + read_result = Err(e.into()); + break 'read; + } } } let filled = buf.len(); @@ -220,76 +297,63 @@ impl S3Storage { break; } part_no += 1; - let part_no_str = part_no.to_string(); let chunk = buf.freeze(); - // Per-part retry: chunk is already buffered, so transient failures - // (5xx, transport) replay the same body without re-reading source - let key_full = self.full_key(key); - let result = with_retry(&self.retry_policy, StorageError::is_transient, || async { - let resp = self - .signed_request( - "PUT", - &key_full, - &[ - ("partNumber", part_no_str.as_str()), - ("uploadId", upload_id.as_str()), - ], - chunk.clone(), - &[], - ) - .await?; - let resp = check_status(resp).await?; - let etag = resp - .headers() - .get("etag") - .and_then(|v| v.to_str().ok()) - .ok_or_else(|| StorageError::InvalidResponse("missing ETag".into()))? 
- .to_string(); - Ok::(etag) - }) - .await; - - let etag = match result { - Ok(e) => e, - Err(e) => { - let _ = self.abort_multipart(key, &upload_id).await; - return Err(e); + // Acquire before spawning so reading backpressures once the budget + // is exhausted, capping resident part buffers + let permit = match self.part_permits.clone().acquire_owned().await { + Ok(p) => p, + Err(_) => { + read_result = Err(StorageError::Config("part permit pool closed".into())); + break; } }; - parts.push((part_no, etag)); + let ctx = ctx.clone(); + let key_full = key_full.clone(); + let upload_id = upload_id.clone(); + join.spawn(async move { + let _permit = permit; + let etag = ctx + .put_one_part(&key_full, part_no, &upload_id, chunk) + .await?; + Ok((part_no, etag)) + }); if filled < PART_SIZE { break; } } - if parts.is_empty() { - // empty body, send a single empty part - part_no += 1; - let resp = self - .signed_request( - "PUT", - &self.full_key(key), - &[ - ("partNumber", part_no.to_string().as_str()), - ("uploadId", upload_id.as_str()), - ], - Bytes::new(), - &[], - ) - .await?; - let resp = check_status(resp).await?; - let etag = resp - .headers() - .get("etag") - .and_then(|v| v.to_str().ok()) - .unwrap_or("\"d41d8cd98f00b204e9800998ecf8427e\"") - .to_string(); - parts.push((part_no, etag)); + // Drain finished PUTs; on the first failure (read error or part PUT) + // stop collecting, cancel the siblings still in flight, then abort the + // whole upload so no parts outlive the aborted multipart + let mut parts: Vec<(u32, String)> = Vec::with_capacity(part_no as usize); + let mut first_err = read_result.err(); + if first_err.is_none() { + while let Some(joined) = join.join_next().await { + match joined { + Ok(Ok(pe)) => parts.push(pe), + Ok(Err(e)) => { + first_err = Some(e); + break; + } + Err(je) => { + first_err = Some(StorageError::Transport(format!( + "multipart part task: {je}" + ))); + break; + } + } + } + } + if let Some(e) = first_err { + join.shutdown().await; 
+ let _ = self.abort_multipart(key, &upload_id).await; + return Err(e); } - // complete + // CompleteMultipartUpload lists every part sorted by partNumber + parts.sort_by_key(|(n, _)| *n); let mut xml = String::from(""); for (n, etag) in &parts { xml.push_str(&format!( @@ -390,43 +454,50 @@ impl Storage for S3Storage { let client = self.client.clone(); let base = self.base.clone(); let retry_policy = self.retry_policy; + let part_permits = self.part_permits.clone(); let s = stream::unfold( (Some(String::new()), full_prefix, cfg, client, base), - move |(token, prefix, cfg, client, base)| async move { - let token = token?; - let s = S3Storage { - cfg: cfg.clone(), - client: client.clone(), - base: base.clone(), - retry_policy, - }; - let q: [(&str, &str); _] = [ - ("list-type", "2"), - ("prefix", prefix.as_str()), - ("continuation-token", token.as_str()), - ]; - let q = if token.is_empty() { &q[..2] } else { &q[..] }; - let resp = match s.signed_request("GET", "", q, Bytes::new(), &[]).await { - Ok(r) => r, - Err(e) => return Some((Err(e), (None, prefix, cfg, client, base))), - }; - let resp = match check_status(resp).await { - Ok(r) => r, - Err(e) => return Some((Err(e), (None, prefix, cfg, client, base))), - }; - let body = match resp.text().await { - Ok(b) => b, - Err(e) => { - return Some((Err(e.into()), (None, prefix, cfg, client, base))); - } - }; - match parse_list_v2(&body, &cfg.prefix) { - Ok((objects, next)) => { - let next_state = (next, prefix, cfg, client, base); - Some((Ok(objects), next_state)) + move |(token, prefix, cfg, client, base)| { + // list only issues GETs; the reconstructed handle shares the + // real permit pool but never reaches the multipart path + let part_permits = part_permits.clone(); + async move { + let token = token?; + let s = S3Storage { + cfg: cfg.clone(), + client: client.clone(), + base: base.clone(), + retry_policy, + part_permits, + }; + let q: [(&str, &str); _] = [ + ("list-type", "2"), + ("prefix", prefix.as_str()), + 
("continuation-token", token.as_str()), + ]; + let q = if token.is_empty() { &q[..2] } else { &q[..] }; + let resp = match s.signed_request("GET", "", q, Bytes::new(), &[]).await { + Ok(r) => r, + Err(e) => return Some((Err(e), (None, prefix, cfg, client, base))), + }; + let resp = match check_status(resp).await { + Ok(r) => r, + Err(e) => return Some((Err(e), (None, prefix, cfg, client, base))), + }; + let body = match resp.text().await { + Ok(b) => b, + Err(e) => { + return Some((Err(e.into()), (None, prefix, cfg, client, base))); + } + }; + match parse_list_v2(&body, &cfg.prefix) { + Ok((objects, next)) => { + let next_state = (next, prefix, cfg, client, base); + Some((Ok(objects), next_state)) + } + Err(e) => Some((Err(e), (None, prefix, cfg, client, base))), } - Err(e) => Some((Err(e), (None, prefix, cfg, client, base))), } }, ) @@ -1036,12 +1107,11 @@ mod tests { if key.contains("boom") && part >= 2 { return Resp::new(503).body(b"".to_vec()); } - let id = req.query("uploadId").unwrap().to_string(); - u.lock() - .unwrap() - .entry(id) - .or_default() - .insert(part, req.body.clone()); + // late parts of an aborted upload must not resurrect it + let id = req.query("uploadId").unwrap(); + if let Some(parts) = u.lock().unwrap().get_mut(id) { + parts.insert(part, req.body.clone()); + } Resp::new(200).header("etag", &format!("\"etag-{part}\"")) } "PUT" if req.headers.contains_key("x-amz-copy-source") => { @@ -1202,4 +1272,105 @@ mod tests { assert!(matches!(err, Err(StorageError::Http { status: 503, .. }))); assert!(uploads.lock().unwrap().is_empty(), "abort must clean up"); } + + /// Pipelined multipart keeps several part PUTs in flight, so they finish + /// out of order; CompleteMultipartUpload must still list every part + /// ascending by partNumber with its matching ETag (S3 rejects unsorted + /// manifests). 
Force part 1 to land last via a one-shot transient — its + /// retry backoff outlasts the other parts' PUTs — then assert the captured + /// completion XML is sorted and the bytes survive the reorder. + #[tokio::test] + async fn multipart_completion_orders_parts_by_number() { + use crate::storage::test_http::{Req, Resp, payload, read_all, reader, serve}; + use std::collections::BTreeMap; + use std::sync::atomic::{AtomicBool, Ordering}; + use std::sync::{Arc, Mutex}; + + let parts: Arc>>> = Arc::new(Mutex::new(BTreeMap::new())); + let complete_xml: Arc>> = Arc::new(Mutex::new(None)); + let part1_failed = Arc::new(AtomicBool::new(false)); + + let (p, cx, pf) = (parts.clone(), complete_xml.clone(), part1_failed.clone()); + let base = serve(move |req: &Req| match req.method.as_str() { + "POST" if req.has_query("uploads") => Resp::new(200).body( + b"u1" + .to_vec(), + ), + "PUT" if req.has_query("partNumber") => { + let part: u32 = req.query("partNumber").unwrap().parse().unwrap(); + // fail part 1's first attempt once; its retry backoff lands it + // last in completion order, forcing the sort to do real work + if part == 1 && !pf.swap(true, Ordering::SeqCst) { + return Resp::new(503).body(b"".to_vec()); + } + p.lock().unwrap().insert(part, req.body.clone()); + Resp::new(200).header("etag", &format!("\"e{part}\"")) + } + "POST" if req.has_query("uploadId") => { + *cx.lock().unwrap() = Some(String::from_utf8_lossy(&req.body).into_owned()); + Resp::new(200).body(b"".to_vec()) + } + "GET" => { + let buf: Vec = p.lock().unwrap().values().flatten().copied().collect(); + Resp::new(200).body(buf) + } + _ => Resp::new(400), + }) + .await; + + let cfg = S3Config { + bucket: "bkt".into(), + prefix: "p".into(), + region: "us-east-1".into(), + creds: CredentialSource::Static(Credentials { + access_key: "AKID".into(), + secret_key: "sek".into(), + session_token: None, + expires_at: None, + }), + endpoint: Some(base), + force_path_style: true, + }; + // base_delay long enough that 
part 1's retry completes well after the + // sub-ms loopback PUTs of parts 2..5 + let policy = RetryPolicy { + max_attempts: 3, + base_delay: Duration::from_millis(80), + max_delay: Duration::from_millis(80), + jitter: false, + }; + let s = S3Storage::with_retry_policy(cfg, policy).unwrap(); + + let big = payload(33 * 1024 * 1024); // 5 parts: 8,8,8,8,1 MiB + s.put("big.zst", reader(&big), Some(big.len() as u64)) + .await + .unwrap(); + + let xml = complete_xml + .lock() + .unwrap() + .clone() + .expect("completion sent"); + let nums: Vec = xml + .split("") + .skip(1) + .filter_map(|s| s.split("").next()) + .filter_map(|n| n.parse().ok()) + .collect(); + assert_eq!( + nums, + vec![1, 2, 3, 4, 5], + "parts must be sorted by partNumber: {xml}" + ); + for n in 1..=5u32 { + assert!( + xml.contains(&format!( + "{n}\"e{n}\"" + )), + "part {n} etag mapping wrong in {xml}" + ); + } + // bytes survive the out-of-order pipeline + assert_eq!(read_all(s.get("big.zst").await.unwrap()).await, big); + } } diff --git a/tests/backup_roundtrip.rs b/tests/backup_roundtrip.rs index dcd3ddc..89d1686 100644 --- a/tests/backup_roundtrip.rs +++ b/tests/backup_roundtrip.rs @@ -349,6 +349,86 @@ async fn fetch_recreates_tablespace_symlinks() { assert_eq!(std::fs::read(target.join("PG_VERSION")).unwrap(), b"16"); } +/// A part carrying its own `pg_tblspc/` symlink entry must not override +/// the sentinel-restored link: the sentinel target (which honors +/// --tablespace-mapping) is authoritative, and recreating the link mid-restore +/// would race the concurrent data fan-out. 
Regression for the archived link +/// target clobbering a mapped relocation +#[cfg(unix)] +#[tokio::test] +async fn fetch_ignores_archived_tablespace_symlink_entry() { + let dir = tempfile::tempdir().unwrap(); + let storage_dir = dir.path().join("storage"); + let restore = dir.path().join("restore"); + let sentinel_target = dir.path().join("ts_target"); + let archived_target = dir.path().join("archived_ts"); + let store = Arc::new(FsStorage::new(&storage_dir).unwrap()); + + let backup_name = format_backup_name(1, 0x0300_0000, 16 * 1024 * 1024); + + let mut spec = TablespaceSpec::new(restore.to_string_lossy()); + spec.add(16384, sentinel_target.to_string_lossy()); + let mut sentinel = make_sentinel_v2(restore.to_str().unwrap()); + sentinel.sentinel.tablespace_spec = Some(spec); + put_bytes( + store.clone(), + &sentinel_key(&backup_name), + serde_json::to_vec(&sentinel).unwrap(), + ) + .await; + + // Part has BOTH an archived symlink entry pointing at the backup-time + // location AND the file beneath it. 
The symlink entry must be ignored + let tar_bytes = { + let mut buf: Vec = Vec::new(); + { + let mut b = tar::Builder::new(&mut buf); + let mut link = tar::Header::new_gnu(); + link.set_entry_type(tar::EntryType::Symlink); + link.set_size(0); + link.set_mode(0o777); + link.set_link_name(&archived_target).unwrap(); + link.set_path("pg_tblspc/16384").unwrap(); + link.set_cksum(); + b.append(&link, std::io::empty()).unwrap(); + + let mut file = tar::Header::new_gnu(); + file.set_size(2); + file.set_mode(0o644); + file.set_path("pg_tblspc/16384/PG_VERSION").unwrap(); + file.set_cksum(); + b.append(&file, &b"16"[..]).unwrap(); + b.finish().unwrap(); + } + buf + }; + put_bytes(store.clone(), &tar_part_key(&backup_name, 1, ""), tar_bytes).await; + + fetch_mod::handle( + &test_settings(), + store as Arc, + &backup_name, + &restore, + ) + .await + .unwrap(); + + let link = restore.join("pg_tblspc/16384"); + let md = std::fs::symlink_metadata(&link).unwrap(); + assert!(md.file_type().is_symlink(), "expected symlink at {link:?}"); + // Link must still point at the sentinel target, not the archived one + assert_eq!(std::fs::read_link(&link).unwrap(), sentinel_target); + assert!( + !archived_target.exists(), + "archived target must never be materialized" + ); + // File lands through the sentinel link + assert_eq!( + std::fs::read(sentinel_target.join("PG_VERSION")).unwrap(), + b"16" + ); +} + #[tokio::test] async fn show_round_trip_and_mark_flips_permanent() { use walrus::pg::backup::show as show_mod; diff --git a/tests/vm_live.rs b/tests/vm_live.rs index de6726d..be62811 100644 --- a/tests/vm_live.rs +++ b/tests/vm_live.rs @@ -876,7 +876,7 @@ async fn wal_summaries_parse_real_pg_files() { } /// End-to-end `--delta-from-wal-summaries`: the `summarize_wal=off` and -/// missing-`--pgdata` preconditions must abort, and the success path must +/// missing local PGDATA preconditions must abort, and success path must /// reconstruct byte-for-byte against a non-delta backup of the same 
state. #[tokio::test] async fn delta_from_summaries_against_live_pg() { @@ -952,7 +952,7 @@ async fn delta_from_summaries_against_live_pg() { let mut s_delta = s.clone(); s_delta.delta.max_steps = 1; - // ── precondition bail: summaries live on the host fs, so --pgdata is + // ── precondition bail: summaries live on host fs, so local PGDATA is // required once a delta parent is in play ── let pgdata_err = backup::push::handle( &s_delta, @@ -964,9 +964,9 @@ async fn delta_from_summaries_against_live_pg() { }, ) .await - .expect_err("--delta-from-wal-summaries without --pgdata must abort"); + .expect_err("--delta-from-wal-summaries without local PGDATA must abort"); assert!( - format!("{pgdata_err:#}").contains("--pgdata"), + format!("{pgdata_err:#}").contains("PGDATA"), "{pgdata_err:#}" );