diff --git a/Cargo.lock b/Cargo.lock
index 45c4f13..0981273 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -97,6 +97,22 @@ version = "1.0.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
 
+[[package]]
+name = "astral-tokio-tar"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb50a7aae84a03bf55b067832bc376f4961b790c97e64d3eacee97d389b90277"
+dependencies = [
+ "filetime",
+ "futures-core",
+ "libc",
+ "portable-atomic",
+ "rustc-hash",
+ "tokio",
+ "tokio-stream",
+ "xattr",
+]
+
 [[package]]
 name = "async-compression"
 version = "0.4.42"
@@ -1297,6 +1313,12 @@ version = "0.3.33"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
 
+[[package]]
+name = "portable-atomic"
+version = "1.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
+
 [[package]]
 name = "postgres-protocol"
 version = "0.6.12"
@@ -2048,6 +2070,17 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "tokio-stream"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70"
+dependencies = [
+ "futures-core",
+ "pin-project-lite",
+ "tokio",
+]
+
 [[package]]
 name = "tokio-util"
 version = "0.7.18"
@@ -2253,6 +2286,7 @@ name = "wal-rus"
 version = "0.1.1"
 dependencies = [
  "anyhow",
+ "astral-tokio-tar",
  "async-compression",
  "async-trait",
  "aws-lc-rs",
@@ -2265,6 +2299,7 @@ dependencies = [
  "fallible-iterator",
  "futures",
  "hex",
+ "libc",
  "percent-encoding",
  "postgres-protocol",
  "quick-xml",
diff --git a/Cargo.toml b/Cargo.toml
index 249e67c..16d0753 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,7 +41,7 @@ roaring = "0.11"
 aws-lc-rs = "1"
 quick-xml = "0.40"
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
-tar = "0.4"
+astral-tokio-tar = "0.6"
 postgres-protocol = "0.6"
 fallible-iterator = "0.2"
 tokio-rustls = { version = "0.26", default-features = false, features = ["aws-lc-rs", "tls12"] }
@@ -50,9 +50,11 @@ rustls-pki-types = "1"
 rustls-pemfile = "2"
 webpki-roots = "1"
 dryoc = { version = "0.8", default-features = false, features = ["u64_backend"] }
+libc = "0.2"
 
 [dev-dependencies]
 tempfile = "3"
+tar = "0.4"
 
 [features]
 # Enabled only on the VM test runner: hits a real PG cluster at PGPORT
diff --git a/bench/README.md b/bench/README.md
index 403d2c7..5ee4d1c 100644
--- a/bench/README.md
+++ b/bench/README.md
@@ -3,8 +3,8 @@
 Reproducible single-host benchmark comparing three PostgreSQL 18 WAL archivers on
 **throughput** and **memory** under heavy write load:
 
-- **walrus** (this repo, Rust) — serial wal-push daemon
-- **wal-g** (Go) — fan-out daemon (`WALG_UPLOAD_CONCURRENCY`)
+- **walrus** (this repo, Rust) — look-ahead fan-out daemon (`WALG_UPLOAD_CONCURRENCY`; pre-uploads `concurrency-1` segments, streaming per-upload, no full-segment buffer)
+- **wal-g** (Go) — fan-out daemon (same `WALG_UPLOAD_CONCURRENCY`)
 - **pgbackrest** (C) — daemonless; PG forks `archive-push`, async `process-max` workers
 
 All three are driven identically: PG `archive_command` → the tool's own client → S3.
@@ -118,7 +118,7 @@ daemon (~27 MB for walrus; wal-g's fan-out daemon adds more baseline).
 
 | OP | walrus / wal-g | pgbackrest | measures |
 |---|---|---|---|
-| `backup-send` | `backup-push --full` | `backup --type=full` | full base backup → S3 |
+| `backup-send` | `backup-push <PGDATA> --full` | `backup --type=full` | full base backup → S3 |
 | `backup-fetch` | `backup-fetch <dst> LATEST` | `restore` | restore ← S3 |
 | `backup-delta` | `backup-push` (delta, `wi1`) | `backup --type=incr` | delta backup → S3 |
 | `backup-delta-summaries` | `backup-push --delta-from-wal-summaries` | — (walrus-only) | delta from PG17 WAL summaries → S3 |
@@ -174,7 +174,9 @@ Notes:
 ## Config knobs
 
 See `config.env.example`. Common ones: `UPLOAD_CONCURRENCY` (wal-g concurrency /
-pgbackrest `process-max`), `SCALE` (pgbench DB size), `CHURN_ROWS`, `BURST_SECONDS`,
+pgbackrest `process-max`; also seeds `WALG_DOWNLOAD_CONCURRENCY` so `backup-fetch`
+scales with the same knob — set `DOWNLOAD_CONCURRENCY` to decouple), `SCALE`
+(pgbench DB size), `CHURN_ROWS`, `BURST_SECONDS`,
 `BURST_WORKERS`. `matrix.sh` honors `DAEMONS` (and `RUN_ID`). Operation benchmarks add
 `RESTORE_DIR`, `WAL_RECV_DIR`, `WAL_RECEIVE_SECONDS`, `DELTA_CHURN_SECONDS`,
 `DELTA_MAX_STEPS`, `DELTA_ORIGIN`; `op_matrix.sh` honors `OPS`, `TOOLS` (and
diff --git a/bench/op_matrix.sh b/bench/op_matrix.sh
index 77abfaa..b3190c4 100755
--- a/bench/op_matrix.sh
+++ b/bench/op_matrix.sh
@@ -13,6 +13,10 @@
 # Skipped cells: pgbackrest has no wal-receive equivalent; backup-delta-summaries
 # is walrus-only (no wal-g / pgbackrest WAL-summary delta). Override OPS / TOOLS
 # via env. Counterpart of matrix.sh (archive path).
+#
+# backup-delta-chain (DELTA_MAX_STEPS-deep chain + leaf restore) is omitted from
+# the default sweep — it churns once per step, so its cost scales with depth. Opt
+# in with OPS="backup-send backup-delta-chain" (backup-send must precede it).
 set -euo pipefail
 
 SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
diff --git a/bench/run_op.sh b/bench/run_op.sh
index 73aa769..2b51427 100755
--- a/bench/run_op.sh
+++ b/bench/run_op.sh
@@ -2,8 +2,8 @@
 #
 # run_op.sh OP TOOL RUN_ID
 #
-#   OP     - backup-send | backup-fetch | backup-delta |
-#            backup-delta-summaries | wal-receive       (data-movement operation)
+#   OP     - backup-send | backup-fetch | backup-delta | backup-delta-summaries |
+#            backup-delta-chain | wal-receive            (data-movement operation)
 #   TOOL   - walrus | walg | pgbackrest                  (implementation)
 #   RUN_ID - free-form label, e.g. r1 / 2026-06-22
 #
@@ -11,11 +11,14 @@
 # local), cross-tool where an equivalent exists. Counterpart of run.sh, which
 # benches the archive_command (wal-push) path; this covers the rest of walrus:
 #
-#   backup-send             base backup -> S3   walrus/wal-g backup-push --full | pgbackrest backup --type=full
+#   backup-send             base backup -> S3   walrus/wal-g backup-push ... --full |
+#                                               pgbackrest backup --type=full
 #   backup-fetch            restore   <- S3     walrus/wal-g backup-fetch       | pgbackrest restore
 #   backup-delta            delta backup -> S3  walrus/wal-g backup-push (wi1)  | pgbackrest backup --type=incr
 #   backup-delta-summaries  delta from WAL      walrus backup-push              | (walrus-only)
 #                           summaries -> S3      --delta-from-wal-summaries
+#   backup-delta-chain      N-deep delta chain  walrus/wal-g backup-push xN     | pgbackrest backup --type=incr xN
+#                           + restore of leaf    (origin=LATEST), then backup-fetch LATEST
 #   wal-receive             stream WAL from PG  walrus/wal-g wal-receive        | (no pgbackrest peer)
 #
 # Delta cells need a parent full backup (backup-send must precede them) and a
@@ -28,6 +31,14 @@
 # anchor to chain root. Delta size is S3-inventory byte growth across the push,
 # not on-disk cluster size.
 #
+# backup-delta-chain builds a real DELTA_MAX_STEPS-deep chain: each step churns,
+# drains, then pushes a delta with WALG_DELTA_ORIGIN=LATEST so it extends the
+# PREVIOUS delta (LATEST_FULL would re-anchor each to the root, leaving restore
+# depth 2). Every step is timed + sized on its own (chain_metrics.txt), then a
+# backup-fetch LATEST walks full + all N deltas to exercise restore-time replay.
+# Its churn is per-step and INSIDE the sampler window, so the daemon's archiving
+# during churn is sampled too; the per-step push timings isolate the push.
+#
 # walrus's walsender (serving WAL via the replication protocol) has no CLI entry
 # point yet, so wal-send is intentionally absent.
 #
@@ -55,7 +66,7 @@ LOG_TAG=op
 load_config
 
 if [[ $# -ne 3 ]]; then
-  echo "usage: $0 <backup-send|backup-fetch|backup-delta|backup-delta-summaries|wal-receive> <walrus|walg|pgbackrest> <run_id>" >&2
+  echo "usage: $0 <backup-send|backup-fetch|backup-delta|backup-delta-summaries|backup-delta-chain|wal-receive> <walrus|walg|pgbackrest> <run_id>" >&2
   exit 2
 fi
 OP="$1"
@@ -63,8 +74,8 @@ TOOL="$2"
 RUN_ID="$3"
 
 case "${OP}" in
-  backup-send|backup-fetch|backup-delta|backup-delta-summaries|wal-receive) ;;
-  *) echo "error: OP must be backup-send|backup-fetch|backup-delta|backup-delta-summaries|wal-receive, got '${OP}'" >&2; exit 2 ;;
+  backup-send|backup-fetch|backup-delta|backup-delta-summaries|backup-delta-chain|wal-receive) ;;
+  *) echo "error: OP must be backup-send|backup-fetch|backup-delta|backup-delta-summaries|backup-delta-chain|wal-receive, got '${OP}'" >&2; exit 2 ;;
 esac
 case "${TOOL}" in
   walrus|walg|pgbackrest) ;;
@@ -80,9 +91,12 @@ if [[ "${OP}" == "backup-delta-summaries" && "${TOOL}" != "walrus" ]]; then
   exit 2
 fi
 
-# Delta ops drive a churn phase, then a delta push; group them for branch tests.
+# Single-delta ops drive one churn phase, then one delta push; group for branches.
 IS_DELTA=0
 [[ "${OP}" == "backup-delta" || "${OP}" == "backup-delta-summaries" ]] && IS_DELTA=1
+# Chain op churns + pushes per step inside the timed loop (not the single step 1b).
+IS_CHAIN=0
+[[ "${OP}" == "backup-delta-chain" ]] && IS_CHAIN=1
 
 # Backup-push ops (full + delta) take a base backup, whose pg_backup_stop blocks
 # on BackupWaitWalArchive until the backup's WAL is archived. So the tool's
@@ -90,7 +104,7 @@ IS_DELTA=0
 # process plus the mostly-idle daemon; for walrus that baseline is ~27 MB).
 # backup-fetch (restore) and wal-receive need no archiver.
 NEEDS_ARCHIVE=0
-case "${OP}" in backup-send|backup-delta|backup-delta-summaries) NEEDS_ARCHIVE=1 ;; esac
+case "${OP}" in backup-send|backup-delta|backup-delta-summaries|backup-delta-chain) NEEDS_ARCHIVE=1 ;; esac
 
 : "${BUCKET:?set BUCKET in config.env}"
 : "${PGUSER:?set PGUSER in config.env}"
@@ -122,7 +136,7 @@ WAL_RECEIVE_SECONDS="${WAL_RECEIVE_SECONDS:-300}"
 # Delta cells: churn window that dirties pages between the parent full and the
 # delta push, and the delta-chain depth handed to walrus/wal-g (WALG_DELTA_MAX_STEPS).
 DELTA_CHURN_SECONDS="${DELTA_CHURN_SECONDS:-300}"
-DELTA_MAX_STEPS="${DELTA_MAX_STEPS:-7}"
+DELTA_MAX_STEPS="${DELTA_MAX_STEPS:-3}"
 DELTA_ORIGIN="${DELTA_ORIGIN:-LATEST_FULL}"
 
 case "${TOOL}" in
@@ -161,6 +175,28 @@ inv_size() {
     | awk '/Total Size:/ {print $3}' | tail -1
 }
 
+# Fail fast if no parent full backup exists for a delta to anchor to. Without one,
+# backup-push silently emits a FULL (mislabeled as a delta) and inv-growth sizing
+# reports a full's bytes. op_matrix runs backup-send first; this guards lone runs.
+assert_delta_parent() {
+  local roots
+  if [[ "${TOOL}" == "pgbackrest" ]]; then
+    # full dirs end in 'F/', incr (delta) in 'I/'; '|| true' so a failed ls counts as 0, not an errexit abort
+    roots="$(sudo aws s3 ls "s3://${BUCKET}${PGBACKREST_REPO_PATH}/backup/${PGBACKREST_STANZA}/" \
+      --region "${AWS_REGION}" 2>/dev/null | awk '/ PRE / && /F\/$/ {n++} END{print n+0}' || true)"
+  else
+    # chain root = base_<lsn> without the _D_ delta suffix; '|| true' so a failed ls counts as 0 roots
+    roots="$(sudo aws s3 ls "${WALG_PREFIX}/basebackups_005/" \
+      --region "${AWS_REGION}" 2>/dev/null | awk '/ PRE base_/ && !/_D_/ {n++} END{print n+0}' || true)"
+  fi
+  if [[ "${roots:-0}" -eq 0 ]]; then
+    echo "error: no parent full backup under ${INV_PREFIX}; run backup-send ${TOOL} ${RUN_ID} first" >&2
+    echo "       (a delta with no parent silently becomes a full, corrupting the measurement)" >&2
+    exit 1
+  fi
+  log "parent check: ${roots} full backup(s) under ${INV_PREFIX}"
+}
+
 # --- pre-flight: DB seeded? (backup-send + wal-receive need a populated DB) ---
 [[ "${OP}" == "backup-fetch" ]] || require_seeded
 
@@ -197,7 +233,7 @@ sudo -u postgres pgbackrest --stanza="${STANZA}" stanza-create || true
 # backup (full or incr) needs WAL archiving live (pgbackrest blocks on the
 # start-WAL archive), so point archive_command at pgbackrest and drain. restore
 # reads only the repo. backup-delta (incr) churns + drains in the delta-prep step.
-if [[ "${OP}" == "backup-send" || "${OP}" == "backup-delta" ]]; then
+if [[ "${OP}" == "backup-send" || "${OP}" == "backup-delta" || "${OP}" == "backup-delta-chain" ]]; then
   ARCHIVE_CMD="pgbackrest --stanza=${STANZA} archive-push %p"
   sudo -u postgres "${PGBIN}/psql" -p 5432 -tA \
     -c "ALTER SYSTEM SET archive_library = '';" \
@@ -238,6 +274,9 @@ if [[ "${OP}" == "backup-send" || "${OP}" == "wal-receive" ]]; then
   CHECKPOINT_BEFORE_WORKLOAD=1
 fi
 
+# Delta ops must extend an existing full; bail before churning if none exists.
+[[ "${IS_DELTA}" -eq 1 || "${IS_CHAIN}" -eq 1 ]] && assert_delta_parent
+
 # --- step 1b: delta prep — churn between the parent full and the delta push ---
 # The default delta map walks ARCHIVED WAL, so the churn WAL must reach the repo
 # before the push. The tool's archiver is already live (step 1, NEEDS_ARCHIVE)
@@ -272,7 +311,7 @@ case "${OP}" in
   backup-send)
     log "base backup -> ${INV_PREFIX} (full)"
     case "${TOOL}" in
-      walrus) run_tool "${WALRUS_BIN}" backup-push --full ;;
+      walrus) run_tool "${WALRUS_BIN}" backup-push "${PGDATA_DIR}" --full ;;
       walg)   run_tool "${WALG_BIN}" backup-push "${PGDATA_DIR}" --full ;;
       pgbackrest) sudo -u postgres pgbackrest --stanza="${PGBACKREST_STANZA}" backup --type=full ;;
     esac
@@ -285,7 +324,7 @@ case "${OP}" in
     case "${TOOL}" in
       walrus) run_tool env WALG_DELTA_MAX_STEPS="${DELTA_MAX_STEPS}" \
                 WALG_DELTA_ORIGIN="${DELTA_ORIGIN}" \
-                "${WALRUS_BIN}" backup-push --pgdata "${PGDATA_DIR}" ;;
+                "${WALRUS_BIN}" backup-push "${PGDATA_DIR}" ;;
       walg)   run_tool env WALG_DELTA_MAX_STEPS="${DELTA_MAX_STEPS}" \
                 WALG_DELTA_ORIGIN="${DELTA_ORIGIN}" \
                 "${WALG_BIN}" backup-push "${PGDATA_DIR}" ;;
@@ -300,10 +339,93 @@ case "${OP}" in
     log "delta-from-wal-summaries backup -> ${INV_PREFIX} (origin=${DELTA_ORIGIN}; parent inventory ${inv_before} B)"
     run_tool env WALG_DELTA_MAX_STEPS="${DELTA_MAX_STEPS}" \
       WALG_DELTA_ORIGIN="${DELTA_ORIGIN}" \
-      "${WALRUS_BIN}" backup-push --pgdata "${PGDATA_DIR}" --delta-from-wal-summaries
+      "${WALRUS_BIN}" backup-push "${PGDATA_DIR}" --delta-from-wal-summaries
     inv_after="$(inv_size)"; inv_after="${inv_after:-0}"
     BYTES=$(( inv_after - inv_before )); (( BYTES < 0 )) && BYTES=0
     ;;
+  backup-delta-chain)
+    # Build a DELTA_MAX_STEPS-deep chain (origin=LATEST: each delta extends the
+    # prior one). Per step: churn, drain, then time + size the push alone. BYTES
+    # accumulates per-step delta payloads (not END-START inventory: that would
+    # also count the inter-step churn WAL). chain_metrics.txt holds the breakdown.
+    DELTA_ORIGIN=LATEST
+    CHAIN_METRICS="${RESULT_DIR}/chain_metrics.txt"
+    push_s_total=0
+    chain_rows=""
+    log "delta chain: ${DELTA_MAX_STEPS} steps (origin=LATEST, cap WALG_DELTA_MAX_STEPS=${DELTA_MAX_STEPS}) -> ${INV_PREFIX}"
+    for ((i=1; i<=DELTA_MAX_STEPS; i++)); do
+      log "chain step ${i}/${DELTA_MAX_STEPS}: checkpoint + churn ${DELTA_CHURN_SECONDS}s"
+      checkpoint_pg
+      CHECKPOINT_BEFORE_WORKLOAD=1
+      CH_ENV=(PGHOST="${PGHOST_DRIVER}" PGUSER="${PGUSER}" PGPASSWORD="${PGPASSWORD}"
+        DURATION="${DELTA_CHURN_SECONDS}" CHURN_ROWS="${CHURN_ROWS:-2000000}")
+      [[ -n "${BURST_WORKERS:-}" ]] && CH_ENV+=("WORKERS=${BURST_WORKERS}")
+      if ! env "${CH_ENV[@]}" bash "${SCRIPT_DIR}/scripts/driver/workload_burst.sh"; then
+        mark_invalid "chain step ${i} churn degraded (non-comparable delta)"
+      fi
+      drain_backlog 5 600
+      step_before="$(inv_size)"; step_before="${step_before:-0}"
+      step_t0="$(date +%s.%N)"
+      case "${TOOL}" in
+        walrus) run_tool env WALG_DELTA_MAX_STEPS="${DELTA_MAX_STEPS}" WALG_DELTA_ORIGIN=LATEST \
+                  "${WALRUS_BIN}" backup-push "${PGDATA_DIR}" ;;
+        walg)   run_tool env WALG_DELTA_MAX_STEPS="${DELTA_MAX_STEPS}" WALG_DELTA_ORIGIN=LATEST \
+                  "${WALG_BIN}" backup-push "${PGDATA_DIR}" ;;
+        pgbackrest) sudo -u postgres pgbackrest --stanza="${PGBACKREST_STANZA}" backup --type=incr ;;
+      esac
+      step_t1="$(date +%s.%N)"
+      step_after="$(inv_size)"; step_after="${step_after:-0}"
+      step_bytes=$(( step_after - step_before )); (( step_bytes < 0 )) && step_bytes=0
+      step_s="$(awk -v a="${step_t0}" -v b="${step_t1}" 'BEGIN{printf "%.3f", b-a}')"
+      step_mbps="$(awk -v by="${step_bytes}" -v s="${step_s}" 'BEGIN{printf "%.2f",(s>0)?by/1e6/s:0}')"
+      push_s_total="$(awk -v a="${push_s_total}" -v b="${step_s}" 'BEGIN{printf "%.3f", a+b}')"
+      BYTES=$(( BYTES + step_bytes ))
+      log "chain step ${i}: elapsed=${step_s}s delta=${step_bytes} B (${step_mbps} MB/s)"
+      chain_rows+="step=${i} elapsed_s=${step_s} bytes=${step_bytes} mb_s=${step_mbps}"$'\n'
+    done
+
+    log "chain restore: backup-fetch LATEST (walks full + ${DELTA_MAX_STEPS} deltas) -> ${RESTORE_DIR}"
+    run_root "${RESTORE_DIR}" <<'REMOTE'
+set -euo pipefail
+RESTORE_DIR="$1"
+rm -rf "${RESTORE_DIR}"
+install -d -o postgres -g postgres "${RESTORE_DIR}"
+REMOTE
+    restore_t0="$(date +%s.%N)"
+    case "${TOOL}" in
+      walrus) run_tool "${WALRUS_BIN}" backup-fetch "${RESTORE_DIR}" LATEST ;;
+      walg)   run_tool "${WALG_BIN}" backup-fetch "${RESTORE_DIR}" LATEST ;;
+      pgbackrest)
+        sudo -u postgres pgbackrest --stanza="${PGBACKREST_STANZA}" \
+          --pg1-path="${RESTORE_DIR}" --type=none restore ;;
+    esac
+    restore_t1="$(date +%s.%N)"
+    restore_s="$(awk -v a="${restore_t0}" -v b="${restore_t1}" 'BEGIN{printf "%.3f", b-a}')"
+    restore_bytes="$(sudo du -sb "${RESTORE_DIR}" | awk '{print $1}')"
+    log "chain restore: elapsed=${restore_s}s restored=${restore_bytes} B"
+    sudo rm -rf "${RESTORE_DIR}"
+
+    run_root "${CHAIN_METRICS}" "${TOOL}" "${RUN_ID}" "${DELTA_MAX_STEPS}" \
+      "${push_s_total}" "${BYTES}" "${restore_s}" "${restore_bytes}" "${chain_rows}" <<'REMOTE'
+set -euo pipefail
+CHAIN_METRICS="$1"; TOOL="$2"; RUN_ID="$3"; STEPS="$4"; PUSH_S_TOTAL="$5"
+TOTAL_BYTES="$6"; RESTORE_S="$7"; RESTORE_BYTES="$8"; ROWS="$9"
+{
+  echo "op=backup-delta-chain"
+  echo "tool=${TOOL}"
+  echo "run_id=${RUN_ID}"
+  echo "delta_origin=LATEST"
+  echo "chain_steps=${STEPS}"
+  printf '%s' "${ROWS}"
+  echo "push_s_total=${PUSH_S_TOTAL}"
+  echo "chain_delta_bytes=${TOTAL_BYTES}"
+  echo "restore_s=${RESTORE_S}"
+  echo "restore_bytes=${RESTORE_BYTES}"
+} >"${CHAIN_METRICS}"
+chown postgres:postgres "${CHAIN_METRICS}" 2>/dev/null || true
+cat "${CHAIN_METRICS}"
+REMOTE
+    ;;
   backup-fetch)
     log "restore LATEST -> ${RESTORE_DIR}"
     run_root "${RESTORE_DIR}" <<'REMOTE'
diff --git a/bench/scripts/sut/05_install_pgbackrest.sh b/bench/scripts/sut/05_install_pgbackrest.sh
index cb130bb..f09302e 100755
--- a/bench/scripts/sut/05_install_pgbackrest.sh
+++ b/bench/scripts/sut/05_install_pgbackrest.sh
@@ -19,7 +19,7 @@
 set -euo pipefail
 
 BUCKET="${BUCKET:-${1:-}}"
-UPLOAD_CONCURRENCY="${UPLOAD_CONCURRENCY:-${2:-16}}"
+UPLOAD_CONCURRENCY="${UPLOAD_CONCURRENCY:-${2:-4}}"
 AWS_REGION="${AWS_REGION:-us-east-1}"
 STANZA="${PGBACKREST_STANZA:-walbench}"
 REPO_PATH="${PGBACKREST_REPO_PATH:-/pgbackrest-bench}"
diff --git a/bench/scripts/sut/11_write_walg_env.sh b/bench/scripts/sut/11_write_walg_env.sh
index ea18c35..94edb74 100755
--- a/bench/scripts/sut/11_write_walg_env.sh
+++ b/bench/scripts/sut/11_write_walg_env.sh
@@ -14,7 +14,11 @@
 set -euo pipefail
 
 BUCKET="${BUCKET:-${1:-}}"
-UPLOAD_CONCURRENCY="${UPLOAD_CONCURRENCY:-${2:-16}}"
+UPLOAD_CONCURRENCY="${UPLOAD_CONCURRENCY:-${2:-4}}"
+# backup-fetch / wal-fetch download fan-out; defaults to upload concurrency so a
+# single concurrency sweep tunes both directions (override DOWNLOAD_CONCURRENCY
+# to decouple).
+DOWNLOAD_CONCURRENCY="${DOWNLOAD_CONCURRENCY:-${UPLOAD_CONCURRENCY}}"
 ENV_FILE="${ENV_FILE:-/etc/postgresql/wal-g.env}"
 AWS_REGION="${AWS_REGION:-us-east-1}"
 COMPRESSION_METHOD="${WALG_COMPRESSION_METHOD:-lz4}"
@@ -81,7 +85,7 @@ if [[ -z "${ACCESS_KEY}" || -z "${SECRET_KEY}" ]]; then
   exit 1
 fi
 
-echo "=== Writing ${ENV_FILE} (UPLOAD_CONCURRENCY=${UPLOAD_CONCURRENCY}) ==="
+echo "=== Writing ${ENV_FILE} (UPLOAD_CONCURRENCY=${UPLOAD_CONCURRENCY} DOWNLOAD_CONCURRENCY=${DOWNLOAD_CONCURRENCY}) ==="
 install -d -o postgres -g postgres -m 0755 "$(dirname "${ENV_FILE}")"
 umask 077
 tmp="$(mktemp)"
@@ -90,6 +94,7 @@ WALG_S3_PREFIX=${WALG_S3_PREFIX}
 AWS_REGION=${AWS_REGION}
 WALG_COMPRESSION_METHOD=${COMPRESSION_METHOD}
 WALG_UPLOAD_CONCURRENCY=${UPLOAD_CONCURRENCY}
+WALG_DOWNLOAD_CONCURRENCY=${DOWNLOAD_CONCURRENCY}
 PGHOST=/var/run/postgresql
 PGDATA=/dat/18/data
 AWS_ACCESS_KEY_ID=${ACCESS_KEY}
diff --git a/docs/DESIGN.md b/docs/DESIGN.md
index a13b503..cbbb256 100644
--- a/docs/DESIGN.md
+++ b/docs/DESIGN.md
@@ -1,31 +1,25 @@
 ## Goal
 
 Functional parity with wal-g's Postgres surface so an on-prem shop can
-swap binaries without touching `archive_command`, sentinels, bucket
-layout, or operator runbooks. North star: a backup written by either
-tool restorable by the other.
+swap binaries without touching `archive_command`, sentinels, or bucket
+layout. Backups written by either tool are restorable by either.
 
-Optimized for no-overcommit hosts: every pipeline stage is streaming,
-no full-segment or full-file buffering.
+Optimized for no-overcommit hosts; see https://www.postgresql.org/docs/current/kernel-resources.html#LINUX-MEMORY-OVERCOMMIT
 
 ## Runtime
 
-Runtime flavor is picked per command before construction
-(`Cli::worker_threads`), overridable via `--threads` / `WALG_THREADS`;
-1 builds current-thread, >1 multi-thread with that many workers.
+Runtime flavor is picked per command (`Cli::worker_threads`),
+overridable via `--threads` / `WALG_THREADS`: 1 builds a current-thread
+runtime, >1 multi-thread with that many workers.
 
-Default 1 for most commands: `wal-push` as `archive_command` runs once
-per 16 MB segment; multi-thread runtime would spawn worker threads +
-per-thread malloc arenas for nothing. Daemon mode stays at 1 since I/O
-is the bottleneck.
-
-Commands whose fan-out does real CPU work per task (compress, encrypt,
-checksum, TLS) default to multi-thread capped by the matching
-concurrency knob, otherwise `WALG_UPLOAD_CONCURRENCY` tasks timeshare
-one core and uploads overlap only on network: `backup-push`
+Default 1 for most commands. `wal-push` runs once per 16 MB segment as
+`archive_command`, so extra worker threads would only add per-thread
+malloc arenas; daemon mode stays at 1 (I/O bound). Commands with real
+per-task CPU work (compress, encrypt, checksum, TLS) default to
+multi-thread capped by the matching concurrency knob: `backup-push`
 min(cores, upload concurrency); `backup-fetch` / `wal-prefetch` /
-`wal-restore` min(cores, download concurrency). Worker count stays
-bounded so arenas + stacks don't balloon and postgres keeps its cores.
+`wal-restore` min(cores, download concurrency). Bounded so arenas +
+stacks stay small and postgres keeps its cores.
 
 ## Storage trait
 
@@ -35,116 +29,119 @@ async fn get(&self, key: &str) -> Result<AsyncReader>;
 ```
 
 `AsyncReader = Pin<Box<dyn AsyncRead + Send + Unpin>>`. Compression and
-encryption are also `AsyncReader`s, so push pipelines as
-`File → compress → encrypt → storage.put` without materializing
-anything. `size_hint` lets s3 pick single-PUT vs multipart, left unset
-under compression/encryption since variable-length output makes the
-hint lie, then the unknown-size path takes over (see S3).
+encryption are also `AsyncReader`s, so a push pipeline is
+`File → compress → encrypt → storage.put` with nothing materialized.
+`size_hint` lets S3 pick single-PUT vs multipart, left unset under
+compression/encryption (variable-length output makes the hint lie) so
+the unknown-size path takes over.
 
 Pipeline order matches wal-g: push `raw → compress → encrypt → storage`,
-fetch inverse. Sentinel / metadata JSON bypass compress+encrypt entirely
-(wal-g `UploadDto` behavior), so `backup-list` and `delete` work against
-an encrypted bucket without the key.
+fetch inverse. Sentinel / metadata JSON bypass compress+encrypt (wal-g
+`UploadDto` behavior), so `backup-list` and `delete` work against an
+encrypted bucket without the key.
 
 ### S3
 
 Hand-rolled SigV4 instead of `aws-sdk-rust` (multi-MB dependency
-footprint) or `object_store` (arrow deps, abstracts away streaming
-control). UNSIGNED-PAYLOAD over HTTPS streams bodies without hashing up
-front, TLS covers integrity. Multipart parts buffer in memory so a
-transient retry replays identical bytes, the safety net since
-UNSIGNED-PAYLOAD leaves the signature off the body. Unknown-size bodies
-buffer up to the single-PUT cap and skip multipart's
-create/upload/complete trio when they fit, so a compressed 16 MiB
-segment lands in one PUT.
-
-Credentials resolve as a small chain (`storage/creds.rs`): explicit
-static keys (`AWS_ACCESS_KEY_ID`/`_SECRET_ACCESS_KEY`, optional
-`AWS_SESSION_TOKEN`), else the EC2 metadata service (IMDS). IMDS uses
-IMDSv2 (token PUT then authenticated GET, falling back to unauthenticated
-v1 if the token is refused), caching temporary creds and refetching 5 min
-before expiry; the lock spans the fetch so concurrent signers single-flight.
-Set `AWS_EC2_METADATA_DISABLED` to force the static-only path,
-`AWS_EC2_METADATA_SERVICE_ENDPOINT` to override the link-local address.
-Rotating IMDS keys would break the key-based server-side-copy identity, so
-IMDS folds to a constant identity. Profile/shared-credentials files and STS
-web-identity (`AWS_WEB_IDENTITY_TOKEN_FILE`) are not implemented.
+footprint) or `object_store` (arrow deps, hides streaming control).
+UNSIGNED-PAYLOAD over HTTPS streams bodies without hashing up front, TLS
+covers integrity. Multipart parts buffer in memory so a transient retry
+replays identical bytes, the safety net since UNSIGNED-PAYLOAD leaves
+the body unsigned. Unknown-size bodies buffer up to the single-PUT cap
+and skip the multipart create/upload/complete trio when they fit, so a
+compressed 16 MiB segment lands in one PUT.
+
+Credentials resolve as a chain (`storage/creds.rs`): static keys
+(`AWS_ACCESS_KEY_ID`/`_SECRET_ACCESS_KEY`, optional `AWS_SESSION_TOKEN`),
+else IMDSv2 (token PUT then authenticated GET, falling back to
+unauthenticated v1 if the token is refused), caching temporary creds and
+refetching 5 min before expiry; the lock spans the fetch so concurrent
+signers single-flight. `AWS_EC2_METADATA_DISABLED` forces the
+static-only path, `AWS_EC2_METADATA_SERVICE_ENDPOINT` overrides the
+link-local address. Rotating IMDS keys would break the key-based
+server-side-copy identity, so IMDS folds to a constant identity.
+Profile/shared-credentials files and STS web-identity
+(`AWS_WEB_IDENTITY_TOKEN_FILE`) are not implemented.
 
 ### GCS
 
-Service-account JWT (RS256 via aws-lc-rs) exchanged for OAuth bearer,
-cached until 60 s before expiry. Uploads stream via `uploadType=media`
-chunked transfer. Resumable uploads and metadata-server auth not
-implemented (see PLAN.md).
+Service-account JWT (RS256 via aws-lc-rs) exchanged for an OAuth bearer,
+cached until 60 s before expiry. Uploads stream via `uploadType=media`.
+Resumable uploads and metadata-server auth are not implemented.
 
 ### Retry classification
 
-`StorageError::Http { status, body }` + `Transport` let `is_transient()`
-classify retryable failures. Reads retry unconditionally on transient.
-The `RetryingStorage` wrapper retries small bounded-size `put`s
-(sentinels, manifests, history files) by buffering the body once;
-larger or unknown-size streams pass through to S3's own in-place retry,
-which replays its per-PUT / per-part buffer. `fs` skips the wrapper, no
-transient classes worth wrapping.
+`is_transient()` classifies `StorageError::Http { status, body }` +
+`Transport`. Reads retry unconditionally on transient. `RetryingStorage`
+retries small bounded-size `put`s (sentinels, manifests, history files)
+by buffering the body once; larger or unknown-size streams pass through
+to S3's own in-place retry, which replays its per-PUT / per-part buffer.
+`fs` skips the wrapper.
 
 ## Compression
 
 `async_compression` bufread encoders chain as
-`File → BufReader → Encoder → put`, no thread per stream. First
-iteration used `spawn_blocking` + mpsc around sync zstd: worked, but
-143 MB VmPeak vs 7.3 MB after the switch.
-
-`wal-fetch` probes the configured extension first, then `.zst`, then
-bare, then remaining codec extensions, handling buckets with
+`File → BufReader → Encoder → put`: no thread per stream, resident
+memory stays tiny. `wal-fetch` probes the configured extension first,
+then the other codec extensions and bare, handling buckets with
 mixed-method writes across a compression-setting migration.
 
 ## Replication client
 
 Speaks the PG replication wire protocol directly, no `pg_basebackup`
 subprocess, no disk spool. PG14- and PG15+ BASE_BACKUP wire forms both
-handled. Auth: trust, cleartext, SCRAM-SHA-256; MD5 rejected. Without
-`--pgdata`, `backup-push` is purely network-driven (sidecar host needs
-no filesystem access, `data_dir` filled from `SHOW data_directory`).
-`PGHOST` starting with `/` dials a Unix socket per libpq convention,
-skipping TLS.
+handled. Auth: trust, cleartext, SCRAM-SHA-256; MD5 rejected. `PGHOST`
+starting with `/` dials a Unix socket per libpq convention, skipping TLS.
 
 A tokio task owns the connection and emits `BackupEvent`s over mpsc;
 each archive carries an mpsc of `Bytes` chunks wrapped as `ChannelReader`.
-Backpressure flows naturally: upload stalls → channel fills → pump's
-send blocks → TCP window closes. `ChannelReader` loops on empty chunks,
-a real PG 13 stream contains empty CopyData frames mid-stream and an
-empty poll-fill reads as EOF per the AsyncRead contract.
+Backpressure flows naturally: upload stall → channel fills → pump's send
+blocks → TCP window closes. `ChannelReader` loops on empty chunks, since
+a real PG 13 stream carries empty CopyData frames mid-stream and an empty
+poll-fill would otherwise read as EOF.
 
 ### TLS
 
 `sslmode` mirrors libpq exactly: `disable | allow | prefer (default) |
 require | verify-ca | verify-full`. `prefer`/`require` encrypt without
-authenticating (matches libpq, same operator surprise). `verify-ca`
-delegates to `WebPkiServerVerifier`, suppressing only
-`NotValidForName{,Context}`.
+authenticating (matches libpq). `verify-ca` delegates to
+`WebPkiServerVerifier`, suppressing only `NotValidForName{,Context}`.
 
-Client certificate auth (mutual TLS): set `PGSSLCERT` and `PGSSLKEY` to a
-PEM cert chain and unencrypted private key (PKCS#8 / PKCS#1 / SEC1) and
-walrus presents them in every TLS mode. Both must be set together;
-encrypted keys (`PGSSLPASSWORD`) and libpq's `~/.postgresql/postgresql.{crt,key}`
-default location aren't honored, matching the env-only `PGSSLROOTCERT` handling.
+Client certificate auth (mTLS): set `PGSSLCERT` and `PGSSLKEY` to a PEM
+cert chain and unencrypted private key (PKCS#8 / PKCS#1 / SEC1), presented
+in every TLS mode; both required together. Encrypted keys
+(`PGSSLPASSWORD`) and libpq's `~/.postgresql/postgresql.{crt,key}` default
+location aren't honored, matching the env-only `PGSSLROOTCERT` handling.
 
 ## Tar streamer
 
-One `spawn_blocking` task per archive bridges async→sync via
-`SyncIoBridge`, re-tars with tablespace path remap, rotates parts at
-`WALG_TAR_SIZE_THRESHOLD`, tees `global/pg_control` into its own part
-uploaded last, collects per-file metadata.
+The BASE_BACKUP path uses `astral-tokio-tar` async archive and builder
+APIs. One task per archive re-tars with tablespace path remap, rotates
+parts at `WALG_TAR_SIZE_THRESHOLD`, tees `global/pg_control` into its own
+part uploaded last, and collects per-file metadata. Part bytes flow
+through bounded mpsc chunks into upload workers, overlapping
+compression/encryption/storage with re-tarring without a sync bridge
+thread.
+
+With a positional `PGDATA`, `backup-push` reads the local data directory
+instead of BASE_BACKUP: it brackets the copy with `pg_backup_start` /
+`pg_backup_stop`, walks `$PGDATA` plus tablespace symlink targets, and
+runs `WALG_UPLOAD_CONCURRENCY` pack workers each streaming one
+size-bounded tar part. This is the throughput path for local full and
+delta backups; the replication path remains a single source stream
+bounded by the BASE_BACKUP protocol. Without `PGDATA` the push is purely
+network-driven (`data_dir` from `SHOW data_directory`), so a sidecar host
+needs no filesystem access.
 
 `backup-fetch` extracts manually rather than via `Archive::unpack`: the
 tar crate's canonicalize guard refuses writes through `pg_tblspc/<oid>`
-symlinks, which legitimate PG restores require. `..`-traversal still
-blocked. Tablespace symlinks created before extraction so the first
-entry under `pg_tblspc/<oid>/` can't materialize a real directory there.
+symlinks that legitimate restores require. `..`-traversal stays blocked.
+Tablespace symlinks are created before extraction so the first entry
+under `pg_tblspc/<oid>/` can't materialize a real directory there.
 
-Uploads drain through a `JoinSet` bounded by
-`Semaphore(WALG_UPLOAD_CONCURRENCY)`, JoinSet over `FuturesUnordered`
-so the bail path aborts in-flight tasks instead of detaching them.
+BASE_BACKUP uploads drain through `BoundedTasks`, filesystem-source
+workers through a `JoinSet`. Both are bounded by `WALG_UPLOAD_CONCURRENCY`,
+and their bail paths abort in-flight work instead of detaching it.
 
 ## Delta backups
 
@@ -154,86 +151,76 @@ Two per-file payload formats, magic-dispatched on apply:
 - PG17 native INCREMENTAL (magic `0xd3ae1f0d`), built from
   `pg_wal/summaries/*.summary` via `--delta-from-wal-summaries`
 
-`IncrementBodyReader` streams header + dirty pages with one BLCKSZ
-scratch page, no file-sized buffer regardless of dirty density (naive
-buffering worst case: 1 GiB resident per concurrent paged file). Three
-outcomes per paged file: incremented, skipped (entry omitted, metadata
-record kept), passthrough. Dirty blocks past EOF filtered, apply-side
+`IncrementBodyReader` streams header + dirty pages with one BLCKSZ scratch
+page, so resident memory is independent of dirty density. Three outcomes
+per paged file: incremented, skipped (entry omitted, metadata record
+kept), passthrough. Dirty blocks past EOF filtered, apply-side
 `read_exact` would underflow otherwise.
 
-Map build fails closed: on any WAL-walk error, warn + fall back to full
-*and* leave `increment_from` unset. The sentinel never claims a delta
-the bucket can't deliver. Fetch walks `increment_from` root→leaf,
-capped at 64 steps + visited-set against cyclic sentinels; only the
-leaf's tablespace `Spec` is applied (it's a property of pgdata, not
-LSN).
-
-In-memory delta map is `BTreeMap<RelFileNode, RoaringBitmap>`, matching
-wal-g's `map[RelFileNode]*roaring.Bitmap`. A `BTreeSet<u32>` costs a flat
-~13 B/block regardless of density, so a large-rewrite delta (VACUUM FULL,
-CREATE INDEX, bulk load: 100 GiB rel ≈ 13 M blocks) balloons to ~160 MB
-resident; roaring run/bitmap-compresses dense rewrites to ~1.6 MB and
-keeps sparse OLTP deltas comparable. The on-disk format is a flat tuple
-list either way, so it costs nothing in interop.
-
-The sidecar (`<group>_delta`) is never materialized as a struct: the
-running working file accumulates location tuples append-only across the
-group's 16 segments, then completion appends the boundary-record tuples,
-terminator, and parser seed and streams the file to the bucket. The map
-build folds each sidecar's tuples back in one at a time. So neither the
-sidecar write nor the map read holds a whole group's locations in memory.
-
-Walparser operates on byte slices rather than wal-g's reader-of-reader
-chains; one segment is 16 MiB and already in memory. wal summaries
-parsing cross-referenced field-by-field against postgres
-`src/common/blkreftable.c` (see WALG_COMPAT.md).
+Map build fails closed: any WAL-walk error warns, falls back to full, and
+leaves `increment_from` unset, so the sentinel never claims a delta the
+bucket can't deliver. Fetch walks `increment_from` root→leaf, capped at
+64 steps with a visited-set against cyclic sentinels; only the leaf's
+tablespace `Spec` is applied (a property of pgdata, not LSN).
+
+The in-memory map is `BTreeMap<RelFileNode, RoaringBitmap>`, matching
+wal-g's `map[RelFileNode]*roaring.Bitmap`; roaring keeps dense rewrites
+(VACUUM FULL, CREATE INDEX, bulk load) from ballooning resident memory
+while staying comparable on sparse OLTP deltas. The on-disk format is a
+flat tuple list either way, so it costs nothing in interop.
+
+The sidecar (`<group>_delta`) is never materialized as a struct: a working
+file accumulates location tuples append-only across the group's 16
+segments, then completion appends the boundary-record tuples, terminator,
+and parser seed and streams the file out. Map build folds each sidecar's
+tuples back in one at a time, so neither the write nor the read holds a
+whole group's locations in memory.
+
+Walparser operates on byte slices (one segment is 16 MiB, already in
+memory). WAL-summary and native INCREMENTAL parsing cross-referenced
+field-by-field against postgres `src/common/blkreftable.c`,
+`src/backend/backup/basebackup.c`, and
+`src/bin/pg_combinebackup/reconstruct.c`.
 
 ## Encryption
 
-libsodium `crypto_secretstream_xchacha20poly1305` via `dryoc`
-(pure Rust, no C toolchain). Key transforms `none | hex | base64`
-mirror wal-g, `none` requires ≥ 25 bytes so low-entropy keys can't
-sneak through the legacy path.
-
-OpenPGP intentionally unsupported. rPGP pulls dozens of transitives and
-its async wrapper buffers whole payloads, breaking the streaming
-contract; symmetric AEAD already covers the single-tenant on-prem
-threat model; a migrating PGP bucket re-encrypts once. To prevent
-silent plaintext regressions, any `WALG_PGP_*` env var is a hard error
-at startup.
+libsodium `crypto_secretstream_xchacha20poly1305` via `dryoc` (pure Rust,
+no C toolchain). Key transforms `none | hex | base64` mirror wal-g;
+`none` requires ≥ 25 bytes so low-entropy keys can't sneak through.
 
-Buckets don't tag objects encrypted-or-not (matches wal-g), so the key
-must stay consistently configured per prefix; mismatch fails loudly on
-first read.
+OpenPGP intentionally unsupported: rPGP pulls dozens of transitives and
+its async wrapper buffers whole payloads, breaking the streaming contract;
+symmetric AEAD already covers the single-tenant on-prem threat model. Any
+`WALG_PGP_*` env var is a hard startup error to prevent silent plaintext
+regressions. Buckets don't tag objects encrypted-or-not (matches wal-g),
+so the key must stay consistently configured per prefix; a mismatch fails
+loudly on first read.
 
 ## Retention & copy
 
-Objects ordered by `(timeline, global_seg_no)` extracted from the
-24-hex segment substring, wal-g's `timelineAndSegmentNoLess`. Permanent
-backups reserve WAL `[(start_lsn-1)/seg_size, (finish_lsn-1)/seg_size]`
-inclusive. `delete` is dry-run by default, `--confirm` executes; the
-plan struct is returned so tests assert without parsing logs.
-`delete target` BFS-walks the increment graph for dependants.
+Objects ordered by `(timeline, global_seg_no)` from the 24-hex segment
+substring (wal-g's `timelineAndSegmentNoLess`). Permanent backups reserve
+WAL `[(start_lsn-1)/seg_size, (finish_lsn-1)/seg_size]` inclusive.
+`delete` is dry-run by default, `--confirm` executes; the plan struct is
+returned so tests assert without parsing logs. `delete target` BFS-walks
+the increment graph for dependants.
 
 `copy` reuses source credentials for the destination URI, stream-through
-for cross-backend; WAL window `[start_seg, finish_seg]` copied with a
-single backup, `--with-history` extends to all WAL ≤ finish_lsn.
+for cross-backend. A single backup's WAL window is `[start_seg,
+finish_seg]`; `--with-history` extends to all WAL ≤ finish_lsn.
 
 ## Daemon
 
 Byte-compatible with wal-g's Unix-socket protocol
 (`[type][u16 BE len][body]`), so `archive_command` can point at either
-tool's daemon-client unchanged. Implemented ops: Check, WalPush,
-WalFetch.
-
-PG's archiver is serial, so a standing `Uploader`
-(`src/daemon/uploader.rs`) keeps a look-ahead pool saturated across
-invocations. Foreground `WalPush(N)` acks only once `N` is durable
-(no early ack), but `N+1..` pre-upload concurrently
-(`lookahead = WALG_UPLOAD_CONCURRENCY - 1`, serial and byte-identical
-at 1). Replaces wal-g's per-invocation `BgUploader` + on-disk marker
-dir with an in-memory inflight/done map deduping foreground pushes
-against look-ahead. See PLAN.md.
+tool's daemon-client unchanged. Implemented ops: Check, WalPush, WalFetch.
+
+PG's archiver is serial, so a standing `Uploader` keeps a look-ahead pool
+saturated across invocations. Foreground `WalPush(N)` acks only once `N`
+is durable; `N+1..` pre-upload concurrently
+(`lookahead = WALG_UPLOAD_CONCURRENCY - 1`, serial and byte-identical at
+1). Replaces wal-g's per-invocation `BgUploader` + on-disk marker dir with
+an in-memory inflight/done map.
 
 ## wal-receive
 
@@ -244,15 +231,3 @@ stay consistent with archive_command pushes. Shutdown finalizes the
 in-flight segment as `<seg>.partial` locally, never uploaded, matching
 `pg_receivewal`. Status updates on a 10 s cadence, immediate on
 server-requested-reply keepalives.
-
-## Dependency budget
-
-Recurring theme: prefer hand-rolling small fixed formats over pulling
-crates. No `regex` (summary filenames + tablespace prefixes are trivial
-decodes), no aws-sdk. `roaring` is the one earned exception (+`bytemuck`,
-both pure-Rust leaves): a stdlib `BTreeSet` can't compress dense deltas,
-so it broke the no-overcommit budget by ~100x on large rewrites (see
-Delta backups). `quick-xml` parses S3 list + multipart responses
-(pull-parser does charset decode + entity unescape, replacing earlier
-hand-rolled string extraction). Single crypto stack on aws-lc-rs
-(rustls provider + GCS RS256), no transitive ring.
diff --git a/docs/WALG_COMPAT.md b/docs/WALG_COMPAT.md
index 370f8be..a68bee5 100644
--- a/docs/WALG_COMPAT.md
+++ b/docs/WALG_COMPAT.md
@@ -10,34 +10,44 @@ the bump PR, not master).
 
 ## Shared on-bucket format
 
-- Key layout version `005`: `wal_005/<segment>[.<ext>]`,
-  `basebackups_005/<name>/tar_partitions/part_NNN.tar.<ext>`,
-  `pg_control.tar.<ext>` tee, sentinel at
-  `basebackups_005/<name>_backup_stop_sentinel.json` (one level above
-  the per-backup dir, same asymmetry as wal-g)
-- Sentinel mirrors `BackupSentinelDtoV2` field-for-field, PascalCase
-  keys, `Spec` for tablespaces; every Option field tolerant-deserializes
-  so sentinels from either tool parse
-- `files_metadata.json` schema (`Files`, `TarFileSets`)
-- Delta naming `base_<24hex>_D_<parent_24hex>`; chain discovered via
-  sentinel `IncrementFrom`, format detected per-file by magic byte, no
-  sentinel format flag (wal-g convention)
-- `wi1` increment format and PG17 native INCREMENTAL format
-  (magic `0xd3ae1f0d`); native layout verified field-by-field against
-  postgres source (`src/common/blkreftable.c`,
-  `src/backend/backup/basebackup.c`,
-  `src/bin/pg_combinebackup/reconstruct.c`)
-- libsodium framing: 24-byte secretstream header, 8 KiB plaintext
-  chunks, 17-byte per-chunk overhead, explicit FINAL chunk on close;
-  a wire-format pin test fails on any drift
-- Prefetch dir layout `pg_wal/.wal-g/prefetch/{running/,}` so a sidecar
-  can run either tool against the same pg_wal
-- Daemon Unix-socket protocol byte format (Check / WalPush / WalFetch)
-- `delete` mode + modifier vocabulary (`before` / `retain` /
-  `everything` / `target` / `garbage`; `FULL`, `FIND_FULL`, `FORCE`,
-  `ARCHIVES`, `BACKUPS`, `--after`), permanent-backup WAL reservation,
-  `--confirm` gate
-- Env vars follow `WALG_*` / `PG*` / `AWS_*` / `GOOGLE_*` naming
+The on-bucket format is wal-g's verbatim, so this doc covers only the
+gaps. Matched without further note: key layout `005`, the
+`BackupSentinelDtoV2` sentinel (PascalCase, tolerant-deserialized so
+either tool's sentinels parse), `files_metadata.json`, delta naming
+(`base_<24hex>_D_<parent_24hex>`, chain via sentinel `IncrementFrom`,
+format magic-detected per file), `wi1` and PG17 native INCREMENTAL
+payloads, libsodium secretstream framing, prefetch dir layout, the
+daemon Unix-socket protocol, the `delete` mode + modifier vocabulary,
+and `WALG_*` / `PG*` / `AWS_*` / `GOOGLE_*` env naming.
+
+## Delta page selection
+
+Both tools emit byte-identical `wi1` / native increments (see above), so a
+delta produced by either restores under either. They diverge only in how the
+producer decides *which* blocks an increment carries.
+
+wal-g defaults to a full scan (`WALG_USE_WAL_DELTA` is false by default): it
+reads every page of every paged relation and ships a page only if the page is
+new (`pd_upper == 0`) or its header LSN is at or past the increment-base LSN
+(`incremental_page_reader.go:SelectNewValidPage`, a predicate lifted from
+PostgreSQL's own page-validity checks and refined on pgsql-hackers). This is
+self-validating: it needs no WAL and re-derives "changed" from each page's own
+header, so a gap in the archived WAL cannot silently drop a changed block.
+Setting `WALG_USE_WAL_DELTA=true` switches wal-g to instead trust a WAL-derived
+changed-block bitmap (file-size gated, no per-page LSN recheck), warning and
+falling back to the full scan when the bitmap can't be loaded
+(`WALG_FORCE_WAL_DELTA` forbids that fallback).
+
+walrus implements only the map-trusting path. `classify_for_delta` ships
+exactly the blocks the changed-block map reports, filtered to blocks within the
+current file size, with no page-LSN recheck. The map is built from WAL
+`<group>_delta` sidecars (raw-WAL walk when a sidecar is missing) or from
+`pg_walsummary` under `--delta-from-wal-summaries`; if it can't be built, walrus
+produces a full backup rather than a scan-based delta. For walrus,
+`WALG_USE_WAL_DELTA` therefore governs only sidecar recording during wal-push;
+backup-push always selects blocks from a WAL/summary map regardless of it. A
+walrus delta is correct only if that map is complete, whereas wal-g's default
+full scan would still catch a missed block by its page LSN.
 
 ## Deliberate divergences
 
@@ -82,7 +92,8 @@ which accepts more connection variables:
 Partial support:
 
 - `PGDATA`: walrus uses it only for daemon path resolution, not as
-  backup-push data directory config
+  backup-push data directory config. `backup-push <PGDATA>` positional
+  syntax matches wal-g CLI behavior
 - `PGHOST`, `PGPORT`: walrus supports single host/port only, not pgx
   multihost semantics
 
@@ -209,78 +220,11 @@ GCE/GKE metadata-server auth is not implemented.
 
 ### Storage backends not implemented
 
-Azure:
-
-- `WALG_AZ_PREFIX`
-- `WALE_AZ_PREFIX`
-- `AZURE_STORAGE_ACCOUNT`
-- `AZURE_STORAGE_ACCESS_KEY`
-- `AZURE_STORAGE_SAS_TOKEN`
-- `AZURE_CLIENT_ID`
-- `AZURE_TENANT_ID`
-- `AZURE_CLIENT_SECRET`
-- `AZURE_ENVIRONMENT_NAME`
-- `AZURE_ENDPOINT_SUFFIX`
-- `AZURE_BUFFER_SIZE`
-- `WALG_AZURE_BUFFER_SIZE`
-- `AZURE_MAX_BUFFERS`
-- `WALG_AZURE_MAX_BUFFERS`
-- `AZURE_TRY_TIMEOUT`
-- `AZURE_BLOB_STORE_API_VERSION`
-
-Alicloud OSS:
-
-- `WALG_OSS_PREFIX`
-- `WALE_OSS_PREFIX`
-- `OSS_ACCESS_KEY_ID`
-- `OSS_ACCESS_KEY_SECRET`
-- `OSS_SESSION_TOKEN`
-- `OSS_ENDPOINT`
-- `OSS_REGION`
-- `OSS_ROLE_ARN`
-- `OSS_ROLE_SESSION_NAME`
-- `OSS_SKIP_VALIDATION`
-- `OSS_MAX_RETRIES`
-- `OSS_CONNECT_TIMEOUT`
-- `OSS_UPLOAD_PART_SIZE`
-- `OSS_COPY_PART_SIZE`
-
-Swift:
-
-- `WALG_SWIFT_PREFIX`
-- `WALE_SWIFT_PREFIX`
-- `OS_AUTH_URL`
-- `OS_USERNAME`
-- `OS_PASSWORD`
-- `OS_TENANT_NAME`
-- `OS_REGION_NAME`
-
-SSH storage:
-
-- `WALG_SSH_PREFIX`
-- `WALE_SSH_PREFIX`
-- `SSH_PORT`
-- `SSH_USERNAME`
-- `SSH_PASSWORD`
-- `SSH_PRIVATE_KEY_PATH`
-
-File storage alias:
-
-- `WALE_FILE_PREFIX`
-
-### Failover storage
-
-- `WALG_FAILOVER_STORAGES`
-- `WALG_FAILOVER_STORAGES_CHECK`
-- `WALG_FAILOVER_STORAGES_CHECK_TIMEOUT`
-- `WALG_FAILOVER_STORAGES_CHECK_SIZE`
-- `WALG_FAILOVER_STORAGES_CACHE_LIFETIME`
-- `WALG_FAILOVER_STORAGES_CACHE_EMA_ALIVE_LIMIT`
-- `WALG_FAILOVER_STORAGES_CACHE_EMA_DEAD_LIMIT`
-- `WALG_FAILOVER_STORAGES_CACHE_EMA_ALPHA_ALIVE_MAX`
-- `WALG_FAILOVER_STORAGES_CACHE_EMA_ALPHA_ALIVE_MIN`
-- `WALG_FAILOVER_STORAGES_CACHE_EMA_ALPHA_DEAD_MAX`
-- `WALG_FAILOVER_STORAGES_CACHE_EMA_ALPHA_DEAD_MIN`
+Azure, Alicloud OSS, Swift, and SSH backends are absent (see the
+divergence table), so all their env vars are unsupported: the `WALG_*` and
+`WALE_*` forms of `AZ_PREFIX` / `OSS_PREFIX` / `SWIFT_PREFIX` / `SSH_PREFIX`,
+plus `AZURE_*`, `WALG_AZURE_*`, `OSS_*`, `OS_*`, and `SSH_*`. Also unsupported:
+the `WALE_FILE_PREFIX` file alias and failover storages (`WALG_FAILOVER_STORAGES*`).
 
 ### Storage aliases
 
diff --git a/src/cli/mod.rs b/src/cli/mod.rs
index 77ee5b4..7224aba 100644
--- a/src/cli/mod.rs
+++ b/src/cli/mod.rs
@@ -106,13 +106,14 @@ pub enum Cmd {
         #[arg(long)]
         target_user_data: Option<String>,
     },
-    /// Take a streaming base backup via the replication BASE_BACKUP protocol
+    /// Take a base backup
     ///
     /// Uses libpq env vars (PGHOST/PGPORT/PGUSER/PGPASSWORD/PGDATABASE).
-    /// Without --pgdata, the sentinel records the server-reported data_directory.
+    /// With PGDATA, reads local filesystem like wal-g. Without PGDATA, streams
+    /// through replication BASE_BACKUP and records server-reported data_directory.
     BackupPush {
-        /// Optional path to local PostgreSQL data directory (sentinel only)
-        #[arg(long)]
+        /// Optional path to local PostgreSQL data directory
+        #[arg(value_name = "PGDATA")]
         pgdata: Option<PathBuf>,
         /// Mark this backup as permanent
         #[arg(long)]
@@ -535,6 +536,18 @@ mod tests {
         assert_eq!(Format::from(IncrementFormatArg::Native), Format::Native);
     }
 
+    #[test]
+    fn backup_push_accepts_positional_pgdata() {
+        let cli = Cli::parse_from(["walrus", "backup-push", "/dat/18/data", "--full"]);
+        match cli.cmd {
+            Cmd::BackupPush { pgdata, full, .. } => {
+                assert_eq!(pgdata, Some(PathBuf::from("/dat/18/data")));
+                assert!(full);
+            }
+            _ => panic!("expected backup-push"),
+        }
+    }
+
     fn worker_threads_of(args: &[&str]) -> usize {
         Cli::parse_from(args).worker_threads().unwrap()
     }
diff --git a/src/compression/mod.rs b/src/compression/mod.rs
index 3ea4cc9..1d0447d 100644
--- a/src/compression/mod.rs
+++ b/src/compression/mod.rs
@@ -6,16 +6,18 @@
 use std::pin::Pin;
 
 use async_compression::Level;
+use async_compression::lz4::{BlockSize, EncoderParams};
 use async_compression::tokio::bufread::{
     BrotliDecoder, BrotliEncoder, GzipDecoder, GzipEncoder, Lz4Decoder, Lz4Encoder, LzmaDecoder,
     LzmaEncoder, ZstdDecoder, ZstdEncoder,
 };
 use thiserror::Error;
-use tokio::io::{AsyncRead, BufReader};
+use tokio::io::{AsyncBufRead, AsyncRead, BufReader};
 
-const BUF_CAPACITY: usize = 64 * 1024;
+const BUF_CAPACITY: usize = 256 * 1024;
 
 pub type AsyncReader = Pin<Box<dyn AsyncRead + Send + Unpin>>;
+pub type AsyncBufReader = Pin<Box<dyn AsyncBufRead + Send + Unpin>>;
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum Method {
@@ -73,35 +75,35 @@ pub enum CompressionError {
 pub fn encode(method: Method, input: AsyncReader, level: i32) -> AsyncReader {
     match method {
         Method::None => input,
-        Method::Zstd => {
-            let buffered = BufReader::with_capacity(BUF_CAPACITY, input);
-            Box::pin(ZstdEncoder::with_quality(buffered, Level::Precise(level)))
-        }
-        Method::Brotli => {
-            let buffered = BufReader::with_capacity(BUF_CAPACITY, input);
-            Box::pin(BrotliEncoder::with_quality(
-                buffered,
-                Level::Precise(brotli_quality(level)),
-            ))
-        }
-        Method::Lz4 => {
-            let buffered = BufReader::with_capacity(BUF_CAPACITY, input);
-            Box::pin(Lz4Encoder::new(buffered))
-        }
-        Method::Lzma => {
-            let buffered = BufReader::with_capacity(BUF_CAPACITY, input);
-            Box::pin(LzmaEncoder::with_quality(
-                buffered,
-                Level::Precise(lzma_preset(level)),
-            ))
-        }
-        Method::Gz => {
-            let buffered = BufReader::with_capacity(BUF_CAPACITY, input);
-            Box::pin(GzipEncoder::with_quality(
-                buffered,
-                Level::Precise(gzip_level(level)),
-            ))
-        }
+        _ => encode_buffered(
+            method,
+            Box::pin(BufReader::with_capacity(BUF_CAPACITY, input)),
+            level,
+        ),
+    }
+}
+
+pub fn encode_buffered(method: Method, input: AsyncBufReader, level: i32) -> AsyncReader {
+    match method {
+        Method::None => Box::pin(input),
+        Method::Zstd => Box::pin(ZstdEncoder::with_quality(input, Level::Precise(level))),
+        Method::Brotli => Box::pin(BrotliEncoder::with_quality(
+            input,
+            Level::Precise(brotli_quality(level)),
+        )),
+        Method::Lz4 => Box::pin(Lz4Encoder::with_quality_and_params(
+            input,
+            Level::Precise(level),
+            EncoderParams::default().block_size(BlockSize::Max256KB),
+        )),
+        Method::Lzma => Box::pin(LzmaEncoder::with_quality(
+            input,
+            Level::Precise(lzma_preset(level)),
+        )),
+        Method::Gz => Box::pin(GzipEncoder::with_quality(
+            input,
+            Level::Precise(gzip_level(level)),
+        )),
     }
 }
 
@@ -194,6 +196,19 @@ mod tests {
         roundtrip(Method::Gz).await;
     }
 
+    #[tokio::test]
+    async fn encode_buffered_matches_encode() {
+        // encode_buffered feeds the codec an AsyncBufRead directly (no internal
+        // BufReader); output must still decode back to the original
+        let original = payload();
+        let buffered: AsyncBufReader = Box::pin(Cursor::new(original.clone()));
+        let enc = encode_buffered(Method::Lz4, buffered, 3);
+        let mut dec = decode(Method::Lz4, enc);
+        let mut out = Vec::new();
+        dec.read_to_end(&mut out).await.unwrap();
+        assert_eq!(out, original);
+    }
+
     #[tokio::test]
     async fn none_passthrough() {
         let mut r = encode(Method::None, reader(b"hello"), 3);
diff --git a/src/config/mod.rs b/src/config/mod.rs
index 26840a3..bbcd23a 100644
--- a/src/config/mod.rs
+++ b/src/config/mod.rs
@@ -60,7 +60,7 @@ pub enum StorageSettings {
 }
 
 impl Default for Settings {
-    /// Convenience defaults: single-worker fs pipeline at zstd-3, no throttling
+    /// Convenience defaults: single-worker fs pipeline at lz4, no throttling
     /// or encryption. Production constructs via [`Settings::from_env`]; this
     /// lets tests vary only the fields they exercise via `..Default::default()`
     fn default() -> Self {
@@ -68,7 +68,7 @@ impl Default for Settings {
             storage: StorageSettings::Fs {
                 path: String::new(),
             },
-            compression: compression::Method::Zstd,
+            compression: compression::Method::Lz4,
             compression_level: 3,
             upload_concurrency: 1,
             upload_queue: 1,
@@ -88,11 +88,11 @@ impl Settings {
     pub fn from_env() -> Result<Self> {
         let storage = detect_storage()?;
         let compression = match std::env::var("WALG_COMPRESSION_METHOD").ok().as_deref() {
-            None => compression::Method::Zstd,
+            None => compression::Method::Lz4,
             Some(s) => compression::Method::from_name(s)
                 .ok_or_else(|| anyhow!("unsupported WALG_COMPRESSION_METHOD={s}"))?,
         };
-        let compression_level = parse_env_int("WALG_COMPRESSION_LEVEL", 3)? as i32;
+        let compression_level = parse_env_int("WALG_COMPRESSION_LEVEL", 1)? as i32;
         let upload_concurrency = upload_concurrency_from_env()?;
         let upload_queue = parse_env_int("WALG_UPLOAD_QUEUE", 2)?.max(1) as usize;
         let download_concurrency = download_concurrency_from_env()?;
diff --git a/src/daemon/uploader.rs b/src/daemon/uploader.rs
index e8efb43..315863e 100644
--- a/src/daemon/uploader.rs
+++ b/src/daemon/uploader.rs
@@ -1,11 +1,14 @@
 //! Standing background WAL uploader for the daemon archive path.
 //!
 //! PG's archiver is serial — it runs `archive_command` for one segment, waits
-//! for success, then the next — so walrus's per-connection `wal-push` is
-//! serial too and `WALG_UPLOAD_CONCURRENCY` is a no-op here: the archiver
-//! falls behind a high WAL rate. wal-g closes the gap with a per-invocation
-//! `BgUploader` (wal-g `internal/databases/postgres/bguploader.go`) that scans
-//! `archive_status/` and uploads look-ahead segments concurrently.
+//! for success, then the next — so a per-connection `wal-push` (one CLI
+//! invocation per segment) is serial too, with no opening for
+//! `WALG_UPLOAD_CONCURRENCY`, and the archiver falls behind a high WAL rate.
+//! wal-g closes the gap with a per-invocation `BgUploader` (wal-g
+//! `internal/databases/postgres/bguploader.go`) that scans `archive_status/`
+//! and uploads look-ahead segments concurrently. This module is walrus's
+//! standing-daemon equivalent, where `WALG_UPLOAD_CONCURRENCY` does take
+//! effect via the look-ahead below.
 //!
 //! Because the walrus daemon is one long-lived process, bookkeeping stays
 //! in-memory: a shared `inflight` map dedups foreground pushes against
diff --git a/src/main.rs b/src/main.rs
index 37f8b0d..bd7e004 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -17,6 +17,7 @@ fn main() -> ExitCode {
 
 fn run(cli: walrus::cli::Cli) -> anyhow::Result<()> {
     let threads = cli.worker_threads()?;
+    cap_malloc_arenas(threads);
     // current_thread when 1: no worker threads, single glibc malloc arena
     // (see docs/DESIGN.md Runtime)
     let mut builder = if threads > 1 {
@@ -37,3 +38,19 @@ fn run(cli: walrus::cli::Cli) -> anyhow::Result<()> {
         .build()?
         .block_on(cli.run())
 }
+
+/// Cap glibc malloc arenas at `n` (the caller passes the runtime worker-thread
+/// count). glibc otherwise grows to 8*ncpu arenas, each reserving a 64 MiB heap
+/// by mmap; once the multi-thread runtime drives concurrent allocation that
+/// inflates virtual memory far past the resident set. Capping keeps VSZ bounded
+/// without measurably hurting throughput. Must run before any worker spawns
+#[cfg(all(target_os = "linux", target_env = "gnu"))]
+fn cap_malloc_arenas(n: usize) {
+    // SAFETY: mallopt is thread-safe; called once on the main thread pre-runtime
+    unsafe {
+        libc::mallopt(libc::M_ARENA_MAX, n as libc::c_int);
+    }
+}
+
+#[cfg(not(all(target_os = "linux", target_env = "gnu")))]
+fn cap_malloc_arenas(_n: usize) {} // no-op: M_ARENA_MAX is a glibc-only tunable
diff --git a/src/pg/backup/fetch.rs b/src/pg/backup/fetch.rs
index 12b0c0e..7f2a197 100644
--- a/src/pg/backup/fetch.rs
+++ b/src/pg/backup/fetch.rs
@@ -11,6 +11,7 @@ use std::sync::Arc;
 
 use anyhow::{Context, Result, anyhow, bail};
 use futures::StreamExt;
+use tokio_tar::Archive;
 use tokio_util::io::SyncIoBridge;
 
 use crate::compression;
@@ -331,108 +332,133 @@ async fn unpack_part(
     let throttled = settings.throttle_network(body);
     let decrypted = settings.decrypt(throttled);
     let decoded = compression::decode(method, decrypted);
-    let dst: PathBuf = dst.to_path_buf();
 
-    let res: std::io::Result<()> = tokio::task::spawn_blocking(move || {
-        let sync_r = SyncIoBridge::new(decoded);
-        let mut archive = tar::Archive::new(sync_r);
-        unpack_manual(&mut archive, &dst, &incremented)
-    })
-    .await
-    .context("tar unpack join")?;
-    res.with_context(|| format!("unpack {key}"))?;
+    let mut archive = Archive::new(decoded);
+    let mut entries = archive.entries().context("open tar entries")?;
+    while let Some(entry) = entries.next().await {
+        let entry = entry.context("read tar entry")?;
+        unpack_entry(entry, dst, &incremented)
+            .await
+            .with_context(|| format!("unpack {key}"))?;
+    }
     tracing::info!(target = "backup_fetch", "unpacked {key}");
     Ok(())
 }
 
-/// Manual tar extraction without the `tar` crate's "stays inside dst"
-/// canonicalization check. PG restores legitimately need to follow
-/// `pg_tblspc/<oid>` symlinks that point outside `dst` — the safe-extract
-/// behavior in `tar::Archive::unpack` refuses that
-fn unpack_manual<R: std::io::Read>(
-    archive: &mut tar::Archive<R>,
+/// Restore one tar entry. PG restores legitimately follow `pg_tblspc/<oid>`
+/// symlinks pointing outside `dst`, so we skip the tar crate's "stays inside
+/// dst" canonicalization. File bodies bridge to a `spawn_blocking` apply path
+/// because `apply_increment_in_place` needs `Seek`
+async fn unpack_entry<R>(
+    entry: tokio_tar::Entry<R>,
     dst: &Path,
     incremented: &HashSet<String>,
-) -> std::io::Result<()> {
-    use std::io::Write;
-
-    for entry in archive.entries()? {
-        let mut entry = entry?;
-        let path = entry.path()?.into_owned();
-        // Skip absolute / parent-dir traversals
-        let rel = strip_to_relative(&path);
-        if rel.as_os_str().is_empty() {
-            continue;
+) -> Result<()>
+where
+    R: tokio::io::AsyncRead + Unpin + Send + 'static,
+{
+    let path = entry.path().context("entry path")?.into_owned();
+    // Skip absolute / parent-dir traversals
+    let rel = strip_to_relative(&path);
+    if rel.as_os_str().is_empty() {
+        return Ok(());
+    }
+    let target = dst.join(&rel);
+    let header = entry.header().clone();
+    let etype = header.entry_type();
+    if let Some(parent) = target.parent() {
+        tokio::fs::create_dir_all(parent).await?;
+    }
+    if etype.is_dir() {
+        return tokio::fs::create_dir(&target).await.or_else(|e| {
+            if e.kind() == std::io::ErrorKind::AlreadyExists {
+                Ok(())
+            } else {
+                Err(e.into())
+            }
+        });
+    }
+    if etype.is_symlink() {
+        // pg_tblspc/<oid> links are restored up-front from the sentinel
+        // TablespaceSpec (mapping-aware) before any part unpacks. Recreating
+        // them from a part entry would race the concurrent data fan-out — its
+        // remove+recreate window vs another part materializing the link's
+        // pg_tblspc/<oid>/... contents — and would clobber a
+        // --tablespace-mapping relocation with the archived (backup-time)
+        // target. PG basebackup emits symlinks only under pg_tblspc, so the
+        // sentinel link is authoritative; skip the entry
+        if rel.parent() == Some(Path::new("pg_tblspc")) {
+            return Ok(());
         }
-        let target = dst.join(&rel);
-        let header = entry.header().clone();
-        let etype = header.entry_type();
-        if let Some(parent) = target.parent() {
-            std::fs::create_dir_all(parent)?;
+        #[cfg(unix)]
+        {
+            let link = header
+                .link_name()
+                .context("symlink target")?
+                .ok_or_else(|| anyhow!("symlink without target"))?;
+            // best-effort overwrite
+            let _ = tokio::fs::remove_file(&target).await;
+            tokio::fs::symlink(link.as_ref(), &target).await?;
         }
-        if etype.is_dir() {
-            match std::fs::create_dir(&target) {
-                Ok(()) => {}
-                Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {}
-                Err(e) => return Err(e),
-            }
-        } else if etype.is_symlink() {
-            #[cfg(unix)]
-            {
-                let link = header.link_name()?.ok_or_else(|| {
-                    std::io::Error::new(std::io::ErrorKind::InvalidData, "symlink without target")
+        return Ok(());
+    }
+    // ignore fifo, char/block devices — none appear in a PG basebackup. Hard
+    // links are treated like regular files (basebackup emits none)
+    if !(etype.is_file() || etype.is_hard_link()) {
+        return Ok(());
+    }
+
+    let path_key = rel.to_string_lossy().into_owned();
+    let is_increment = incremented.contains(&path_key);
+    let target = target.clone();
+    let mode = header.mode().ok();
+    let bridge = SyncIoBridge::new(entry);
+    tokio::task::spawn_blocking(move || -> std::io::Result<()> {
+        use std::io::Write;
+        let mut bridge = bridge;
+        if is_increment {
+            // Increment path: apply onto whatever the earlier chain step left
+            // in place. The target must already exist (chain root wrote the
+            // full file). open() in r+w (not truncate)
+            let mut f = std::fs::OpenOptions::new()
+                .read(true)
+                .write(true)
+                .open(&target)
+                .map_err(|e| {
+                    std::io::Error::new(
+                        e.kind(),
+                        format!("apply increment {path_key}: open target: {e}"),
+                    )
                 })?;
-                // best-effort overwrite
-                let _ = std::fs::remove_file(&target);
-                std::os::unix::fs::symlink(link.as_ref(), &target)?;
-            }
-        } else if etype.is_file() || etype.is_hard_link() {
-            // ignore hard links to keep this simple; PG basebackup doesn't emit any
-            let path_key = rel.to_string_lossy().into_owned();
-            if incremented.contains(&path_key) {
-                // Increment path: apply onto whatever the earlier chain step
-                // left in place. The target must already exist (chain root
-                // wrote the full file). open() in r+w (not truncate)
-                let mut f = std::fs::OpenOptions::new()
-                    .read(true)
-                    .write(true)
-                    .open(&target)
-                    .map_err(|e| {
-                        std::io::Error::new(
-                            e.kind(),
-                            format!("apply increment {path_key}: open target: {e}"),
-                        )
-                    })?;
-                let (final_size, _, _) =
-                    apply_increment_in_place(&mut entry, &mut f).map_err(|e| {
-                        std::io::Error::new(
-                            std::io::ErrorKind::InvalidData,
-                            format!("apply increment {path_key}: {e}"),
-                        )
-                    })?;
-                f.set_len(final_size)?;
-                f.flush()?;
-            } else {
-                let mut f = std::fs::OpenOptions::new()
-                    .write(true)
-                    .create(true)
-                    .truncate(true)
-                    .open(&target)?;
-                std::io::copy(&mut entry, &mut f)?;
-                f.flush()?;
-            }
-            #[cfg(unix)]
-            {
-                use std::os::unix::fs::PermissionsExt;
-                if let Ok(mode) = header.mode() {
-                    let _ =
-                        std::fs::set_permissions(&target, std::fs::Permissions::from_mode(mode));
-                }
+            let (final_size, _, _) =
+                apply_increment_in_place(&mut bridge, &mut f).map_err(|e| {
+                    std::io::Error::new(
+                        std::io::ErrorKind::InvalidData,
+                        format!("apply increment {path_key}: {e}"),
+                    )
+                })?;
+            f.set_len(final_size)?;
+            f.flush()?;
+        } else {
+            let mut f = std::fs::OpenOptions::new()
+                .write(true)
+                .create(true)
+                .truncate(true)
+                .open(&target)?;
+            std::io::copy(&mut bridge, &mut f)?;
+            f.flush()?;
+        }
+        #[cfg(unix)]
+        {
+            use std::os::unix::fs::PermissionsExt;
+            if let Some(mode) = mode {
+                let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(mode));
             }
         }
-        // entry types we don't restore: hard links, fifo, char/block devices —
-        // none appear in a PG basebackup
-    }
+        Ok(())
+    })
+    .await
+    .context("unpack file join")??;
     Ok(())
 }
 
diff --git a/src/pg/backup/fs_push.rs b/src/pg/backup/fs_push.rs
new file mode 100644
index 0000000..816713c
--- /dev/null
+++ b/src/pg/backup/fs_push.rs
@@ -0,0 +1,1341 @@
+//! backup-push from a local data directory (wal-g-style filesystem source)
+//!
+//! Walks `$PGDATA`, packs files into tar parts across N concurrent workers —
+//! each worker streams one part through compression to S3 — and brackets the
+//! copy with `pg_backup_start` / `pg_backup_stop` over a non-replication SQL
+//! session. Output layout matches the BASE_BACKUP path (`tar_partitions/
+//! part_NNN.tar.<ext>`, `pg_control.tar.<ext>`, files_metadata.json, sentinel,
+//! metadata) so backup-fetch is identical
+//!
+//! Concurrency is the throughput win over the single-stream BASE_BACKUP path:
+//! `WALG_UPLOAD_CONCURRENCY` parts pack + compress + upload simultaneously, so
+//! several S3 connections and CPU cores run at once instead of one
+
+use std::collections::HashMap;
+use std::os::unix::fs::PermissionsExt;
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+use std::time::UNIX_EPOCH;
+
+use anyhow::{Context, Result, anyhow, bail};
+use bytes::Bytes;
+use chrono::Utc;
+use tokio::io::{AsyncWriteExt, BufReader};
+use tokio::sync::{Mutex, mpsc};
+use tokio_tar::{Builder, EntryType, Header};
+
+use crate::compression::{self, AsyncBufReader, AsyncReader};
+use crate::config::Settings;
+use crate::pg::backup::delta;
+use crate::pg::backup::increment::Format as IncrementFormat;
+use crate::pg::backup::push::{self, Finalize, PushArgs};
+use crate::pg::backup::tar_streamer::{
+    DeltaClass, DeltaContext, IncrementBodyReader, PartWriter, classify_for_delta,
+};
+use crate::pg::backup::{
+    BACKUP_NAME_PREFIX, FileDescription, TablespaceSpec, format_backup_name, format_pg_lsn,
+    parse_pg_lsn, tar_part_key,
+};
+use crate::pg::replication::PgConfig;
+use crate::pg::replication::base_backup::ChannelReader;
+use crate::pg::replication::conn::ReplicationConn;
+use crate::storage::DynStorage;
+
+const PG_CONTROL_ENTRY: &str = "global/pg_control";
+
+/// Coalesce file-body reads. tokio_tar copies each body through io::copy's 8 KB
+/// buffer, and every tokio::fs::File read is a blocking-pool dispatch; reading a
+/// multi-GB relation in 8 KB units floods the pool and bounds single-stream
+/// throughput. A BufReader turns ~CAP/8KB dispatches into one. 256 KB is the knee
+/// (matches CHUNK_BYTES); peak resident is CAP × upload_concurrency (one open
+/// file per packer)
+const FILE_READ_BUF: usize = 256 * 1024;
+
+/// Filenames dropped from the copy, matched by basename anywhere in the tree.
+/// Mirrors wal-g's `ExcludedFilenames` plus `pg_internal.init` / `recovery.signal`
+/// (which pgbackrest also drops). Directories appear as empty entries (recreated
+/// on restore) but aren't recursed; files are dropped entirely. `pg_control` is
+/// handled separately (tee'd into `pg_control.tar`)
+const EXCLUDED: &[&str] = &[
+    "log",
+    "pg_log",
+    "pg_xlog",
+    "pg_wal",
+    "pgsql_tmp",
+    "postgresql.auto.conf.tmp",
+    "postmaster.pid",
+    "postmaster.opts",
+    "recovery.conf",
+    "pg_dynshmem",
+    "pg_notify",
+    "pg_replslot",
+    "pg_serial",
+    "pg_stat_tmp",
+    "pg_snapshots",
+    "pg_subtrans",
+    "pg_internal.init",
+    "standby.signal",
+    "recovery.signal",
+];
+
+/// True when `path` holds `PG_VERSION` or `global/pg_control`, ie looks like a local PG data dir
+pub fn is_pgdata_dir(path: &Path) -> bool {
+    let has = |marker: &str| path.join(marker).is_file();
+    has("PG_VERSION") || has("global/pg_control")
+}
+
+#[derive(Clone)]
+enum EntryKind {
+    Dir, // packed as a zero-length tar directory entry
+    File, // packed with a body, after delta classification
+}
+
+#[derive(Clone)]
+struct WalkEntry {
+    kind: EntryKind,
+    /// path inside the tar (relative to the data dir; tablespaces remapped
+    /// under `pg_tblspc/<oid>/`)
+    tar_path: String,
+    /// on-disk path from the walk; set for dirs too, but only files are opened
+    abs: PathBuf,
+    /// size recorded at stat time (0 for dirs); the body is padded/truncated to match
+    size: u64,
+    mode: u32, // mode bits from `PermissionsExt::mode`, copied into the tar header
+    mtime: i64, // whole seconds since the Unix epoch; 0 when unavailable
+}
+
+/// Walk results not carried in the entry stream: tablespace list, pg_control
+/// path, and the entry count for the post-walk log
+struct WalkMeta {
+    /// (oid, symlink target) for each numeric `pg_tblspc/<oid>` symlink
+    tablespaces: Vec<(u32, String)>,
+    pg_control: Option<PathBuf>, // on-disk path of global/pg_control, if the walk saw it
+    entry_count: usize, // dir + file entries handed to the batcher
+}
+
+/// Groups walked entries into batches of roughly `tar_size` bytes and hands each
+/// finished batch to the packers with `blocking_send`. A batch is rotated before
+/// an entry that would push a non-empty batch past `tar_size`, and again as soon
+/// as it reaches `tar_size`, so a lone oversize entry ships by itself. It runs
+/// inside `spawn_blocking`: the bounded channel backpressures the walk when the
+/// packers fall behind, capping resident entries instead of materializing the
+/// whole tree
+struct Batcher {
+    tar_size: u64,
+    tx: mpsc::Sender<Vec<WalkEntry>>,
+    cur: Vec<WalkEntry>, // batch currently being filled
+    cur_size: u64, // summed sizes of `cur`
+    count: usize, // entries pushed so far, across all batches
+}
+
+impl Batcher {
+    fn new(tar_size: u64, tx: mpsc::Sender<Vec<WalkEntry>>) -> Self {
+        Self {
+            tar_size,
+            tx,
+            cur: Vec::new(),
+            cur_size: 0,
+            count: 0,
+        }
+    }
+
+    fn push(&mut self, e: WalkEntry) -> Result<()> {
+        let incoming = e.size;
+        if !self.cur.is_empty() && self.cur_size.saturating_add(incoming) > self.tar_size {
+            self.flush()?;
+        }
+        self.cur.push(e);
+        self.count += 1;
+        self.cur_size = self.cur_size.saturating_add(incoming);
+        if self.cur_size < self.tar_size {
+            return Ok(());
+        }
+        self.flush()
+    }
+
+    fn flush(&mut self) -> Result<()> {
+        if self.cur.is_empty() {
+            return Ok(());
+        }
+        let batch = std::mem::take(&mut self.cur);
+        self.cur_size = 0;
+        let sent = self.tx.blocking_send(batch);
+        sent.map_err(|_| anyhow!("pack workers dropped before walk completed"))
+    }
+}
+
+/// Sink threaded through the recursive walk: batches entries, records
+/// tablespaces and the pg_control path
+struct WalkSink {
+    batcher: Batcher,
+    tablespaces: Vec<(u32, String)>, // (oid, symlink target) per pg_tblspc/<oid> link
+    pg_control: Option<PathBuf>, // set when the walk reaches global/pg_control
+}
+
+#[derive(Default)]
+struct WorkerResult {
+    files: HashMap<String, FileDescription>, // tar path -> per-file metadata
+    tar_file_sets: HashMap<String, Vec<String>>, // part name -> tar paths written into it
+    compressed: i64, // bytes counted on the upload side, summed over parts
+    uncompressed: i64, // raw tar bytes written, summed over parts
+    max_file_no: u32, // highest part number this worker claimed
+}
+
+/// Runs backup-push from a local data directory: walk, pack + upload in parallel, finalize
+pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> Result<()> {
+    let start_time = Utc::now();
+    let pgdata = args
+        .pgdata
+        .clone()
+        .ok_or_else(|| anyhow!("filesystem backup-push requires local PGDATA"))?;
+
+    // Resolve a delta parent unless --full (matches BASE_BACKUP path)
+    let parent = if args.full {
+        None
+    } else {
+        delta::configure_delta_parent(&storage, &settings.delta, args.is_permanent).await?
+    };
+    let increment_format = args.increment_format;
+    if let Some(p) = parent.as_ref()
+        && let Some(parent_fmt) = p.parent_increment_format
+        && parent_fmt != increment_format
+    {
+        bail!(
+            "increment format mismatch: delta parent {} uses {parent_fmt:?} but \
+             --increment-format requests {increment_format:?}; a chain must use one \
+             format end-to-end (match the parent, or pass --full for a new chain)",
+            p.name,
+        );
+    }
+
+    let cfg = PgConfig::from_env()?;
+    tracing::info!(
+        target = "backup_push",
+        "filesystem backup-push from {} (connecting to {}:{} as {})",
+        pgdata.display(),
+        cfg.host,
+        cfg.port,
+        cfg.user,
+    );
+    let mut conn = ReplicationConn::connect_with(&cfg, false).await?;
+    let pg_version = conn.server_pg_version();
+    let system_identifier = query_u64(
+        &mut conn,
+        "SELECT system_identifier FROM pg_control_system()",
+    )
+    .await
+    .context("read system_identifier")?;
+    let timeline =
+        query_u64(&mut conn, "SELECT timeline_id FROM pg_control_checkpoint()").await? as u32;
+    let data_directory = pgdata
+        .canonicalize()
+        .unwrap_or_else(|_| pgdata.clone())
+        .display()
+        .to_string();
+
+    if args.delta_from_wal_summaries {
+        if pg_version < 170000 {
+            bail!(
+                "--delta-from-wal-summaries requires PostgreSQL 17 or newer (server reports {pg_version})"
+            );
+        }
+        let on = show_setting(&mut conn, "summarize_wal").await?;
+        if on.trim() != "on" {
+            bail!("--delta-from-wal-summaries requires summarize_wal=on on the server");
+        }
+    }
+
+    // pg_backup_start brackets the copy; the session must stay open until stop
+    let label = format!("walrus {}", Utc::now().format("%Y%m%dT%H%M%SZ"));
+    let start_lsn = backup_start(&mut conn, pg_version, &label, args.fast_checkpoint).await?;
+    tracing::info!(
+        target = "backup_push",
+        "pg_backup_start: lsn={} timeline={}",
+        format_pg_lsn(start_lsn),
+        timeline,
+    );
+
+    let seg_size = crate::pg::wal::segment::wal_segment_size();
+    let base_name = format_backup_name(timeline, start_lsn, seg_size);
+    let backup_name = match parent.as_ref() {
+        Some(p) => format!(
+            "{base_name}_D_{}",
+            p.name.strip_prefix(BACKUP_NAME_PREFIX).unwrap_or(&p.name),
+        ),
+        None => base_name.clone(),
+    };
+
+    // Build the delta map now that the upper LSN bound is known. Failure drops
+    // to a full backup (wal-g semantics: a partial delta is worse than a full)
+    let delta_context = build_delta_context(
+        settings,
+        &storage,
+        parent.as_ref(),
+        &args,
+        increment_format,
+        pgdata.as_path(),
+        timeline,
+        start_lsn,
+    )
+    .await;
+
+    let tar_size = if args.tar_size_threshold == 0 {
+        crate::pg::backup::tar_streamer::DEFAULT_TAR_SIZE_THRESHOLD
+    } else {
+        args.tar_size_threshold
+    };
+
+    let n_workers = settings.upload_concurrency.max(1);
+    tracing::info!(
+        target = "backup_push",
+        "packing with upload_concurrency={}",
+        n_workers,
+    );
+
+    // Stream the walk into a bounded batch channel instead of materializing every
+    // WalkEntry resident first. The metadata-only walk far outruns packing, so an
+    // unbounded handoff would hold the whole entry list in memory; channel depth =
+    // worker count, so blocking_send backpressures the walk and packing overlaps it
+    let (batch_tx, batch_rx) = mpsc::channel::<Vec<WalkEntry>>(n_workers);
+    let walk_pgdata = pgdata.clone();
+    let walk_task =
+        tokio::task::spawn_blocking(move || walk_data_dir(&walk_pgdata, tar_size, batch_tx));
+
+    // Concurrent packing: N workers steal batches off the shared receiver, each streaming
+    // one part through compression to S3. A JoinSet ensures that if one worker fails,
+    // dropping the set aborts the rest (and each aborted worker aborts its in-flight
+    // upload via AbortOnDrop) — nothing keeps touching PGDATA / S3 after this returns
+    // and the backup session closes. Dropping every receiver clone also unblocks the
+    // walk's blocking_send, ending the producer
+    let batch_rx = Arc::new(Mutex::new(batch_rx));
+    let counter = Arc::new(AtomicU32::new(0));
+    let mut set: tokio::task::JoinSet<Result<WorkerResult>> = tokio::task::JoinSet::new();
+    for _ in 0..n_workers {
+        let batch_rx = batch_rx.clone();
+        let counter = counter.clone();
+        let settings = settings.clone();
+        let storage = storage.clone();
+        let backup_name = backup_name.clone();
+        let delta_context = delta_context.clone();
+        set.spawn(async move {
+            pack_worker(
+                batch_rx,
+                counter,
+                settings,
+                storage,
+                backup_name,
+                delta_context,
+            )
+            .await
+        });
+    }
+
+    let mut all_files: HashMap<String, FileDescription> = HashMap::new();
+    let mut tar_file_sets: HashMap<String, Vec<String>> = HashMap::new();
+    let mut compressed_size: i64 = 0;
+    let mut uncompressed_size: i64 = 0;
+    let mut max_file_no: u32 = 0;
+    while let Some(joined) = set.join_next().await {
+        let r = joined.context("pack worker join")??;
+        all_files.extend(r.files);
+        for (k, v) in r.tar_file_sets {
+            tar_file_sets.entry(k).or_default().extend(v);
+        }
+        compressed_size += r.compressed;
+        uncompressed_size += r.uncompressed;
+        max_file_no = max_file_no.max(r.max_file_no);
+    }
+
+    // Producer closed the channel once the walk finished, so every worker has
+    // drained and exited by here; its tablespace list & pg_control path are final
+    let walk = walk_task.await.context("walk join")??;
+    let pg_control = walk.pg_control;
+    let tablespaces = walk.tablespaces;
+    tracing::info!(
+        target = "backup_push",
+        "walked {} entries, {} tablespace(s)",
+        walk.entry_count,
+        tablespaces.len(),
+    );
+
+    // pg_control tee → pg_control.tar (applied last on restore). BASE_BACKUP
+    // counts pg_control inline in its archive stream; here it never enters a
+    // data part, so add the tee tar bytes to keep uncompressed_size consistent
+    let pg_control_tee = match pg_control {
+        Some(abs) => Some(build_pg_control_tar(&abs).await?),
+        None => None,
+    };
+    if let Some(tee) = pg_control_tee.as_ref() {
+        uncompressed_size += tee.len() as i64;
+    }
+
+    // pg_backup_stop: end LSN + non-exclusive backup_label / tablespace_map
+    let (end_lsn, labelfile, spcmapfile) = backup_stop(&mut conn, pg_version).await?;
+    tracing::info!(
+        target = "backup_push",
+        "pg_backup_stop at {}",
+        format_pg_lsn(end_lsn)
+    );
+
+    // backup_label (+ tablespace_map) ship as a final part so restore writes
+    // them into the data dir; they don't exist on disk in non-exclusive backup
+    let label_file_no = counter.fetch_add(1, Ordering::SeqCst) + 1;
+    max_file_no = max_file_no.max(label_file_no);
+    let part_name = format!("part_{label_file_no:03}.tar");
+    let mut label_entries: Vec<(&str, &str)> = vec![("backup_label", labelfile.as_str())];
+    if !spcmapfile.trim().is_empty() {
+        label_entries.push(("tablespace_map", spcmapfile.as_str()));
+    }
+    let label_tar = build_small_tar(&label_entries).await?;
+    let key = tar_part_key(
+        &backup_name,
+        label_file_no,
+        settings.compression.extension(),
+    );
+    uncompressed_size += label_tar.len() as i64;
+    compressed_size += upload_bytes(settings, &storage, &key, label_tar).await? as i64;
+    let now = Utc::now();
+    for (name, _) in &label_entries {
+        all_files.insert(
+            (*name).to_string(),
+            FileDescription {
+                is_incremented: false,
+                is_skipped: false,
+                mtime: now,
+                updates_count: 0,
+            },
+        );
+        tar_file_sets
+            .entry(part_name.clone())
+            .or_default()
+            .push((*name).to_string());
+    }
+
+    let tablespace_spec = if tablespaces.is_empty() {
+        None
+    } else {
+        let mut spec = TablespaceSpec::new(&data_directory);
+        for (oid, location) in &tablespaces {
+            spec.add(*oid, location);
+        }
+        Some(spec)
+    };
+
+    push::finalize_backup(Finalize {
+        settings,
+        storage: &storage,
+        backup_name,
+        start_lsn,
+        end_lsn,
+        pg_version,
+        system_identifier,
+        uncompressed_size,
+        compressed_size,
+        data_directory,
+        tablespace_spec,
+        tablespace_count: tablespaces.len(),
+        all_files,
+        tar_file_sets,
+        pg_control_tee,
+        parent: parent.as_ref(),
+        delta_context: delta_context.as_ref(),
+        args: &args,
+        start_time,
+        part_count: max_file_no,
+    })
+    .await
+}
+
+/// One packing worker: repeatedly steals a pre-batched part off the shared
+/// receiver and packs it into a single part streamed to S3, until the producer
+/// closes the channel
+async fn pack_worker(
+    batch_rx: Arc<Mutex<mpsc::Receiver<Vec<WalkEntry>>>>,
+    counter: Arc<AtomicU32>,
+    settings: Settings,
+    storage: DynStorage,
+    backup_name: String,
+    delta_context: Option<DeltaContext>,
+) -> Result<WorkerResult> {
+    let mut res = WorkerResult::default();
+    loop {
+        // The lock is held across recv(), so idle workers queue on the mutex while
+        // one waits on the channel; a closed channel (walk done) yields None
+        let batch = {
+            let mut rx = batch_rx.lock().await;
+            rx.recv().await
+        };
+        let Some(batch) = batch else { break };
+        if batch.is_empty() {
+            continue;
+        }
+        let file_no = counter.fetch_add(1, Ordering::SeqCst) + 1;
+        res.max_file_no = res.max_file_no.max(file_no);
+        let part_name = format!("part_{file_no:03}.tar");
+        let key = tar_part_key(&backup_name, file_no, settings.compression.extension());
+
+        // part bytes stream through the channel to a concurrent upload task
+        let (byte_tx, byte_rx) = mpsc::channel::<std::io::Result<Bytes>>(4);
+        let reader = ChannelReader::new(byte_rx);
+        let upload = tokio::spawn(upload_part(reader, key, settings.clone(), storage.clone()));
+
+        let counter_bytes = Arc::new(AtomicU64::new(0));
+        let mut builder = Builder::new(PartWriter::new(byte_tx, counter_bytes.clone()));
+        // Abort the upload if this worker errors or is cancelled before the part
+        // is fully written, so it can't keep reading PGDATA / uploading after
+        // backup-push has returned. Declared after `builder` so on drop it aborts
+        // before the part channel closes (no finalize of a partial object)
+        let upload = AbortOnDrop::new(upload);
+        for e in &batch {
+            let written = append_entry(&mut builder, e, &delta_context, &mut res).await?;
+            if written {
+                res.tar_file_sets
+                    .entry(part_name.clone())
+                    .or_default()
+                    .push(e.tar_path.clone());
+            }
+        }
+        builder.finish().await.context("finish part")?;
+        let mut writer = builder.into_inner().await.context("into_inner part")?;
+        writer.shutdown().await.context("flush part")?;
+        // Drop the writer (and its PollSender) to close the channel so the
+        // upload's ChannelReader sees EOF; shutdown only flushes, it doesn't
+        // close. Without this the upload never completes and the worker hangs
+        drop(writer);
+
+        // Count real tar bytes (headers, padding, dir entries), matching the
+        // BASE_BACKUP path which counts its whole input archive stream rather
+        // than logical file bodies
+        res.uncompressed += counter_bytes.load(Ordering::Relaxed) as i64;
+        res.compressed += upload.disarm().await.context("upload join")?? as i64;
+    }
+    Ok(res)
+}
+
+/// Packs one walked entry into `builder` and records its per-file metadata in
+/// `res`. Directories become header-only entries and are not recorded. Yields
+/// `true` when a tar entry was written and `false` when nothing was: the file
+/// was delta-skipped, or it vanished before it could be opened (in which case
+/// it is not recorded either)
+async fn append_entry(
+    builder: &mut Builder<PartWriter>,
+    e: &WalkEntry,
+    delta_context: &Option<DeltaContext>,
+    res: &mut WorkerResult,
+) -> Result<bool> {
+    // directories carry no body and never reach delta classification
+    if let EntryKind::Dir = e.kind {
+        let mut h = header(e, EntryType::Directory, 0);
+        builder
+            .append_data(&mut h, &e.tar_path, tokio::io::empty())
+            .await
+            .with_context(|| format!("append dir {}", e.tar_path))?;
+        return Ok(true);
+    }
+
+    // every recorded file carries the walk-time mtime and a zero update count;
+    // only the two delta flags differ between the classes below
+    let mut record = |is_incremented: bool, is_skipped: bool| {
+        let desc = FileDescription {
+            is_incremented,
+            is_skipped,
+            mtime: mtime_dt(e.mtime),
+            updates_count: 0,
+        };
+        res.files.insert(e.tar_path.clone(), desc);
+    };
+
+    let written = match classify_for_delta(delta_context, &e.tar_path, e.size) {
+        // delta-skipped: metadata only, nothing enters the tar
+        DeltaClass::Skip => {
+            record(false, true);
+            false
+        }
+        // incremental body: the header claims `total_size`, while the reader is
+        // handed the stat-time size of the file it draws blocks from
+        DeltaClass::Increment {
+            header_bytes,
+            blocks,
+            total_size,
+        } => {
+            // vanished since the walk: write nothing, record nothing
+            let Some(mut file) = open_walked(&e.abs).await? else {
+                return Ok(false);
+            };
+            let mut h = header(e, EntryType::Regular, total_size);
+            let body = IncrementBodyReader::new(header_bytes, &mut file, blocks, e.size);
+            builder
+                .append_data(&mut h, &e.tar_path, body)
+                .await
+                .with_context(|| format!("append increment {}", e.tar_path))?;
+            record(true, false);
+            true
+        }
+        // full copy: the header is sized to the stat-time length and
+        // FixedSizeReader is given that same size
+        DeltaClass::Passthrough => {
+            // vanished since the walk: write nothing, record nothing
+            let Some(file) = open_walked(&e.abs).await? else {
+                return Ok(false);
+            };
+            let body = FixedSizeReader::new(file, e.size);
+            let mut h = header(e, EntryType::Regular, e.size);
+            builder
+                .append_data(&mut h, &e.tar_path, body)
+                .await
+                .with_context(|| format!("append {}", e.tar_path))?;
+            record(false, false);
+            true
+        }
+    };
+    Ok(written)
+}
+
+/// Opens a walked file behind a `FILE_READ_BUF`-byte `BufReader`. ENOENT means it
+/// vanished after the walk (eg DROP TABLE unlinked it): that is logged and yields
+/// `None` so the caller omits the file, the removal being replayed from WAL on restore
+async fn open_walked(abs: &Path) -> Result<Option<BufReader<tokio::fs::File>>> {
+    let err = match tokio::fs::File::open(abs).await {
+        Ok(f) => return Ok(Some(BufReader::with_capacity(FILE_READ_BUF, f))),
+        Err(err) => err,
+    };
+    if err.kind() != std::io::ErrorKind::NotFound {
+        return Err(err).with_context(|| format!("open {}", abs.display()));
+    }
+    tracing::warn!(
+        target = "backup_push",
+        "{} vanished during backup; skipping",
+        abs.display(),
+    );
+    Ok(None)
+}
+
+fn header(e: &WalkEntry, kind: EntryType, size: u64) -> Header {
+    let mut hdr = Header::new_gnu();
+    hdr.set_entry_type(kind);
+    hdr.set_size(size);
+    hdr.set_mode(e.mode);
+    hdr.set_mtime(u64::try_from(e.mtime).unwrap_or(0)); // pre-epoch mtime clamps to 0
+    hdr
+}
+
+fn mtime_dt(secs: i64) -> chrono::DateTime<Utc> {
+    chrono::DateTime::<Utc>::from_timestamp(secs, 0)
+        .unwrap_or_default() // out-of-range seconds fall back to the Unix epoch
+}
+
+/// Guard over a spawned task's `JoinHandle`: dropping it aborts the task unless
+/// `disarm` took the handle back first. Keeps a per-part upload from outliving
+/// its worker (on error or cancellation) and uploading after backup-push returned
+struct AbortOnDrop<T>(Option<tokio::task::JoinHandle<T>>);
+
+impl<T> AbortOnDrop<T> {
+    fn new(handle: tokio::task::JoinHandle<T>) -> Self {
+        AbortOnDrop(Some(handle))
+    }
+
+    /// Give the handle back for the caller to await; the emptied guard aborts nothing
+    fn disarm(mut self) -> tokio::task::JoinHandle<T> {
+        std::mem::take(&mut self.0).expect("disarm called once")
+    }
+}
+
+impl<T> Drop for AbortOnDrop<T> {
+    fn drop(&mut self) {
+        if let Some(task) = self.0.as_ref() {
+            task.abort();
+        }
+    }
+}
+
+/// Streams one tar part from `reader` through compression, encryption and the network
+/// throttle into `storage` at `key`; returns the bytes counted after compression + encryption
+async fn upload_part(
+    reader: ChannelReader,
+    key: String,
+    settings: Settings,
+    storage: DynStorage,
+) -> Result<u64> {
+    let sent = Arc::new(AtomicU64::new(0));
+    let source: AsyncBufReader = Box::pin(reader);
+    let stream =
+        compression::encode_buffered(settings.compression, source, settings.compression_level);
+    let stream = settings.encrypt(stream);
+    let stream = push::wrap_counted_reader(stream, Arc::clone(&sent));
+    let stream = settings.throttle_network(stream);
+    let put = storage.put(&key, stream, None).await;
+    put.with_context(|| format!("put {key}"))?;
+    Ok(sent.load(Ordering::Relaxed))
+}
+
+/// Compress+encrypt a small in-memory tar (read in place) and PUT it; returns compressed bytes
+async fn upload_bytes(
+    settings: &Settings,
+    storage: &DynStorage,
+    key: &str,
+    bytes: Bytes,
+) -> Result<u64> {
+    let raw: AsyncReader = Box::pin(std::io::Cursor::new(bytes));
+    let compressed = compression::encode(settings.compression, raw, settings.compression_level);
+    let encrypted = settings.encrypt(compressed);
+    let counter = Arc::new(AtomicU64::new(0));
+    let counting = push::wrap_counted_reader(encrypted, counter.clone());
+    let throttled = settings.throttle_network(counting);
+    storage
+        .put(key, throttled, None)
+        .await
+        .with_context(|| format!("put {key}"))?;
+    Ok(counter.load(Ordering::Relaxed))
+}
+
+/// Wraps the pg_control file at `abs`, as read now, in a one-entry in-memory tar
+async fn build_pg_control_tar(abs: &Path) -> Result<Bytes> {
+    let data = tokio::fs::read(abs).await;
+    let data = data.with_context(|| format!("read {}", abs.display()))?;
+    let mut hdr = Header::new_gnu();
+    hdr.set_entry_type(EntryType::Regular);
+    hdr.set_size(data.len() as u64);
+    hdr.set_mode(0o600);
+    hdr.set_mtime(0);
+    let mut tar = Builder::new(Vec::new());
+    tar.append_data(&mut hdr, PG_CONTROL_ENTRY, &data[..])
+        .await
+        .context("append pg_control tee")?;
+    tar.finish().await.context("finish pg_control tar")?;
+    let buf = tar.into_inner().await.context("into_inner pg_control tar")?;
+    Ok(buf.into())
+}
+
+/// Packs `(name, content)` pairs into an in-memory tar as mode-0600 regular files, mtime 0
+async fn build_small_tar(entries: &[(&str, &str)]) -> Result<Bytes> {
+    let mut tar = Builder::new(Vec::new());
+    for &(name, content) in entries {
+        let mut hdr = Header::new_gnu();
+        hdr.set_entry_type(EntryType::Regular);
+        hdr.set_size(content.len() as u64);
+        hdr.set_mode(0o600);
+        hdr.set_mtime(0);
+        let appended = tar.append_data(&mut hdr, name, content.as_bytes()).await;
+        appended.with_context(|| format!("append {name}"))?;
+    }
+    tar.finish().await.context("finish tar")?;
+    let buf = tar.into_inner().await.context("into_inner tar")?;
+    Ok(buf.into())
+}
+
+// ─── filesystem walk ────────────────────────────────────────────────────────
+
+fn walk_data_dir(
+    pgdata: &Path,
+    tar_size: u64,
+    tx: mpsc::Sender<Vec<WalkEntry>>,
+) -> Result<WalkMeta> {
+    let mut sink = WalkSink {
+        pg_control: None,
+        tablespaces: Vec::new(),
+        batcher: Batcher::new(tar_size, tx),
+    };
+    walk_dir(pgdata, "", &mut sink)?; // "" prefix: tar paths are relative to pgdata
+    sink.batcher.flush()?; // ship the final, partially filled batch
+    Ok(WalkMeta {
+        entry_count: sink.batcher.count,
+        pg_control: sink.pg_control,
+        tablespaces: sink.tablespaces,
+    })
+}
+
+/// Recursively walk `dir`, sinking its entries into `out` with tar paths under `rel_prefix`
+fn walk_dir(dir: &Path, rel_prefix: &str, out: &mut WalkSink) -> Result<()> {
+    let read = std::fs::read_dir(dir).with_context(|| format!("read_dir {}", dir.display()))?;
+    for entry in read {
+        let entry = entry?;
+        let name = entry.file_name().to_string_lossy().into_owned();
+        let abs = entry.path();
+        // file_type() may lstat on some platforms, so a vanished entry can show up here too
+        let ft = match entry.file_type() {
+            Ok(ft) => ft,
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => continue,
+            Err(e) => return Err(e).with_context(|| format!("file_type {}", abs.display())),
+        };
+        let rel = if rel_prefix.is_empty() {
+            name.clone()
+        } else {
+            format!("{rel_prefix}/{name}")
+        };
+
+        if ft.is_symlink() {
+            // only pg_tblspc/<oid> symlinks matter: record it, walk the target remapped there
+            if rel_prefix == "pg_tblspc"
+                && let Ok(oid) = name.parse::<u32>()
+            {
+                let target = std::fs::read_link(&abs)
+                    .with_context(|| format!("readlink {}", abs.display()))?;
+                out.tablespaces.push((oid, target.display().to_string()));
+                walk_dir(&target, &rel, out)?;
+            }
+            continue;
+        }
+
+        let excluded = EXCLUDED.contains(&name.as_str());
+        // Drop files before stat: an excluded file (eg pg_internal.init) can vanish after
+        // readdir. pg_control rides only in pg_control.tar (applied last on restore)
+        if ft.is_file() {
+            if excluded {
+                continue;
+            }
+            if rel == PG_CONTROL_ENTRY {
+                out.pg_control = Some(abs);
+                continue;
+            }
+        }
+
+        let meta = match entry.metadata() {
+            Ok(m) => m,
+            // vanished between readdir and stat (eg DROP TABLE); WAL replay redoes the removal
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => continue,
+            Err(e) => return Err(e).with_context(|| format!("stat {}", abs.display())),
+        };
+        let mode = meta.permissions().mode();
+        let mtime = mtime_secs(&meta);
+
+        if ft.is_dir() {
+            // emit excluded dirs too so they exist on restore, but don't recurse into them
+            out.batcher.push(WalkEntry {
+                kind: EntryKind::Dir,
+                tar_path: rel.clone(),
+                abs: abs.clone(),
+                size: 0,
+                mode,
+                mtime,
+            })?;
+            if !excluded {
+                walk_dir(&abs, &rel, out)?;
+            }
+        } else if ft.is_file() {
+            out.batcher.push(WalkEntry {
+                kind: EntryKind::File,
+                tar_path: rel,
+                abs,
+                size: meta.len(),
+                mode,
+                mtime,
+            })?;
+        }
+    }
+    Ok(())
+}
+
+fn mtime_secs(meta: &std::fs::Metadata) -> i64 {
+    match meta.modified().map(|t| t.duration_since(UNIX_EPOCH)) {
+        Ok(Ok(since_epoch)) => since_epoch.as_secs() as i64,
+        // unreadable or pre-epoch mtime
+        _ => 0,
+    }
+}
+
+// ─── pg_backup_start / pg_backup_stop ───────────────────────────────────────
+
+async fn backup_start(
+    conn: &mut ReplicationConn,
+    pg_version: i32,
+    label: &str,
+    fast: bool,
+) -> Result<u64> {
+    // Non-exclusive, session-scoped backup. PG15+ renamed the function; the
+    // older name takes a trailing `false` argument
+    let label = sql_lit(label);
+    let sql = match pg_version {
+        v if v >= 150000 => format!("SELECT pg_backup_start('{label}', {fast})"),
+        _ => format!("SELECT pg_start_backup('{label}', {fast}, false)"),
+    };
+    let rows = conn.query_rows(&sql).await.context("pg_backup_start")?;
+    let Some(lsn) = first_col(&rows) else {
+        bail!("pg_backup_start returned no LSN");
+    };
+    parse_pg_lsn(&lsn).context("parse start LSN")
+}
+
+/// Ends the backup and returns (end_lsn, backup_label, tablespace_map). A label
+/// or map column that is missing from the row, or None, comes back as an empty
+/// string; a missing row or LSN is an error
+async fn backup_stop(conn: &mut ReplicationConn, pg_version: i32) -> Result<(u64, String, String)> {
+    // wait_for_archive=false: walrus ships WAL separately, and waiting can hang
+    // when no archiver is running
+    let stop_call = match pg_version {
+        v if v >= 150000 => "pg_backup_stop(false)",
+        _ => "pg_stop_backup(false, false)",
+    };
+    let sql = format!("SELECT lsn::text, labelfile, spcmapfile FROM {stop_call}");
+    let rows = conn.query_rows(&sql).await.context("pg_backup_stop")?;
+    let Some(row) = rows.first() else {
+        bail!("pg_backup_stop returned no row");
+    };
+    // None when the row is too short or the column itself is None
+    let text_at = |idx: usize| row.get(idx).cloned().flatten();
+    let Some(lsn) = text_at(0) else {
+        bail!("pg_backup_stop returned no LSN");
+    };
+    let labelfile = text_at(1).unwrap_or_default();
+    let spcmapfile = text_at(2).unwrap_or_default();
+    let end_lsn = parse_pg_lsn(&lsn).context("parse end LSN")?;
+    Ok((end_lsn, labelfile, spcmapfile))
+}
+
+/// Runs `sql` and parses the first column of its first row as a `u64`
+async fn query_u64(conn: &mut ReplicationConn, sql: &str) -> Result<u64> {
+    let Some(cell) = first_col(&conn.query_rows(sql).await?) else {
+        return Err(anyhow!("`{sql}` returned no value"));
+    };
+    let parsed = cell.trim().parse::<u64>(); // surrounding whitespace is tolerated
+    parsed.with_context(|| format!("parse u64 from `{sql}`"))
+}
+
+async fn show_setting(conn: &mut ReplicationConn, name: &str) -> Result<String> {
+    let reply = conn.query_rows(&format!("SHOW {name}")).await?; // `name` goes in unquoted
+    first_col(&reply).ok_or_else(|| anyhow!("SHOW {name} returned no rows"))
+}
+
+fn first_col(rows: &[Vec<Option<String>>]) -> Option<String> {
+    rows.first()?.first()?.clone()
+}
+
+fn sql_lit(s: &str) -> String {
+    s.replace('\'', "''") // ' → '' so the text can sit inside a '...' SQL literal
+}
+
+/// Builds the delta map for an incremental push on top of `parent`. `None` tells
+/// the caller to take a full backup instead
+#[allow(clippy::too_many_arguments)]
+async fn build_delta_context(
+    settings: &Settings,
+    storage: &DynStorage,
+    parent: Option<&delta::PrevBackupInfo>,
+    args: &PushArgs,
+    increment_format: IncrementFormat,
+    pgdata: &Path,
+    timeline: u32,
+    start_lsn: u64,
+) -> Option<DeltaContext> {
+    let prev = parent?;
+    if start_lsn <= prev.start_lsn {
+        tracing::warn!(
+            target = "backup_push",
+            "new start LSN <= parent; producing a full backup",
+        );
+        return None;
+    }
+    // map covers parent start LSN up to the new start LSN: read from WAL
+    // summaries under `pgdata` when requested, otherwise built via `storage`
+    let built = if args.delta_from_wal_summaries {
+        push::build_delta_map_from_summaries(Some(pgdata), timeline, prev.start_lsn, start_lsn)
+    } else {
+        delta::build_delta_map_from_wal(
+            settings,
+            storage,
+            prev.timeline,
+            prev.start_lsn,
+            start_lsn,
+            settings.compression,
+        )
+        .await
+    };
+    let map = match built {
+        Ok(map) => map,
+        // best effort: a failed map build degrades to a full backup
+        Err(e) => {
+            tracing::warn!(
+                target = "backup_push",
+                "delta map build failed ({e:#}); producing a full backup",
+            );
+            return None;
+        }
+    };
+    tracing::info!(target = "backup_push", "delta map: {} dirty page(s)", map.len());
+    Some(DeltaContext {
+        map: Arc::new(map),
+        format: increment_format,
+        parent_files: prev.parent_files.clone(),
+    })
+}
+
+// ─── fixed-size body reader ─────────────────────────────────────────────────
+
+/// Emits exactly `remaining` bytes from `inner`: truncates if the file grew,
+/// zero-pads if it shrank, since a file can change between stat and read under
+/// pg_backup_start. Keeps the tar body length matching the header size
+struct FixedSizeReader<R> {
+    /// underlying byte source
+    inner: R,
+    /// bytes still to be emitted
+    remaining: u64,
+    /// set once `inner` hit EOF early; everything after that is zero padding
+    inner_eof: bool,
+}
+
+impl<R> FixedSizeReader<R> {
+    /// Wraps `inner` so that exactly `size` bytes are produced
+    fn new(inner: R, size: u64) -> Self {
+        Self {
+            inner,
+            remaining: size,
+            inner_eof: false,
+        }
+    }
+}
+
+impl<R: tokio::io::AsyncRead + Unpin> tokio::io::AsyncRead for FixedSizeReader<R> {
+    /// Forwards reads to `inner`, capped at the bytes still owed. Once `inner`
+    /// reports EOF early the shortfall is served as zeros; with nothing left to
+    /// emit (or no room in `buf`) it returns without filling, which reads as EOF
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut tokio::io::ReadBuf<'_>,
+    ) -> std::task::Poll<std::io::Result<()>> {
+        use std::task::Poll;
+        let me = self.get_mut();
+        // want == 0 covers both "all bytes emitted" and "caller's buf is full"
+        let want = (buf.remaining() as u64).min(me.remaining) as usize;
+        if want == 0 {
+            return Poll::Ready(Ok(()));
+        }
+        if !me.inner_eof {
+            let n = {
+                let mut tmp = tokio::io::ReadBuf::new(buf.initialize_unfilled_to(want));
+                match std::pin::Pin::new(&mut me.inner).poll_read(cx, &mut tmp) {
+                    Poll::Pending => return Poll::Pending,
+                    Poll::Ready(Err(e)) => return Poll::Ready(Err(e)),
+                    Poll::Ready(Ok(())) => tmp.filled().len(),
+                }
+            };
+            if n > 0 {
+                buf.advance(n);
+                me.remaining -= n as u64;
+                return Poll::Ready(Ok(()));
+            }
+            // file shorter than recorded size: pad the rest with zeros
+            me.inner_eof = true;
+        }
+        // initialize_unfilled_to only zeroes bytes that were never initialized;
+        // a reused caller buffer still holds old data, so clear it explicitly
+        buf.initialize_unfilled_to(want).fill(0);
+        buf.advance(want);
+        me.remaining -= want as u64;
+        Poll::Ready(Ok(()))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io::Read as _;
+
+    use super::*;
+    use crate::compression::Method;
+    use crate::storage::fs::FsStorage;
+    use tokio::io::AsyncReadExt;
+
+    fn write_file(root: &Path, rel: &str, content: &[u8]) {
+        let p = root.join(rel);
+        std::fs::create_dir_all(p.parent().unwrap()).unwrap();
+        std::fs::write(p, content).unwrap();
+    }
+
+    /// Run the streaming walk to completion and flatten every batch back into one
+    /// entry list, for tests that inspect the walk's output rather than pack it
+    async fn walk_collect(root: &Path, tar_size: u64) -> (Vec<WalkEntry>, WalkMeta) {
+        let (tx, mut rx) = mpsc::channel::<Vec<WalkEntry>>(1024);
+        let root = root.to_path_buf();
+        let handle = tokio::task::spawn_blocking(move || walk_data_dir(&root, tar_size, tx));
+        let mut entries = Vec::new();
+        while let Some(batch) = rx.recv().await {
+            entries.extend(batch);
+        }
+        let meta = handle.await.unwrap().unwrap();
+        (entries, meta)
+    }
+
+    /// Walk to completion into a shared receiver for driving `pack_worker`. Batches wait in
+    /// the 1024-slot channel (inputs are tiny); the sender drops with the walk → worker sees EOF
+    async fn walk_batches(
+        root: &Path,
+        tar_size: u64,
+    ) -> Arc<Mutex<mpsc::Receiver<Vec<WalkEntry>>>> {
+        let (tx, rx) = mpsc::channel::<Vec<WalkEntry>>(1024);
+        let root = root.to_path_buf();
+        tokio::task::spawn_blocking(move || walk_data_dir(&root, tar_size, tx))
+            .await
+            .unwrap()
+            .unwrap();
+        Arc::new(Mutex::new(rx))
+    }
+
+    #[test]
+    fn is_pgdata_dir_detects_marker() {
+        let dir = tempfile::tempdir().unwrap();
+        assert!(!is_pgdata_dir(dir.path()));
+        std::fs::write(dir.path().join("PG_VERSION"), b"16").unwrap();
+        assert!(is_pgdata_dir(dir.path()));
+    }
+
+    #[tokio::test]
+    async fn walk_excludes_dirs_files_and_tees_pg_control() {
+        let dir = tempfile::tempdir().unwrap();
+        let root = dir.path();
+        write_file(root, "PG_VERSION", b"16");
+        write_file(root, "base/1/1234", b"relation");
+        write_file(root, "global/pg_control", b"control");
+        write_file(root, "global/pg_internal.init", b"relcache");
+        write_file(root, "base/1/pg_internal.init", b"relcache");
+        write_file(root, "pg_wal/000000010000000000000001", b"walseg");
+        write_file(root, "postmaster.pid", b"123");
+        write_file(root, "standby.signal", b"");
+        write_file(root, "recovery.signal", b"");
+
+        let (entries, meta) = walk_collect(root, u64::MAX).await;
+        let paths: std::collections::HashSet<&str> =
+            entries.iter().map(|e| e.tar_path.as_str()).collect();
+
+        assert!(paths.contains("PG_VERSION"));
+        assert!(paths.contains("base/1/1234"));
+        // excluded dir present as an (empty) entry, its contents are not
+        assert!(paths.contains("pg_wal"));
+        assert!(!paths.iter().any(|p| p.starts_with("pg_wal/")));
+        // excluded file dropped entirely
+        assert!(!paths.contains("postmaster.pid"));
+        // pg_internal.init churns under relcache invalidation; dropped in every
+        // directory (global + per-database) so a stat→open can't race a vanish
+        assert!(!paths.iter().any(|p| p.ends_with("pg_internal.init")));
+        // signal files dropped so a restore controls its own recovery state
+        assert!(!paths.contains("standby.signal"));
+        assert!(!paths.contains("recovery.signal"));
+        // pg_control rides only in the tee, never a regular entry
+        assert!(!paths.contains("global/pg_control"));
+        assert_eq!(meta.pg_control, Some(root.join("global/pg_control")));
+
+        let pg_wal = entries.iter().find(|e| e.tar_path == "pg_wal").unwrap();
+        assert!(matches!(pg_wal.kind, EntryKind::Dir));
+    }
+
+    /// pg_tblspc/<oid> symlinks: record (oid, on-disk target) and remap the
+    /// target's contents under pg_tblspc/<oid>/ in the tar
+    #[tokio::test]
+    async fn walk_remaps_tablespace_symlink() {
+        let dir = tempfile::tempdir().unwrap();
+        let root = dir.path().join("pgdata");
+        write_file(&root, "PG_VERSION", b"16");
+        write_file(&root, "global/pg_control", b"control");
+
+        // external tablespace location holding a relation file
+        let ts = dir.path().join("tblspc_a");
+        write_file(&ts, "PG_16_202307071/16400/12345", &[9u8; 100]);
+        std::fs::create_dir_all(root.join("pg_tblspc")).unwrap();
+        std::os::unix::fs::symlink(&ts, root.join("pg_tblspc/16384")).unwrap();
+
+        let (entries, meta) = walk_collect(&root, u64::MAX).await;
+        let paths: std::collections::HashSet<&str> =
+            entries.iter().map(|e| e.tar_path.as_str()).collect();
+
+        // tablespace recorded by oid → on-disk target
+        assert_eq!(meta.tablespaces, vec![(16384u32, ts.display().to_string())]);
+        // pg_tblspc dir emitted; target contents remapped beneath the oid
+        assert!(paths.contains("pg_tblspc"));
+        assert!(paths.contains("pg_tblspc/16384/PG_16_202307071/16400/12345"));
+        // symlinked relation file points back at its real on-disk location
+        let rel = entries
+            .iter()
+            .find(|e| e.tar_path == "pg_tblspc/16384/PG_16_202307071/16400/12345")
+            .unwrap();
+        assert!(matches!(rel.kind, EntryKind::File));
+        assert_eq!(rel.size, 100);
+        assert_eq!(rel.abs, ts.join("PG_16_202307071/16400/12345"));
+    }
+
+    fn file_entry(path: &str, size: u64) -> WalkEntry {
+        WalkEntry {
+            kind: EntryKind::File,
+            tar_path: path.into(),
+            abs: PathBuf::new(),
+            size,
+            mode: 0o644,
+            mtime: 0,
+        }
+    }
+
+    #[tokio::test]
+    async fn batcher_rotation() {
+        // threshold 100: [40, 40] share a part; the next 40, oversize 500 and last 10 go alone
+        let (tx, mut rx) = mpsc::channel::<Vec<WalkEntry>>(64);
+        // blocking_send must run off the runtime, hence spawn_blocking; the flush is explicit
+        tokio::task::spawn_blocking(move || {
+            let mut b = Batcher::new(100, tx);
+            for e in [
+                file_entry("a", 40),
+                file_entry("b", 40),
+                file_entry("c", 40),
+                file_entry("big", 500),
+                file_entry("d", 10),
+            ] {
+                b.push(e).unwrap();
+            }
+            b.flush().unwrap();
+        })
+        .await
+        .unwrap();
+
+        let mut batches: Vec<Vec<String>> = Vec::new();
+        while let Some(batch) = rx.recv().await {
+            batches.push(batch.iter().map(|e| e.tar_path.clone()).collect());
+        }
+        let got: Vec<Vec<&str>> = batches
+            .iter()
+            .map(|b| b.iter().map(String::as_str).collect())
+            .collect();
+        assert_eq!(got, vec![vec!["a", "b"], vec!["c"], vec!["big"], vec!["d"]]);
+    }
+
+    #[tokio::test]
+    async fn fixed_size_reader_truncates_and_pads() {
+        // truncate: inner holds 6 bytes, size is 4
+        let mut r = FixedSizeReader::new(std::io::Cursor::new(b"abcdef".to_vec()), 4);
+        let mut out = Vec::new();
+        r.read_to_end(&mut out).await.unwrap();
+        assert_eq!(out, b"abcd");
+
+        // pad: inner holds 3 bytes, size is 6 → zero-filled tail
+        let mut r = FixedSizeReader::new(std::io::Cursor::new(b"abc".to_vec()), 6);
+        let mut out = Vec::new();
+        r.read_to_end(&mut out).await.unwrap();
+        assert_eq!(out, b"abc\0\0\0");
+    }
+
+    /// walk → concurrent pack → read parts back: every file & dir survives
+    /// byte-clean through the async packer (uncompressed for a simple check)
+    #[tokio::test]
+    async fn pack_roundtrip_to_storage() {
+        let dir = tempfile::tempdir().unwrap();
+        let root = dir.path().join("pgdata");
+        write_file(&root, "PG_VERSION", b"16");
+        write_file(&root, "base/1/1234", &vec![7u8; 5000]);
+        write_file(&root, "base/1/5678", b"small");
+        write_file(&root, "global/123", &vec![3u8; 9000]);
+        write_file(&root, "pg_wal/seg", b"excluded");
+
+        // expected file bodies (pg_wal/seg is excluded by the walk)
+        let expect: std::collections::HashMap<String, Vec<u8>> = [
+            ("PG_VERSION".to_string(), b"16".to_vec()),
+            ("base/1/1234".to_string(), vec![7u8; 5000]),
+            ("base/1/5678".to_string(), b"small".to_vec()),
+            ("global/123".to_string(), vec![3u8; 9000]),
+        ]
+        .into_iter()
+        .collect();
+
+        let store_dir = tempfile::tempdir().unwrap();
+        let storage: DynStorage = Arc::new(FsStorage::new(store_dir.path()).unwrap());
+        let settings = Settings {
+            compression: Method::None,
+            ..Default::default()
+        };
+
+        let batch_rx = walk_batches(&root, 4096).await;
+        let counter = Arc::new(AtomicU32::new(0));
+        let name = "base_test";
+        let res = pack_worker(
+            batch_rx,
+            counter,
+            settings,
+            storage.clone(),
+            name.to_string(),
+            None,
+        )
+        .await
+        .unwrap();
+        assert!(res.max_file_no >= 1);
+
+        // read every emitted part & collect file bodies
+        let mut got: std::collections::HashMap<String, Vec<u8>> = std::collections::HashMap::new();
+        let mut part_bytes_total: u64 = 0;
+        for file_no in 1..=res.max_file_no {
+            let key = tar_part_key(name, file_no, "");
+            let mut body = storage.get(&key).await.unwrap();
+            let mut bytes = Vec::new();
+            body.read_to_end(&mut bytes).await.unwrap();
+            part_bytes_total += bytes.len() as u64;
+            let mut ar = tar::Archive::new(&bytes[..]);
+            for e in ar.entries().unwrap() {
+                let mut e = e.unwrap();
+                let p = e.path().unwrap().to_string_lossy().into_owned();
+                if e.header().entry_type().is_dir() {
+                    continue;
+                }
+                let mut c = Vec::new();
+                e.read_to_end(&mut c).unwrap();
+                got.insert(p, c);
+            }
+        }
+
+        assert_eq!(got.len(), expect.len(), "file count mismatch: {got:?}");
+        for (path, content) in &expect {
+            assert_eq!(got.get(path), Some(content), "mismatch for {path}");
+        }
+        // excluded file never made it into a part
+        assert!(!got.contains_key("pg_wal/seg"));
+        // uncompressed_size counts real tar bytes (headers, padding, dir
+        // entries), not just logical file bodies: with Method::None the stored
+        // part bytes equal the tar bytes the PartWriter counted
+        assert_eq!(
+            res.uncompressed as u64, part_bytes_total,
+            "uncompressed must equal actual tar part bytes"
+        );
+    }
+
+    #[tokio::test]
+    async fn open_walked_tolerates_missing() {
+        let dir = tempfile::tempdir().unwrap();
+        let present = dir.path().join("here");
+        std::fs::write(&present, b"x").unwrap();
+        assert!(open_walked(&present).await.unwrap().is_some());
+        assert!(
+            open_walked(&dir.path().join("gone"))
+                .await
+                .unwrap()
+                .is_none()
+        );
+    }
+
+    /// A relation unlinked between walk and pack (DROP TABLE) is dropped from the
+    /// backup without failing the part, matching wal-g
+    #[tokio::test]
+    async fn pack_skips_file_removed_after_walk() {
+        let dir = tempfile::tempdir().unwrap();
+        let root = dir.path().join("pgdata");
+        write_file(&root, "PG_VERSION", b"16");
+        write_file(&root, "base/1/1234", b"relation");
+        write_file(&root, "base/1/5678", b"dropme");
+
+        // walk records the file, then simulate DROP TABLE before the pack opens it
+        let batch_rx = walk_batches(&root, 4096).await;
+        std::fs::remove_file(root.join("base/1/5678")).unwrap();
+
+        let store_dir = tempfile::tempdir().unwrap();
+        let storage: DynStorage = Arc::new(FsStorage::new(store_dir.path()).unwrap());
+        let settings = Settings {
+            compression: Method::None,
+            ..Default::default()
+        };
+        let res = pack_worker(
+            batch_rx,
+            Arc::new(AtomicU32::new(0)),
+            settings,
+            storage,
+            "base_drop".to_string(),
+            None,
+        )
+        .await
+        .unwrap();
+
+        assert!(res.files.contains_key("base/1/1234"));
+        assert!(!res.files.contains_key("base/1/5678"));
+    }
+}
diff --git a/src/pg/backup/increment.rs b/src/pg/backup/increment.rs
index 2d6ba59..92de56d 100644
--- a/src/pg/backup/increment.rs
+++ b/src/pg/backup/increment.rs
@@ -274,7 +274,7 @@ where
     R: Read,
     W: io::Write + io::Seek,
 {
-    let mut page = vec![0u8; PG_PAGE_SIZE as usize];
+    let mut page = [0u8; PG_PAGE_SIZE as usize];
     for &block_no in blocks {
         increment.read_exact(&mut page)?;
         target.seek(io::SeekFrom::Start(block_no as u64 * PG_PAGE_SIZE))?;
@@ -328,7 +328,7 @@ mod tests {
 
     #[test]
     fn wi1_apply_writes_at_block_offsets() {
-        let mut target = Cursor::new(vec![0u8; PG_PAGE_SIZE as usize * 3]);
+        let mut target = Cursor::new([0u8; PG_PAGE_SIZE as usize * 3]);
         let mut inc = Vec::new();
         write_increment_header(&mut inc, PG_PAGE_SIZE * 3, &[1]).unwrap();
         inc.extend(std::iter::repeat_n(0xAA, PG_PAGE_SIZE as usize));
@@ -340,7 +340,7 @@ mod tests {
         assert_eq!(fmt, Format::Wi1);
 
         target.seek(SeekFrom::Start(0)).unwrap();
-        let mut b = vec![0u8; PG_PAGE_SIZE as usize];
+        let mut b = [0u8; PG_PAGE_SIZE as usize];
         target.read_exact(&mut b).unwrap();
         assert!(b.iter().all(|&x| x == 0));
         target.read_exact(&mut b).unwrap();
@@ -351,7 +351,7 @@ mod tests {
 
     #[test]
     fn wi1_trailing_data_rejected() {
-        let mut target = Cursor::new(vec![0u8; PG_PAGE_SIZE as usize * 2]);
+        let mut target = Cursor::new([0u8; PG_PAGE_SIZE as usize * 2]);
         let mut inc = Vec::new();
         write_increment_header(&mut inc, PG_PAGE_SIZE * 2, &[0]).unwrap();
         inc.extend(std::iter::repeat_n(0xCC, PG_PAGE_SIZE as usize));
@@ -395,14 +395,14 @@ mod tests {
         // block body for block 1
         inc.extend(std::iter::repeat_n(0xBB, PG_PAGE_SIZE as usize));
 
-        let mut target = Cursor::new(vec![0xAA; PG_PAGE_SIZE as usize * 5]);
+        let mut target = Cursor::new([0xAA; PG_PAGE_SIZE as usize * 5]);
         let (size, n, fmt) = apply_increment_in_place(&mut Cursor::new(inc), &mut target).unwrap();
         assert_eq!(size, PG_PAGE_SIZE * 5);
         assert_eq!(n, 1);
         assert_eq!(fmt, Format::Native);
 
         target.seek(SeekFrom::Start(PG_PAGE_SIZE)).unwrap();
-        let mut buf = vec![0u8; PG_PAGE_SIZE as usize];
+        let mut buf = [0u8; PG_PAGE_SIZE as usize];
         target.read_exact(&mut buf).unwrap();
         assert!(buf.iter().all(|&b| b == 0xBB));
     }
@@ -416,7 +416,7 @@ mod tests {
         inc.extend(std::iter::repeat_n(0x11, PG_PAGE_SIZE as usize));
         inc.extend(std::iter::repeat_n(0x22, PG_PAGE_SIZE as usize));
 
-        let mut target = Cursor::new(vec![0u8; PG_PAGE_SIZE as usize * 4]);
+        let mut target = Cursor::new([0u8; PG_PAGE_SIZE as usize * 4]);
         let (size, _, _) = apply_increment_in_place(&mut Cursor::new(inc), &mut target).unwrap();
         assert_eq!(size, PG_PAGE_SIZE * 3);
     }
@@ -428,7 +428,7 @@ mod tests {
         write_native_increment_header(&mut inc, 10, &[]).unwrap();
         assert_eq!(inc.len(), 12);
 
-        let mut target = Cursor::new(vec![0u8; PG_PAGE_SIZE as usize * 10]);
+        let mut target = Cursor::new([0u8; PG_PAGE_SIZE as usize * 10]);
         let (size, n, fmt) = apply_increment_in_place(&mut Cursor::new(inc), &mut target).unwrap();
         assert_eq!(size, PG_PAGE_SIZE * 10);
         assert_eq!(n, 0);
@@ -442,14 +442,14 @@ mod tests {
         write_native_increment_header(&mut inc, 1, &blocks).unwrap();
         inc.extend(std::iter::repeat_n(0xCC, PG_PAGE_SIZE as usize));
         inc.push(0x42);
-        let mut target = Cursor::new(vec![0u8; PG_PAGE_SIZE as usize]);
+        let mut target = Cursor::new([0u8; PG_PAGE_SIZE as usize]);
         let err = apply_increment_in_place(&mut Cursor::new(inc), &mut target).unwrap_err();
         assert!(matches!(err, IncrementError::UnexpectedTrailing));
     }
 
     #[test]
     fn apply_rejects_unknown_magic() {
-        let mut target = Cursor::new(vec![0u8; PG_PAGE_SIZE as usize]);
+        let mut target = Cursor::new([0u8; PG_PAGE_SIZE as usize]);
         let buf = vec![0xDE, 0xAD, 0xBE, 0xEF, 0x00];
         let err = apply_increment_in_place(&mut Cursor::new(buf), &mut target).unwrap_err();
         assert!(matches!(err, IncrementError::BadMagic(_)));
diff --git a/src/pg/backup/mod.rs b/src/pg/backup/mod.rs
index a7c1a80..ecf4d48 100644
--- a/src/pg/backup/mod.rs
+++ b/src/pg/backup/mod.rs
@@ -12,6 +12,7 @@ pub mod copy;
 pub mod delete;
 pub mod delta;
 pub mod fetch;
+pub mod fs_push;
 pub mod increment;
 pub mod list;
 pub mod push;
diff --git a/src/pg/backup/push.rs b/src/pg/backup/push.rs
index ea7238c..81372ae 100644
--- a/src/pg/backup/push.rs
+++ b/src/pg/backup/push.rs
@@ -8,7 +8,7 @@
 //! The data dir's `global/pg_control` is teed into a separate `pg_control.tar`
 //! so `backup-fetch` can apply it last (matches wal-g's restore ordering)
 //!
-//! `--pgdata` is optional; absent it, the sentinel records the PG-reported
+//! Local PGDATA is optional; absent it, the sentinel records the PG-reported
 //! `data_directory` and we never touch the local filesystem
 
 use std::path::PathBuf;
@@ -21,7 +21,7 @@ use bytes::{Bytes, BytesMut};
 use tokio::io::{AsyncRead, ReadBuf};
 use tokio::sync::mpsc;
 
-use crate::compression::{self, AsyncReader};
+use crate::compression::{self, AsyncBufReader, AsyncReader};
 use crate::concurrency::BoundedTasks;
 use crate::config::Settings;
 use crate::pg::backup::delta::{self, PrevBackupInfo};
@@ -35,7 +35,8 @@ use crate::pg::backup::{
 };
 use crate::pg::replication::PgConfig;
 use crate::pg::replication::base_backup::{
-    BackupEvent, BaseBackupOpts, ChannelReader, Tablespace, run_base_backup,
+    BackupEvent, BaseBackupOpts, ChannelReader, Tablespace, max_rate_kib_from_bytes,
+    run_base_backup,
 };
 use crate::pg::replication::conn::ReplicationConn;
 use crate::storage::DynStorage;
@@ -66,6 +67,15 @@ pub struct PushArgs {
 }
 
 pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) -> Result<()> {
+    // A local PGDATA directory selects the filesystem source (wal-g
+    // semantics): walks the data dir & packs parts concurrently. Without a
+    // readable local pgdata, fall through to the single-stream BASE_BACKUP path
+    if let Some(pgdata) = args.pgdata.as_deref()
+        && super::fs_push::is_pgdata_dir(pgdata)
+    {
+        return super::fs_push::handle(settings, storage, args).await;
+    }
+
     let start_time = chrono::Utc::now();
 
     // Resolve a delta parent if WALG_DELTA_MAX_STEPS > 0 (or --delta-from-
@@ -149,18 +159,25 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) ->
         }
         if parent.is_some() && args.pgdata.is_none() {
             bail!(
-                "--delta-from-wal-summaries requires --pgdata: WAL summaries live on \
+                "--delta-from-wal-summaries requires local PGDATA: WAL summaries live on \
                  the PG host filesystem & cannot be read remotely"
             );
         }
     }
 
     let label = format!("walrus {}", chrono::Utc::now().format("%Y%m%dT%H%M%SZ"));
+    let max_rate_kib = max_rate_kib_from_bytes(settings.disk_rate_limit);
+    if let Some(rate) = max_rate_kib {
+        tracing::info!(
+            target = "backup_push",
+            "BASE_BACKUP rate limited to {rate} kB/s (WALG_DISK_RATE_LIMIT)",
+        );
+    }
     let opts = BaseBackupOpts {
         label: label.clone(),
         fast_checkpoint: args.fast_checkpoint,
         no_verify_checksums: args.no_verify_checksums,
-        max_rate_kib: None,
+        max_rate_kib,
         // wal-g push uploads tablespaces separately and ships WAL via
         // `wal-push`; inlining the segments would duplicate them
         wal: false,
@@ -352,9 +369,12 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) ->
                     let cfg = settings.clone();
                     uploads
                         .spawn(async move {
-                            let reader: AsyncReader = Box::pin(part.reader);
-                            let compressed =
-                                compression::encode(cfg.compression, reader, cfg.compression_level);
+                            let reader: AsyncBufReader = Box::pin(part.reader);
+                            let compressed = compression::encode_buffered(
+                                cfg.compression,
+                                reader,
+                                cfg.compression_level,
+                            );
                             let encrypted = cfg.encrypt(compressed);
                             let counter = Arc::new(AtomicU64::new(0));
                             let counting = wrap_counted_reader(encrypted, counter.clone());
@@ -411,6 +431,94 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) ->
     let start_lsn = start_lsn.ok_or_else(|| anyhow!("no start LSN received"))?;
     let end_lsn = end_lsn.ok_or_else(|| anyhow!("no end LSN received"))?;
 
+    // Build TablespaceSpec from non-default tablespaces. Mirrors wal-g
+    let user_tablespaces: Vec<&Tablespace> =
+        tablespace_list.iter().filter(|t| !t.is_default()).collect();
+    let tablespace_spec = if user_tablespaces.is_empty() {
+        None
+    } else {
+        let mut spec = TablespaceSpec::new(&data_directory);
+        for t in &user_tablespaces {
+            spec.add(t.oid, &t.location);
+        }
+        Some(spec)
+    };
+
+    finalize_backup(Finalize {
+        settings,
+        storage: &storage,
+        backup_name,
+        start_lsn,
+        end_lsn,
+        pg_version,
+        system_identifier,
+        uncompressed_size,
+        compressed_size,
+        data_directory,
+        tablespace_spec,
+        tablespace_count: tablespace_list.len(),
+        all_files,
+        tar_file_sets,
+        pg_control_tee,
+        parent: parent.as_ref(),
+        delta_context: delta_context.as_ref(),
+        args: &args,
+        start_time,
+        part_count: file_no,
+    })
+    .await
+}
+
+/// Inputs to [`finalize_backup`], shared by the BASE_BACKUP & filesystem paths
+pub(crate) struct Finalize<'a> {
+    pub settings: &'a Settings,
+    pub storage: &'a DynStorage, // destination of the JSON uploads
+    pub backup_name: String, // feeds the files_metadata / metadata / sentinel keys
+    pub start_lsn: u64,
+    pub end_lsn: u64,
+    pub pg_version: i32,
+    pub system_identifier: u64,
+    pub uncompressed_size: i64,
+    pub compressed_size: i64, // finalize_backup adds the pg_control tee's bytes on top
+    pub data_directory: String,
+    pub tablespace_spec: Option<TablespaceSpec>,
+    pub tablespace_count: usize, // reported in the closing log line
+    pub all_files: std::collections::HashMap<String, FileDescription>, // → files_metadata.json
+    pub tar_file_sets: std::collections::HashMap<String, Vec<String>>, // → files_metadata.json
+    pub pg_control_tee: Option<Bytes>, // uploaded separately so restore can apply it last
+    pub parent: Option<&'a PrevBackupInfo>, // matched together with `delta_context`
+    pub delta_context: Option<&'a DeltaContext>, // when both are set, increment fields are filled
+    pub args: &'a PushArgs,
+    pub start_time: chrono::DateTime<chrono::Utc>,
+    pub part_count: u32, // reported in the closing log line
+}
+
+/// Upload pg_control tee, files_metadata.json, sentinel & metadata. Prints the
+/// backup name on success. Common tail for both backup-push source paths
+pub(crate) async fn finalize_backup(f: Finalize<'_>) -> Result<()> {
+    let Finalize {
+        settings,
+        storage,
+        backup_name,
+        start_lsn,
+        end_lsn,
+        pg_version,
+        system_identifier,
+        uncompressed_size,
+        mut compressed_size,
+        data_directory,
+        tablespace_spec,
+        tablespace_count,
+        all_files,
+        tar_file_sets,
+        pg_control_tee,
+        parent,
+        delta_context,
+        args,
+        start_time,
+        part_count,
+    } = f;
+
     // Upload pg_control.tar as a tee so restore can apply it last
     if let Some(bytes) = pg_control_tee {
         let ext = settings.compression.extension();
@@ -438,26 +546,13 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) ->
         compressed_size += put_counter.load(Ordering::Relaxed) as i64;
     }
 
-    // Build TablespaceSpec from non-default tablespaces. Mirrors wal-g
-    let user_tablespaces: Vec<&Tablespace> =
-        tablespace_list.iter().filter(|t| !t.is_default()).collect();
-    let tablespace_spec = if user_tablespaces.is_empty() {
-        None
-    } else {
-        let mut spec = TablespaceSpec::new(&data_directory);
-        for t in &user_tablespaces {
-            spec.add(t.oid, &t.location);
-        }
-        Some(spec)
-    };
-
     // Emit files_metadata.json sidecar
     let files_meta = FilesMetadataDto {
         files: all_files,
         tar_file_sets,
         databases_by_names: Default::default(),
     };
-    upload_json(&storage, &files_metadata_key(&backup_name), &files_meta).await?;
+    upload_json(storage, &files_metadata_key(&backup_name), &files_meta).await?;
 
     let hostname = hostname().unwrap_or_default();
     let finish_time = chrono::Utc::now();
@@ -468,7 +563,7 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) ->
     // must claim FULL — otherwise restore would walk a chain whose
     // increments don't exist
     let (incr_from_lsn, incr_from_name, incr_full_name, incr_count, incr_format) =
-        match (parent.as_ref(), delta_context.as_ref()) {
+        match (parent, delta_context) {
             (Some(p), Some(ctx)) => (
                 Some(p.start_lsn),
                 Some(p.name.clone()),
@@ -523,14 +618,14 @@ pub async fn handle(settings: &Settings, storage: DynStorage, args: PushArgs) ->
         user_data: args.user_data.clone(),
     };
 
-    upload_json(&storage, &metadata_key(&backup_name), &meta).await?;
-    upload_json(&storage, &sentinel_key(&backup_name), &v2).await?;
+    upload_json(storage, &metadata_key(&backup_name), &meta).await?;
+    upload_json(storage, &sentinel_key(&backup_name), &v2).await?;
 
     tracing::info!(
         target = "backup_push",
         "wrote {backup_name} ({} parts, {} tablespace(s), {} bytes uncompressed, {} bytes compressed)",
-        file_no,
-        tablespace_list.len(),
+        part_count,
+        tablespace_count,
         uncompressed_size,
         compressed_size,
     );
@@ -619,7 +714,7 @@ fn wrap_with_counter(input: AsyncReader) -> (CounterHandle, AsyncReader) {
     (CounterHandle(counter), Box::pin(r))
 }
 
-fn wrap_counted_reader(input: AsyncReader, counter: Arc<AtomicU64>) -> AsyncReader {
+pub(crate) fn wrap_counted_reader(input: AsyncReader, counter: Arc<AtomicU64>) -> AsyncReader {
     Box::pin(CountingReader {
         inner: input,
         counter,
@@ -642,15 +737,16 @@ fn resolve_increment_full_name(p: &PrevBackupInfo) -> String {
     }
 }
 
-/// PG17 wal-summaries → delta map. Returns an error if --pgdata is absent
+/// PG17 wal-summaries → delta map. Returns an error if local PGDATA is absent
 /// since the summaries live on the server's filesystem
-fn build_delta_map_from_summaries(
+pub(crate) fn build_delta_map_from_summaries(
     pgdata: Option<&std::path::Path>,
     timeline: u32,
     first_used_lsn: u64,
     first_not_used_lsn: u64,
 ) -> Result<crate::pg::backup::delta::PagedFileDeltaMap> {
-    let pgdata = pgdata.ok_or_else(|| anyhow!("--delta-from-wal-summaries requires --pgdata"))?;
+    let pgdata =
+        pgdata.ok_or_else(|| anyhow!("--delta-from-wal-summaries requires local PGDATA"))?;
     let map = crate::pg::wal_summaries::read_for_range(
         pgdata,
         timeline,
@@ -671,9 +767,9 @@ mod tests {
 
     #[test]
     fn delta_map_from_summaries_requires_pgdata() {
-        // Summaries live on the PG host filesystem; without --pgdata the map
+        // Summaries live on the PG host filesystem; without local PGDATA the map
         // can't be read, so the wrapper must bail before touching disk
         let err = build_delta_map_from_summaries(None, 1, 0x100, 0x200).unwrap_err();
-        assert!(format!("{err:#}").contains("--pgdata"), "{err:#}");
+        assert!(format!("{err:#}").contains("PGDATA"), "{err:#}");
     }
 }
diff --git a/src/pg/backup/tar_streamer.rs b/src/pg/backup/tar_streamer.rs
index 00093d9..9b3f857 100644
--- a/src/pg/backup/tar_streamer.rs
+++ b/src/pg/backup/tar_streamer.rs
@@ -10,22 +10,25 @@
 //! into its own part (wal-g matches this behavior; mirrors a real PG tar
 //! that occasionally carries multi-GB segment files)
 //!
-//! The streamer runs as `spawn_blocking` because `tar::Archive` /
-//! `tar::Builder` are sync. Async input is bridged via `SyncIoBridge`;
-//! per-part output flows over an mpsc of `Bytes` that the caller reads as
-//! an `AsyncRead` (see `ChannelReader`)
+//! The streamer runs as a `tokio::spawn` task over `astral-tokio-tar`'s async
+//! `Archive` / `Builder`; per-part output flows over an mpsc of `Bytes` that
+//! the caller reads as an `AsyncRead` (see `ChannelReader`)
 
 use std::collections::{BTreeSet, HashMap, HashSet};
-use std::io::{Read, Write};
+use std::pin::Pin;
 use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::task::{Context as TaskContext, Poll};
 
 use anyhow::{Context, Result, anyhow};
 use bytes::Bytes;
 use chrono::{DateTime, Utc};
-use tokio::io::AsyncRead;
+use futures::StreamExt;
+use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, ReadBuf};
 use tokio::sync::mpsc;
 use tokio::task::JoinHandle;
-use tokio_util::io::SyncIoBridge;
+use tokio_tar::{Archive, Builder, Header};
+use tokio_util::sync::PollSender;
 
 use crate::pg::backup::delta::{self as delta_mod, PG_PAGE_SIZE, PagedFileDeltaMap};
 use crate::pg::backup::increment::{
@@ -125,32 +128,29 @@ where
     R: AsyncRead + Send + Unpin + 'static,
 {
     let (parts_tx, parts_rx) = mpsc::channel::<Result<Part>>(opts.queue_depth.max(1));
-    let handle = tokio::task::spawn_blocking(move || -> Result<StreamerResult> {
-        let sync_input = SyncIoBridge::new(input);
-        run_blocking(sync_input, opts, parts_tx)
-    });
+    let handle = tokio::spawn(run_async(input, opts, parts_tx));
     (parts_rx, handle)
 }
 
-fn run_blocking<R: Read>(
+async fn run_async<R: AsyncRead + Send + Unpin + 'static>(
     input: R,
     opts: StreamerOpts,
     parts_tx: mpsc::Sender<Result<Part>>,
 ) -> Result<StreamerResult> {
-    let mut archive = tar::Archive::new(input);
+    let mut archive = Archive::new(input);
     let mut entries = archive.entries().context("open tar entries")?;
 
     let mut result = StreamerResult::default();
     let mut file_no = opts.starting_file_no;
-    let mut tee_builder: Option<tar::Builder<Vec<u8>>> = if opts.tee_names.is_empty() {
+    let mut tee_builder: Option<Builder<Vec<u8>>> = if opts.tee_names.is_empty() {
         None
     } else {
-        Some(tar::Builder::new(Vec::new()))
+        Some(Builder::new(Vec::new()))
     };
 
     let mut current: Option<PartCtx> = None;
 
-    for entry in entries.by_ref() {
+    while let Some(entry) = entries.next().await {
         let mut entry = entry.context("read tar entry")?;
         let header = entry.header().clone();
         let orig_path = entry
@@ -199,11 +199,11 @@ fn run_blocking<R: Read>(
             && ctx.bytes_written() > 0
             && ctx.bytes_written().saturating_add(out_body_size) > opts.max_tar_size
         {
-            finalize_part(current.take().unwrap())?;
+            finalize_part(current.take().unwrap()).await?;
         }
         if current.is_none() {
             file_no += 1;
-            current = Some(start_part(file_no, &parts_tx)?);
+            current = Some(start_part(file_no, &parts_tx).await?);
         }
         let ctx = current.as_mut().unwrap();
 
@@ -230,6 +230,7 @@ fn run_blocking<R: Read>(
                 let body = IncrementBodyReader::new(header_bytes, &mut entry, blocks, entry_size);
                 ctx.builder
                     .append_data(&mut new_hdr, &mapped, body)
+                    .await
                     .context("append increment to current part")?;
                 (true, false)
             }
@@ -237,18 +238,24 @@ fn run_blocking<R: Read>(
                 if tee_match {
                     // Tee path: buffer in memory (only used for small files like pg_control)
                     let mut buf = Vec::with_capacity(entry_size as usize);
-                    entry.read_to_end(&mut buf).context("read tee entry")?;
+                    entry
+                        .read_to_end(&mut buf)
+                        .await
+                        .context("read tee entry")?;
                     ctx.builder
-                        .append_data(&mut new_hdr, &mapped, std::io::Cursor::new(&buf))
+                        .append_data(&mut new_hdr, &mapped, &buf[..])
+                        .await
                         .context("append to current part")?;
                     if let Some(tb) = tee_builder.as_mut() {
                         let mut tee_hdr = header.clone();
-                        tb.append_data(&mut tee_hdr, &mapped, std::io::Cursor::new(&buf))
+                        tb.append_data(&mut tee_hdr, &mapped, &buf[..])
+                            .await
                             .context("append to tee tar")?;
                     }
                 } else {
                     ctx.builder
                         .append_data(&mut new_hdr, &mapped, &mut entry)
+                        .await
                         .context("append to current part")?;
                 }
                 (false, false)
@@ -274,10 +281,11 @@ fn run_blocking<R: Read>(
     }
 
     if let Some(ctx) = current.take() {
-        finalize_part(ctx)?;
+        finalize_part(ctx).await?;
     }
-    if let Some(tb) = tee_builder.take() {
-        let buf = tb.into_inner().context("finish tee tar")?;
+    if let Some(mut tb) = tee_builder.take() {
+        tb.finish().await.context("finish tee tar")?;
+        let buf = tb.into_inner().await.context("into_inner tee tar")?;
         if !buf.is_empty() {
             result.tee_bytes = Some(Bytes::from(buf));
         }
@@ -288,7 +296,7 @@ fn run_blocking<R: Read>(
 }
 
 /// Outcome of the delta-mode lookup for one entry
-enum DeltaClass {
+pub(crate) enum DeltaClass {
     /// Not a paged file (or no delta map): pass body through unchanged
     Passthrough,
     /// Paged file whose changed-block set intersects the file: emit increment
@@ -301,7 +309,11 @@ enum DeltaClass {
     Skip,
 }
 
-fn classify_for_delta(ctx: &Option<DeltaContext>, path: &str, entry_size: u64) -> DeltaClass {
+pub(crate) fn classify_for_delta(
+    ctx: &Option<DeltaContext>,
+    path: &str,
+    entry_size: u64,
+) -> DeltaClass {
     let Some(ctx) = ctx.as_ref() else {
         return DeltaClass::Passthrough;
     };
@@ -357,25 +369,41 @@ fn classify_for_delta(ctx: &Option<DeltaContext>, path: &str, entry_size: u64) -
     }
 }
 
+enum IncrementPhase {
+    Header,
+    /// load the next target page (skipping intervening pages first)
+    Load,
+    /// emit the page currently buffered in `page_buf`
+    Emit,
+    Done,
+}
+
-/// `Read` impl that emits a pre-encoded increment header followed by the
+/// `AsyncRead` impl that emits a pre-encoded increment header followed by the
 /// subset of input pages whose block numbers appear in `blocks`. Reads the
-/// input strictly forward — for each emitted page, skips intervening pages
-/// by `read_exact` into a scratch buffer
-struct IncrementBodyReader<'a, R: Read> {
+/// input strictly forward — pages before each target are read & discarded
+pub(crate) struct IncrementBodyReader<'a, R> {
     header: Vec<u8>,
     header_pos: usize,
     input: &'a mut R,
     blocks: Vec<u32>,
     next_idx: usize,
+    /// next block index still to be read off the input
     cur_block: u32,
-    page_buf: Vec<u8>,
-    page_pos: usize,
-    page_filled: bool,
-    _entry_size: u64,
+    page_buf: [u8; PG_PAGE_SIZE as usize],
+    /// bytes filled into `page_buf` while loading the current page
+    fill: usize,
+    /// emit cursor into `page_buf`
+    emit_pos: usize,
+    phase: IncrementPhase,
 }
 
-impl<'a, R: Read> IncrementBodyReader<'a, R> {
-    fn new(header: Vec<u8>, input: &'a mut R, blocks: Vec<u32>, entry_size: u64) -> Self {
+impl<'a, R: AsyncRead + Unpin> IncrementBodyReader<'a, R> {
+    pub(crate) fn new(
+        header: Vec<u8>,
+        input: &'a mut R,
+        blocks: Vec<u32>,
+        _entry_size: u64,
+    ) -> Self {
         Self {
             header,
             header_pos: 0,
@@ -383,54 +411,86 @@ impl<'a, R: Read> IncrementBodyReader<'a, R> {
             blocks,
             next_idx: 0,
             cur_block: 0,
-            page_buf: vec![0u8; PG_PAGE_SIZE as usize],
-            page_pos: 0,
-            page_filled: false,
-            _entry_size: entry_size,
+            page_buf: [0u8; PG_PAGE_SIZE as usize],
+            fill: 0,
+            emit_pos: 0,
+            phase: IncrementPhase::Header,
         }
     }
 }
 
-impl<'a, R: Read> Read for IncrementBodyReader<'a, R> {
-    fn read(&mut self, out: &mut [u8]) -> std::io::Result<usize> {
-        if out.is_empty() {
-            return Ok(0);
-        }
-        // Phase 1: emit header bytes
-        if self.header_pos < self.header.len() {
-            let n = (self.header.len() - self.header_pos).min(out.len());
-            out[..n].copy_from_slice(&self.header[self.header_pos..self.header_pos + n]);
-            self.header_pos += n;
-            return Ok(n);
-        }
-        // Phase 2: emit current page
-        if self.page_filled {
-            let blcksz = PG_PAGE_SIZE as usize;
-            let n = (blcksz - self.page_pos).min(out.len());
-            out[..n].copy_from_slice(&self.page_buf[self.page_pos..self.page_pos + n]);
-            self.page_pos += n;
-            if self.page_pos == blcksz {
-                self.page_filled = false;
-                self.next_idx += 1;
+impl<'a, R: AsyncRead + Unpin> AsyncRead for IncrementBodyReader<'a, R> {
+    fn poll_read(
+        self: Pin<&mut Self>,
+        cx: &mut TaskContext<'_>,
+        out: &mut ReadBuf<'_>,
+    ) -> Poll<std::io::Result<()>> {
+        let me = self.get_mut();
+        let page = PG_PAGE_SIZE as usize;
+        loop {
+            match me.phase {
+                IncrementPhase::Header => {
+                    if me.header_pos < me.header.len() {
+                        let n = (me.header.len() - me.header_pos).min(out.remaining());
+                        if n == 0 {
+                            return Poll::Ready(Ok(()));
+                        }
+                        out.put_slice(&me.header[me.header_pos..me.header_pos + n]);
+                        me.header_pos += n;
+                        return Poll::Ready(Ok(()));
+                    }
+                    me.phase = IncrementPhase::Load;
+                }
+                IncrementPhase::Load => {
+                    if me.next_idx >= me.blocks.len() {
+                        me.phase = IncrementPhase::Done;
+                        continue;
+                    }
+                    let target = me.blocks[me.next_idx];
+                    // fill page_buf with one full page from the input
+                    while me.fill < page {
+                        let mut rb = ReadBuf::new(&mut me.page_buf[me.fill..]);
+                        match Pin::new(&mut *me.input).poll_read(cx, &mut rb) {
+                            Poll::Pending => return Poll::Pending,
+                            Poll::Ready(Err(e)) => return Poll::Ready(Err(e)),
+                            Poll::Ready(Ok(())) => {
+                                let got = rb.filled().len();
+                                if got == 0 {
+                                    return Poll::Ready(Err(std::io::Error::new(
+                                        std::io::ErrorKind::UnexpectedEof,
+                                        "increment input ended mid-page",
+                                    )));
+                                }
+                                me.fill += got;
+                            }
+                        }
+                    }
+                    me.fill = 0;
+                    if me.cur_block < target {
+                        // intervening page: discard & advance
+                        me.cur_block += 1;
+                        continue;
+                    }
+                    me.emit_pos = 0;
+                    me.phase = IncrementPhase::Emit;
+                }
+                IncrementPhase::Emit => {
+                    let n = (page - me.emit_pos).min(out.remaining());
+                    if n == 0 {
+                        return Poll::Ready(Ok(()));
+                    }
+                    out.put_slice(&me.page_buf[me.emit_pos..me.emit_pos + n]);
+                    me.emit_pos += n;
+                    if me.emit_pos == page {
+                        me.next_idx += 1;
+                        me.cur_block += 1;
+                        me.phase = IncrementPhase::Load;
+                    }
+                    return Poll::Ready(Ok(()));
+                }
+                IncrementPhase::Done => return Poll::Ready(Ok(())),
             }
-            return Ok(n);
-        }
-        // Phase 3: load the next target page
-        if self.next_idx >= self.blocks.len() {
-            return Ok(0);
-        }
-        let target = self.blocks[self.next_idx];
-        // Skip pages before target by reading & discarding
-        while self.cur_block < target {
-            self.input.read_exact(&mut self.page_buf)?;
-            self.cur_block += 1;
         }
-        self.input.read_exact(&mut self.page_buf)?;
-        self.cur_block += 1;
-        self.page_filled = true;
-        self.page_pos = 0;
-        // Tail-recurse via loop semantics: the next read() call will pump out
-        Read::read(self, out)
     }
 }
 
@@ -441,45 +501,39 @@ fn _bind_increment(_: increment::IncrementHeader) {}
 
 struct PartCtx {
     file_no: u32,
-    builder: tar::Builder<CountingWriter<BlockingSender>>,
-    bytes_counter: std::sync::Arc<std::sync::atomic::AtomicU64>,
+    builder: Builder<PartWriter>,
+    bytes_counter: Arc<AtomicU64>,
 }
 
 impl PartCtx {
     fn bytes_written(&self) -> u64 {
-        self.bytes_counter
-            .load(std::sync::atomic::Ordering::Relaxed)
+        self.bytes_counter.load(Ordering::Relaxed)
     }
 }
 
-fn start_part(file_no: u32, parts_tx: &mpsc::Sender<Result<Part>>) -> Result<PartCtx> {
+async fn start_part(file_no: u32, parts_tx: &mpsc::Sender<Result<Part>>) -> Result<PartCtx> {
     let (byte_tx, byte_rx) = mpsc::channel::<std::io::Result<Bytes>>(4);
     let reader = ChannelReader::new(byte_rx);
     parts_tx
-        .blocking_send(Ok(Part { file_no, reader }))
+        .send(Ok(Part { file_no, reader }))
+        .await
         .map_err(|_| anyhow!("parts consumer dropped"))?;
-    let counter = std::sync::Arc::new(std::sync::atomic::AtomicU64::new(0));
-    let writer = CountingWriter {
-        inner: BlockingSender {
-            tx: byte_tx,
-            scratch: Vec::with_capacity(CHUNK_BYTES),
-        },
-        counter: counter.clone(),
-    };
+    let counter = Arc::new(AtomicU64::new(0));
+    let writer = PartWriter::new(byte_tx, counter.clone());
     Ok(PartCtx {
         file_no,
-        builder: tar::Builder::new(writer),
+        builder: Builder::new(writer),
         bytes_counter: counter,
     })
 }
 
-fn finalize_part(ctx: PartCtx) -> Result<()> {
-    // tar::Builder::into_inner writes the two trailing zero blocks then
-    // returns the inner writer
-    let writer = ctx.builder.into_inner().context("finish tar part")?;
-    let CountingWriter { mut inner, .. } = writer;
-    inner.flush().context("flush part")?;
-    drop(inner); // drop sender → ChannelReader sees EOF
+async fn finalize_part(ctx: PartCtx) -> Result<()> {
+    // finish writes the two trailing zero blocks; shutdown flushes the tail
+    // chunk, then dropping the writer closes the channel → ChannelReader EOF
+    let mut builder = ctx.builder;
+    builder.finish().await.context("finish tar part")?;
+    let mut writer = builder.into_inner().await.context("into_inner tar part")?;
+    writer.shutdown().await.context("flush part")?;
     Ok(())
 }
 
@@ -487,75 +541,85 @@ fn strip_dotslash(s: &str) -> &str {
     s.strip_prefix("./").unwrap_or(s)
 }
 
-fn header_mtime(h: &tar::Header) -> DateTime<Utc> {
+fn header_mtime(h: &Header) -> DateTime<Utc> {
     let secs = h.mtime().unwrap_or(0) as i64;
     DateTime::<Utc>::from_timestamp(secs, 0)
         .unwrap_or_else(|| DateTime::<Utc>::from_timestamp(0, 0).unwrap())
 }
 
-/// Sync writer that pushes its writes through a tokio mpsc as `Bytes`.
-/// `blocking_send` parks the writer thread when the channel is full —
-/// that's the backpressure
-struct BlockingSender {
-    tx: mpsc::Sender<std::io::Result<Bytes>>,
+fn broken_pipe() -> std::io::Error {
+    std::io::Error::new(std::io::ErrorKind::BrokenPipe, "part consumer dropped")
+}
+
+/// Async writer that pushes coalesced chunks through a tokio mpsc as `Bytes`.
+/// `PollSender::poll_reserve` parks the task when the channel is full — that's
+/// the backpressure. `counter` tracks total tar bytes for rotation budgeting
+pub(crate) struct PartWriter {
+    sink: PollSender<std::io::Result<Bytes>>,
     scratch: Vec<u8>,
+    counter: Arc<AtomicU64>,
 }
 
-impl Write for BlockingSender {
-    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
-        // Coalesce small writes into a single channel send to avoid per-512-byte
-        // tar block traffic across the channel
-        self.scratch.extend_from_slice(buf);
-        if self.scratch.len() >= CHUNK_BYTES {
-            self.flush_scratch()?;
+impl PartWriter {
+    pub(crate) fn new(tx: mpsc::Sender<std::io::Result<Bytes>>, counter: Arc<AtomicU64>) -> Self {
+        Self {
+            sink: PollSender::new(tx),
+            scratch: Vec::with_capacity(CHUNK_BYTES),
+            counter,
         }
-        Ok(buf.len())
-    }
-
-    fn flush(&mut self) -> std::io::Result<()> {
-        self.flush_scratch()
     }
-}
 
-impl BlockingSender {
-    fn flush_scratch(&mut self) -> std::io::Result<()> {
+    /// Send the buffered scratch as one `Bytes` chunk, swapping in a fresh
+    /// buffer (no memcpy, unlike `Bytes::copy_from_slice`). `Pending` while the
+    /// channel has no free slot; `BrokenPipe` if the reserve or send fails
+    fn flush_chunk(&mut self, cx: &mut TaskContext<'_>) -> Poll<std::io::Result<()>> {
         if self.scratch.is_empty() {
-            return Ok(());
+            return Poll::Ready(Ok(()));
+        }
+        match self.sink.poll_reserve(cx) {
+            Poll::Pending => return Poll::Pending,
+            Poll::Ready(Err(_)) => return Poll::Ready(Err(broken_pipe())),
+            Poll::Ready(Ok(())) => {}
         }
-        // Move the scratch Vec into a Bytes owner, swap in a fresh
-        // buffer for the next chunk. Avoids the per-CHUNK_BYTES memcpy
-        // that `Bytes::copy_from_slice` does
         let chunk = Bytes::from(std::mem::replace(
             &mut self.scratch,
             Vec::with_capacity(CHUNK_BYTES),
         ));
-        self.tx.blocking_send(Ok(chunk)).map_err(|_| {
-            std::io::Error::new(std::io::ErrorKind::BrokenPipe, "part consumer dropped")
-        })
+        self.sink.send_item(Ok(chunk)).map_err(|_| broken_pipe())?;
+        Poll::Ready(Ok(()))
     }
 }
 
-impl Drop for BlockingSender {
-    fn drop(&mut self) {
-        // Best-effort flush of any tail bytes before EOF
-        let _ = self.flush_scratch();
+impl AsyncWrite for PartWriter {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut TaskContext<'_>,
+        buf: &[u8],
+    ) -> Poll<std::io::Result<usize>> {
+        let me = self.get_mut();
+        // Once scratch has reached CHUNK_BYTES, flush it before buffering more.
+        // A re-poll after Pending re-enters here with `buf` still unbuffered
+        // (extend happens once, after the flush completes Ready)
+        if me.scratch.len() >= CHUNK_BYTES {
+            match me.flush_chunk(cx) {
+                Poll::Pending => return Poll::Pending,
+                Poll::Ready(Err(e)) => return Poll::Ready(Err(e)),
+                Poll::Ready(Ok(())) => {}
+            }
+        }
+        me.scratch.extend_from_slice(buf);
+        me.counter.fetch_add(buf.len() as u64, Ordering::Relaxed);
+        Poll::Ready(Ok(buf.len()))
     }
-}
 
-struct CountingWriter<W: Write> {
-    inner: W,
-    counter: std::sync::Arc<std::sync::atomic::AtomicU64>,
-}
-
-impl<W: Write> Write for CountingWriter<W> {
-    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
-        let n = self.inner.write(buf)?;
-        self.counter
-            .fetch_add(n as u64, std::sync::atomic::Ordering::Relaxed);
-        Ok(n)
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut TaskContext<'_>) -> Poll<std::io::Result<()>> {
+        self.get_mut().flush_chunk(cx)
     }
-    fn flush(&mut self) -> std::io::Result<()> {
-        self.inner.flush()
+
+    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut TaskContext<'_>) -> Poll<std::io::Result<()>> {
+        // Flush tail bytes; dropping the writer (and its PollSender) closes the
+        // channel so the ChannelReader sees EOF
+        self.get_mut().flush_chunk(cx)
     }
 }
 
@@ -567,6 +631,10 @@ pub fn tablespace_prefix(oid: u32) -> String {
 
 #[cfg(test)]
 mod tests {
+    // Test fixtures build & inspect archives with the sync `tar` crate; the
+    // `Read` import drives `read_to_end` on those sync entries
+    use std::io::Read as _;
+
     use super::*;
     use tokio::io::AsyncReadExt;
 
diff --git a/src/pg/replication/base_backup.rs b/src/pg/replication/base_backup.rs
index 9cc8525..24fa54a 100644
--- a/src/pg/replication/base_backup.rs
+++ b/src/pg/replication/base_backup.rs
@@ -9,11 +9,11 @@
 //! as `AsyncReader` for `Storage::put`
 
 use anyhow::{Context, Result, anyhow, bail};
-use bytes::Bytes;
+use bytes::{Buf, Bytes};
 use postgres_protocol::message::backend::Message;
 use std::pin::Pin;
 use std::task::{Context as TaskCtx, Poll};
-use tokio::io::AsyncRead;
+use tokio::io::{AsyncBufRead, AsyncRead};
 use tokio::sync::mpsc;
 
 use crate::pg::backup::parse_pg_lsn;
@@ -33,6 +33,22 @@ pub struct BaseBackupOpts {
     pub wal: bool,
 }
 
+/// PG `BASE_BACKUP ... MAX_RATE` accepts kB/s within these bounds
+/// (src/include/backup/basebackup.h). Out-of-range is a protocol error, not a
+/// clamp, so callers must pre-clamp
+const MAX_RATE_LOWER_KIB: i64 = 32;
+const MAX_RATE_UPPER_KIB: i64 = 1_048_576;
+
+/// Convert a bytes/sec budget (WALG_DISK_RATE_LIMIT) into a `MAX_RATE` argument in
+/// kB/s, clamped to `MAX_RATE_LOWER_KIB..=MAX_RATE_UPPER_KIB`. `None` when unset (0)
+pub fn max_rate_kib_from_bytes(bytes_per_sec: u64) -> Option<i32> {
+    if bytes_per_sec == 0 {
+        return None;
+    }
+    let kib = (bytes_per_sec / 1024) as i64;
+    Some(kib.clamp(MAX_RATE_LOWER_KIB, MAX_RATE_UPPER_KIB) as i32)
+}
+
 #[derive(Debug, Clone)]
 pub struct ArchiveMeta {
     pub name: String,
@@ -138,6 +154,34 @@ impl AsyncRead for ChannelReader {
     }
 }
 
+/// Feed leftover `Bytes` slice directly, skipping a per-read memcpy
+impl AsyncBufRead for ChannelReader {
+    fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut TaskCtx<'_>) -> Poll<std::io::Result<&[u8]>> {
+        let this = self.get_mut();
+        // Same empty-payload guard as poll_read: an empty CopyData frame must
+        // not be reported as EOF (empty slice ≡ EOF for AsyncBufRead callers)
+        while this.leftover.is_empty() {
+            if this.closed {
+                return Poll::Ready(Ok(&[]));
+            }
+            match this.rx.poll_recv(cx) {
+                Poll::Pending => return Poll::Pending,
+                Poll::Ready(None) => {
+                    this.closed = true;
+                    return Poll::Ready(Ok(&[]));
+                }
+                Poll::Ready(Some(Err(e))) => return Poll::Ready(Err(e)),
+                Poll::Ready(Some(Ok(b))) => this.leftover = b,
+            }
+        }
+        Poll::Ready(Ok(&this.leftover))
+    }
+
+    fn consume(self: Pin<&mut Self>, amt: usize) {
+        self.get_mut().leftover.advance(amt);
+    }
+}
+
 /// Drive a BASE_BACKUP session, emitting events on `events`.
 /// Returns when the session is fully drained or an error occurs
 pub async fn run_base_backup(
@@ -486,6 +530,7 @@ async fn expect_ready_for_query(conn: &mut ReplicationConn) -> Result<()> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use tokio::io::AsyncBufReadExt as _BufReadExt;
     use tokio::io::AsyncReadExt as _ReadExt;
 
     /// Regression: PG can send empty CopyData frames (eg sparse-file padding
@@ -511,6 +556,33 @@ mod tests {
         assert_eq!(&out, b"hello world!");
     }
 
+    /// Same empty-payload guard, but on the AsyncBufRead path the codec uses.
+    /// Drives partial `consume` so leftover tail survives across fill_buf.
+    #[tokio::test]
+    async fn channel_reader_bufread_skips_empty_payloads() {
+        let (tx, rx) = mpsc::channel::<std::io::Result<Bytes>>(16);
+        let mut reader = ChannelReader::new(rx);
+
+        tokio::spawn(async move {
+            tx.send(Ok(Bytes::from_static(b"hello "))).await.unwrap();
+            tx.send(Ok(Bytes::new())).await.unwrap();
+            tx.send(Ok(Bytes::from_static(b"world"))).await.unwrap();
+            tx.send(Ok(Bytes::from_static(b"!"))).await.unwrap();
+        });
+
+        let mut out = Vec::new();
+        loop {
+            let chunk = reader.fill_buf().await.unwrap();
+            if chunk.is_empty() {
+                break;
+            }
+            // consume one byte at a time to exercise the leftover tail
+            out.push(chunk[0]);
+            reader.consume(1);
+        }
+        assert_eq!(&out, b"hello world!");
+    }
+
     #[test]
     fn parses_archive_header_data_dir() {
         // base.tar\0\0
@@ -655,6 +727,23 @@ mod tests {
         assert_eq!(quote_pg_str("it's"), "'it''s'");
     }
 
+    #[test]
+    fn max_rate_kib_conversion() {
+        // unset → no MAX_RATE
+        assert_eq!(max_rate_kib_from_bytes(0), None);
+        // 8 MiB/s → 8192 kB/s (wal-g divides bytes by 1024)
+        assert_eq!(max_rate_kib_from_bytes(8 * 1024 * 1024), Some(8192));
+        // below PG's 32 kB/s floor clamps up rather than degrading to unlimited
+        assert_eq!(max_rate_kib_from_bytes(1), Some(32));
+        assert_eq!(max_rate_kib_from_bytes(31 * 1024), Some(32));
+        assert_eq!(max_rate_kib_from_bytes(32 * 1024), Some(32));
+        // above PG's 1 GiB/s ceiling clamps down (effectively unlimited anyway)
+        assert_eq!(
+            max_rate_kib_from_bytes(2 * 1024 * 1024 * 1024),
+            Some(1_048_576)
+        );
+    }
+
     use bytes::{BufMut, BytesMut};
     use std::time::Duration;
     use tokio::io::AsyncWriteExt;
diff --git a/src/pg/walparser/state.rs b/src/pg/walparser/state.rs
index 5b0fe98..fd70795 100644
--- a/src/pg/walparser/state.rs
+++ b/src/pg/walparser/state.rs
@@ -298,7 +298,7 @@ pub fn extract_locations_from_wal_file<R: Read>(
     mut r: R,
 ) -> Result<Vec<BlockLocation>, ExtractError> {
     let mut out = Vec::new();
-    let mut page_buf = vec![0u8; WAL_PAGE_SIZE as usize];
+    let mut page_buf = [0u8; WAL_PAGE_SIZE as usize];
     loop {
         match read_exact_or_eof(&mut r, &mut page_buf)? {
             ReadStatus::Eof => return Ok(out),
diff --git a/src/storage/s3.rs b/src/storage/s3.rs
index f6a44cf..03ebf62 100644
--- a/src/storage/s3.rs
+++ b/src/storage/s3.rs
@@ -8,6 +8,7 @@
 //! EC2 metadata service (see [`super::creds`])
 
 use std::io::Cursor;
+use std::sync::Arc;
 use std::time::{Duration, SystemTime};
 
 use async_trait::async_trait;
@@ -20,6 +21,8 @@ use quick_xml::Reader;
 use quick_xml::events::Event;
 use reqwest::Client;
 use tokio::io::AsyncReadExt;
+use tokio::sync::Semaphore;
+use tokio::task::JoinSet;
 use tokio_util::io::StreamReader;
 use url::Url;
 
@@ -30,6 +33,13 @@ use crate::retry::{RetryPolicy, with_retry};
 const MULTIPART_THRESHOLD: u64 = 32 * 1024 * 1024;
 const PART_SIZE: usize = 8 * 1024 * 1024;
 
+/// Parts kept in flight across every concurrent multipart upload, capped by a
+/// per-backend permit pool. Shared so a single stream and an N-way fan-out
+/// converge on the same aggregate in-flight budget: deep enough to hide per-part
+/// network RTT behind compression, bounded so resident part buffers stay at
+/// MAX_INFLIGHT_PARTS × PART_SIZE regardless of stream count
+const MAX_INFLIGHT_PARTS: usize = 8;
+
 /// Path component encoding per SigV4 spec
 /// Same set as URL path-segment, but '/' kept literal
 const PATH_ENCODE: &AsciiSet = &NON_ALPHANUMERIC
@@ -60,6 +70,8 @@ pub struct S3Storage {
     client: Client,
     base: String,
     retry_policy: RetryPolicy,
+    /// In-flight part budget shared across all concurrent multipart uploads
+    part_permits: Arc<Semaphore>,
 }
 
 impl S3Storage {
@@ -79,6 +91,7 @@ impl S3Storage {
             client,
             base,
             retry_policy,
+            part_permits: Arc::new(Semaphore::new(MAX_INFLIGHT_PARTS)),
         })
     }
 
@@ -86,6 +99,19 @@ impl S3Storage {
         super::join_prefix_key(&self.cfg.prefix, key)
     }
 
+    /// Cheap clone of the request context (shared `Client`, copied config, same
+    /// permit pool) so a per-part PUT can run as its own `'static` task instead
+    /// of borrowing `&self`
+    fn worker(&self) -> S3Storage {
+        S3Storage {
+            cfg: self.cfg.clone(),
+            client: self.client.clone(),
+            base: self.base.clone(),
+            retry_policy: self.retry_policy,
+            part_permits: self.part_permits.clone(),
+        }
+    }
+
     /// Server-side copy identity: same endpoint/region + same credential.
     /// Conservative: AWS allows cross-region CopyObject, but mismatched
     /// region ids fall back to stream-through rather than risk custom
@@ -188,6 +214,42 @@ impl S3Storage {
         .await
     }
 
+    /// PUT one already-buffered part, retrying transients in place (the buffer
+    /// is owned so the body replays without re-reading source). Returns the
+    /// part's ETag for the completion manifest
+    async fn put_one_part(
+        &self,
+        key_full: &str,
+        part_no: u32,
+        upload_id: &str,
+        chunk: Bytes,
+    ) -> Result<String> {
+        let part_no_str = part_no.to_string();
+        with_retry(&self.retry_policy, StorageError::is_transient, || async {
+            let resp = self
+                .signed_request(
+                    "PUT",
+                    key_full,
+                    &[
+                        ("partNumber", part_no_str.as_str()),
+                        ("uploadId", upload_id),
+                    ],
+                    chunk.clone(),
+                    &[],
+                )
+                .await?;
+            let resp = check_status(resp).await?;
+            let etag = resp
+                .headers()
+                .get("etag")
+                .and_then(|v| v.to_str().ok())
+                .ok_or_else(|| StorageError::InvalidResponse("missing ETag".into()))?
+                .to_string();
+            Ok::<String, StorageError>(etag)
+        })
+        .await
+    }
+
     async fn put_multipart(&self, key: &str, mut body: AsyncReader) -> Result<()> {
         // initiate
         let init_resp = self
@@ -205,14 +267,29 @@ impl S3Storage {
             StorageError::InvalidResponse("missing UploadId in CreateMultipartUpload".into())
         })?;
 
-        let mut parts: Vec<(u32, String)> = Vec::new();
+        // Pipeline parts: fill one PART_SIZE buffer (reads stay sequential, so
+        // byte boundaries match the serial path), spawn its PUT under a shared
+        // in-flight permit, then read the next part while prior PUTs run. The
+        // permit pool overlaps compression with several concurrent PUTs and
+        // bounds aggregate in-flight parts across streams. PUTs finish out of
+        // order, so collected ETags are sorted by partNumber before completion
+        let ctx = Arc::new(self.worker());
+        let key_full = Arc::new(self.full_key(key));
+        let upload_id = Arc::new(upload_id);
+        let mut join: JoinSet<Result<(u32, String)>> = JoinSet::new();
         let mut part_no: u32 = 0;
+        let mut read_result: Result<()> = Ok(());
 
-        loop {
+        'read: loop {
             let mut buf = BytesMut::with_capacity(PART_SIZE);
             while buf.len() < PART_SIZE {
-                if body.read_buf(&mut buf).await? == 0 {
-                    break;
+                match body.read_buf(&mut buf).await {
+                    Ok(0) => break,
+                    Ok(_) => {}
+                    Err(e) => {
+                        read_result = Err(e.into());
+                        break 'read;
+                    }
                 }
             }
             let filled = buf.len();
@@ -220,76 +297,80 @@ impl S3Storage {
                 break;
             }
             part_no += 1;
-            let part_no_str = part_no.to_string();
             let chunk = buf.freeze();
 
-            // Per-part retry: chunk is already buffered, so transient failures
-            // (5xx, transport) replay the same body without re-reading source
-            let key_full = self.full_key(key);
-            let result = with_retry(&self.retry_policy, StorageError::is_transient, || async {
-                let resp = self
-                    .signed_request(
-                        "PUT",
-                        &key_full,
-                        &[
-                            ("partNumber", part_no_str.as_str()),
-                            ("uploadId", upload_id.as_str()),
-                        ],
-                        chunk.clone(),
-                        &[],
-                    )
-                    .await?;
-                let resp = check_status(resp).await?;
-                let etag = resp
-                    .headers()
-                    .get("etag")
-                    .and_then(|v| v.to_str().ok())
-                    .ok_or_else(|| StorageError::InvalidResponse("missing ETag".into()))?
-                    .to_string();
-                Ok::<String, StorageError>(etag)
-            })
-            .await;
-
-            let etag = match result {
-                Ok(e) => e,
-                Err(e) => {
-                    let _ = self.abort_multipart(key, &upload_id).await;
-                    return Err(e);
+            // Acquire before spawning so reading backpressures once the budget
+            // is exhausted, capping resident part buffers
+            let permit = match self.part_permits.clone().acquire_owned().await {
+                Ok(p) => p,
+                Err(_) => {
+                    read_result = Err(StorageError::Config("part permit pool closed".into()));
+                    break;
                 }
             };
-            parts.push((part_no, etag));
+            let ctx = ctx.clone();
+            let key_full = key_full.clone();
+            let upload_id = upload_id.clone();
+            join.spawn(async move {
+                let _permit = permit;
+                let etag = ctx
+                    .put_one_part(&key_full, part_no, &upload_id, chunk)
+                    .await?;
+                Ok((part_no, etag))
+            });
 
             if filled < PART_SIZE {
                 break;
             }
         }
 
+        // Empty body: the read loop spawned nothing, and S3 rejects a
+        // CompleteMultipartUpload manifest with no parts. Upload a single
+        // zero-length part 1, as the serial path did. It owns no buffer, so it
+        // skips the permit pool
+        if part_no == 0 && read_result.is_ok() {
+            part_no = 1;
+            let ctx = ctx.clone();
+            let key_full = key_full.clone();
+            let upload_id = upload_id.clone();
+            join.spawn(async move {
+                let etag = ctx
+                    .put_one_part(&key_full, part_no, &upload_id, Bytes::new())
+                    .await?;
+                Ok((part_no, etag))
+            });
+        }
+
-        if parts.is_empty() {
-            // empty body, send a single empty part
-            part_no += 1;
-            let resp = self
-                .signed_request(
-                    "PUT",
-                    &self.full_key(key),
-                    &[
-                        ("partNumber", part_no.to_string().as_str()),
-                        ("uploadId", upload_id.as_str()),
-                    ],
-                    Bytes::new(),
-                    &[],
-                )
-                .await?;
-            let resp = check_status(resp).await?;
-            let etag = resp
-                .headers()
-                .get("etag")
-                .and_then(|v| v.to_str().ok())
-                .unwrap_or("\"d41d8cd98f00b204e9800998ecf8427e\"")
-                .to_string();
-            parts.push((part_no, etag));
+        // Drain finished PUTs; on the first failure (read error or part PUT)
+        // stop collecting, cancel the siblings still in flight, then abort the
+        // whole upload so no parts outlive the aborted multipart
+        let mut parts: Vec<(u32, String)> = Vec::with_capacity(part_no as usize);
+        let mut first_err = read_result.err();
+        if first_err.is_none() {
+            while let Some(joined) = join.join_next().await {
+                match joined {
+                    Ok(Ok(pe)) => parts.push(pe),
+                    Ok(Err(e)) => {
+                        first_err = Some(e);
+                        break;
+                    }
+                    Err(je) => {
+                        first_err = Some(StorageError::Transport(format!(
+                            "multipart part task: {je}"
+                        )));
+                        break;
+                    }
+                }
+            }
+        }
+        if let Some(e) = first_err {
+            join.shutdown().await;
+            let _ = self.abort_multipart(key, &upload_id).await;
+            return Err(e);
         }
 
-        // complete
+        // CompleteMultipartUpload lists every part sorted by partNumber
+        parts.sort_by_key(|(n, _)| *n);
         let mut xml = String::from("<CompleteMultipartUpload>");
         for (n, etag) in &parts {
             xml.push_str(&format!(
@@ -390,43 +454,50 @@ impl Storage for S3Storage {
         let client = self.client.clone();
         let base = self.base.clone();
         let retry_policy = self.retry_policy;
+        let part_permits = self.part_permits.clone();
 
         let s = stream::unfold(
             (Some(String::new()), full_prefix, cfg, client, base),
-            move |(token, prefix, cfg, client, base)| async move {
-                let token = token?;
-                let s = S3Storage {
-                    cfg: cfg.clone(),
-                    client: client.clone(),
-                    base: base.clone(),
-                    retry_policy,
-                };
-                let q: [(&str, &str); _] = [
-                    ("list-type", "2"),
-                    ("prefix", prefix.as_str()),
-                    ("continuation-token", token.as_str()),
-                ];
-                let q = if token.is_empty() { &q[..2] } else { &q[..] };
-                let resp = match s.signed_request("GET", "", q, Bytes::new(), &[]).await {
-                    Ok(r) => r,
-                    Err(e) => return Some((Err(e), (None, prefix, cfg, client, base))),
-                };
-                let resp = match check_status(resp).await {
-                    Ok(r) => r,
-                    Err(e) => return Some((Err(e), (None, prefix, cfg, client, base))),
-                };
-                let body = match resp.text().await {
-                    Ok(b) => b,
-                    Err(e) => {
-                        return Some((Err(e.into()), (None, prefix, cfg, client, base)));
-                    }
-                };
-                match parse_list_v2(&body, &cfg.prefix) {
-                    Ok((objects, next)) => {
-                        let next_state = (next, prefix, cfg, client, base);
-                        Some((Ok(objects), next_state))
+            move |(token, prefix, cfg, client, base)| {
+                // list only issues GETs; the reconstructed handle shares the
+                // real permit pool but never reaches the multipart path
+                let part_permits = part_permits.clone();
+                async move {
+                    let token = token?;
+                    let s = S3Storage {
+                        cfg: cfg.clone(),
+                        client: client.clone(),
+                        base: base.clone(),
+                        retry_policy,
+                        part_permits,
+                    };
+                    let q: [(&str, &str); _] = [
+                        ("list-type", "2"),
+                        ("prefix", prefix.as_str()),
+                        ("continuation-token", token.as_str()),
+                    ];
+                    let q = if token.is_empty() { &q[..2] } else { &q[..] };
+                    let resp = match s.signed_request("GET", "", q, Bytes::new(), &[]).await {
+                        Ok(r) => r,
+                        Err(e) => return Some((Err(e), (None, prefix, cfg, client, base))),
+                    };
+                    let resp = match check_status(resp).await {
+                        Ok(r) => r,
+                        Err(e) => return Some((Err(e), (None, prefix, cfg, client, base))),
+                    };
+                    let body = match resp.text().await {
+                        Ok(b) => b,
+                        Err(e) => {
+                            return Some((Err(e.into()), (None, prefix, cfg, client, base)));
+                        }
+                    };
+                    match parse_list_v2(&body, &cfg.prefix) {
+                        Ok((objects, next)) => {
+                            let next_state = (next, prefix, cfg, client, base);
+                            Some((Ok(objects), next_state))
+                        }
+                        Err(e) => Some((Err(e), (None, prefix, cfg, client, base))),
                     }
-                    Err(e) => Some((Err(e), (None, prefix, cfg, client, base))),
                 }
             },
         )
@@ -1036,12 +1107,11 @@ mod tests {
                     if key.contains("boom") && part >= 2 {
                         return Resp::new(503).body(b"<Error/>".to_vec());
                     }
-                    let id = req.query("uploadId").unwrap().to_string();
-                    u.lock()
-                        .unwrap()
-                        .entry(id)
-                        .or_default()
-                        .insert(part, req.body.clone());
+                    // late parts of an aborted upload must not resurrect it
+                    let id = req.query("uploadId").unwrap();
+                    if let Some(parts) = u.lock().unwrap().get_mut(id) {
+                        parts.insert(part, req.body.clone());
+                    }
                     Resp::new(200).header("etag", &format!("\"etag-{part}\""))
                 }
                 "PUT" if req.headers.contains_key("x-amz-copy-source") => {
@@ -1202,4 +1272,105 @@ mod tests {
         assert!(matches!(err, Err(StorageError::Http { status: 503, .. })));
         assert!(uploads.lock().unwrap().is_empty(), "abort must clean up");
     }
+
+    /// Pipelined multipart keeps several part PUTs in flight, so they finish
+    /// out of order; CompleteMultipartUpload must still list every part
+    /// ascending by partNumber with its matching ETag (S3 rejects unsorted
+    /// manifests). Force part 1 to land last via a one-shot transient — its
+    /// retry backoff outlasts the other parts' PUTs — then assert the captured
+    /// completion XML is sorted and the bytes survive the reorder.
+    #[tokio::test]
+    async fn multipart_completion_orders_parts_by_number() {
+        use crate::storage::test_http::{Req, Resp, payload, read_all, reader, serve};
+        use std::collections::BTreeMap;
+        use std::sync::atomic::{AtomicBool, Ordering};
+        use std::sync::{Arc, Mutex};
+
+        let parts: Arc<Mutex<BTreeMap<u32, Vec<u8>>>> = Arc::new(Mutex::new(BTreeMap::new()));
+        let complete_xml: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
+        let part1_failed = Arc::new(AtomicBool::new(false));
+
+        let (p, cx, pf) = (parts.clone(), complete_xml.clone(), part1_failed.clone());
+        let base = serve(move |req: &Req| match req.method.as_str() {
+            "POST" if req.has_query("uploads") => Resp::new(200).body(
+                b"<InitiateMultipartUploadResult><UploadId>u1</UploadId></InitiateMultipartUploadResult>"
+                    .to_vec(),
+            ),
+            "PUT" if req.has_query("partNumber") => {
+                let part: u32 = req.query("partNumber").unwrap().parse().unwrap();
+                // fail part 1's first attempt once; its retry backoff lands it
+                // last in completion order, forcing the sort to do real work
+                if part == 1 && !pf.swap(true, Ordering::SeqCst) {
+                    return Resp::new(503).body(b"<Error/>".to_vec());
+                }
+                p.lock().unwrap().insert(part, req.body.clone());
+                Resp::new(200).header("etag", &format!("\"e{part}\""))
+            }
+            "POST" if req.has_query("uploadId") => {
+                *cx.lock().unwrap() = Some(String::from_utf8_lossy(&req.body).into_owned());
+                Resp::new(200).body(b"<CompleteMultipartUploadResult/>".to_vec())
+            }
+            "GET" => {
+                let buf: Vec<u8> = p.lock().unwrap().values().flatten().copied().collect();
+                Resp::new(200).body(buf)
+            }
+            _ => Resp::new(400),
+        })
+        .await;
+
+        let cfg = S3Config {
+            bucket: "bkt".into(),
+            prefix: "p".into(),
+            region: "us-east-1".into(),
+            creds: CredentialSource::Static(Credentials {
+                access_key: "AKID".into(),
+                secret_key: "sek".into(),
+                session_token: None,
+                expires_at: None,
+            }),
+            endpoint: Some(base),
+            force_path_style: true,
+        };
+        // base_delay long enough that part 1's retry should complete after the
+        // loopback PUTs of parts 2..=5 (the assertions hold in either order)
+        let policy = RetryPolicy {
+            max_attempts: 3,
+            base_delay: Duration::from_millis(80),
+            max_delay: Duration::from_millis(80),
+            jitter: false,
+        };
+        let s = S3Storage::with_retry_policy(cfg, policy).unwrap();
+
+        let big = payload(33 * 1024 * 1024); // 5 parts: 8,8,8,8,1 MiB
+        s.put("big.zst", reader(&big), Some(big.len() as u64))
+            .await
+            .unwrap();
+
+        let xml = complete_xml
+            .lock()
+            .unwrap()
+            .clone()
+            .expect("completion sent");
+        let nums: Vec<u32> = xml
+            .split("<PartNumber>")
+            .skip(1)
+            .filter_map(|s| s.split("</PartNumber>").next())
+            .filter_map(|n| n.parse().ok())
+            .collect();
+        assert_eq!(
+            nums,
+            vec![1, 2, 3, 4, 5],
+            "parts must be sorted by partNumber: {xml}"
+        );
+        for n in 1..=5u32 {
+            assert!(
+                xml.contains(&format!(
+                    "<PartNumber>{n}</PartNumber><ETag>\"e{n}\"</ETag>"
+                )),
+                "part {n} etag mapping wrong in {xml}"
+            );
+        }
+        // bytes survive the out-of-order pipeline
+        assert_eq!(read_all(s.get("big.zst").await.unwrap()).await, big);
+    }
 }
diff --git a/tests/backup_roundtrip.rs b/tests/backup_roundtrip.rs
index dcd3ddc..89d1686 100644
--- a/tests/backup_roundtrip.rs
+++ b/tests/backup_roundtrip.rs
@@ -349,6 +349,86 @@ async fn fetch_recreates_tablespace_symlinks() {
     assert_eq!(std::fs::read(target.join("PG_VERSION")).unwrap(), b"16");
 }
 
+/// A part carrying its own `pg_tblspc/<oid>` symlink entry must not override
+/// the sentinel-restored link: the sentinel target (which honors
+/// --tablespace-mapping) is authoritative, and recreating the link mid-restore
+/// would race the concurrent data fan-out. Regression for the archived link
+/// target clobbering a mapped relocation
+#[cfg(unix)]
+#[tokio::test]
+async fn fetch_ignores_archived_tablespace_symlink_entry() {
+    let dir = tempfile::tempdir().unwrap();
+    let storage_dir = dir.path().join("storage");
+    let restore = dir.path().join("restore");
+    let sentinel_target = dir.path().join("ts_target");
+    let archived_target = dir.path().join("archived_ts");
+    let store = Arc::new(FsStorage::new(&storage_dir).unwrap());
+
+    let backup_name = format_backup_name(1, 0x0300_0000, 16 * 1024 * 1024);
+
+    let mut spec = TablespaceSpec::new(restore.to_string_lossy());
+    spec.add(16384, sentinel_target.to_string_lossy());
+    let mut sentinel = make_sentinel_v2(restore.to_str().unwrap());
+    sentinel.sentinel.tablespace_spec = Some(spec);
+    put_bytes(
+        store.clone(),
+        &sentinel_key(&backup_name),
+        serde_json::to_vec(&sentinel).unwrap(),
+    )
+    .await;
+
+    // Part has BOTH an archived symlink entry pointing at the backup-time
+    // location AND the file beneath it. The symlink entry must be ignored
+    let tar_bytes = {
+        let mut buf: Vec<u8> = Vec::new();
+        {
+            let mut b = tar::Builder::new(&mut buf);
+            let mut link = tar::Header::new_gnu();
+            link.set_entry_type(tar::EntryType::Symlink);
+            link.set_size(0);
+            link.set_mode(0o777);
+            link.set_link_name(&archived_target).unwrap();
+            link.set_path("pg_tblspc/16384").unwrap();
+            link.set_cksum();
+            b.append(&link, std::io::empty()).unwrap();
+
+            let mut file = tar::Header::new_gnu();
+            file.set_size(2);
+            file.set_mode(0o644);
+            file.set_path("pg_tblspc/16384/PG_VERSION").unwrap();
+            file.set_cksum();
+            b.append(&file, &b"16"[..]).unwrap();
+            b.finish().unwrap();
+        }
+        buf
+    };
+    put_bytes(store.clone(), &tar_part_key(&backup_name, 1, ""), tar_bytes).await;
+
+    fetch_mod::handle(
+        &test_settings(),
+        store as Arc<dyn Storage>,
+        &backup_name,
+        &restore,
+    )
+    .await
+    .unwrap();
+
+    let link = restore.join("pg_tblspc/16384");
+    let md = std::fs::symlink_metadata(&link).unwrap();
+    assert!(md.file_type().is_symlink(), "expected symlink at {link:?}");
+    // Link must still point at the sentinel target, not the archived one
+    assert_eq!(std::fs::read_link(&link).unwrap(), sentinel_target);
+    assert!(
+        !archived_target.exists(),
+        "archived target must never be materialized"
+    );
+    // File lands through the sentinel link
+    assert_eq!(
+        std::fs::read(sentinel_target.join("PG_VERSION")).unwrap(),
+        b"16"
+    );
+}
+
 #[tokio::test]
 async fn show_round_trip_and_mark_flips_permanent() {
     use walrus::pg::backup::show as show_mod;
diff --git a/tests/vm_live.rs b/tests/vm_live.rs
index de6726d..be62811 100644
--- a/tests/vm_live.rs
+++ b/tests/vm_live.rs
@@ -876,7 +876,7 @@ async fn wal_summaries_parse_real_pg_files() {
 }
 
 /// End-to-end `--delta-from-wal-summaries`: the `summarize_wal=off` and
-/// missing-`--pgdata` preconditions must abort, and the success path must
+/// missing local PGDATA preconditions must abort, and the success path must
 /// reconstruct byte-for-byte against a non-delta backup of the same state.
 #[tokio::test]
 async fn delta_from_summaries_against_live_pg() {
@@ -952,7 +952,7 @@ async fn delta_from_summaries_against_live_pg() {
     let mut s_delta = s.clone();
     s_delta.delta.max_steps = 1;
 
-    // ── precondition bail: summaries live on the host fs, so --pgdata is
+    // ── precondition bail: summaries live on the host fs, so local PGDATA is
     //    required once a delta parent is in play ──
     let pgdata_err = backup::push::handle(
         &s_delta,
@@ -964,9 +964,9 @@ async fn delta_from_summaries_against_live_pg() {
         },
     )
     .await
-    .expect_err("--delta-from-wal-summaries without --pgdata must abort");
+    .expect_err("--delta-from-wal-summaries without local PGDATA must abort");
     assert!(
-        format!("{pgdata_err:#}").contains("--pgdata"),
+        format!("{pgdata_err:#}").contains("PGDATA"),
         "{pgdata_err:#}"
     );