Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
190 commits
Select commit Hold shift + click to select a range
787b671
docs(cas): Phase 1 + Phase 2 implementation plans
filimonov May 7, 2026
779e200
refactor(cas): move checksumstxt parser to pkg/checksumstxt
filimonov May 7, 2026
1a67bdc
test(checksumstxt): real ClickHouse fixtures (v4 wide/compact/project…
filimonov May 7, 2026
40fb763
docs(cas): commit design specification for content-addressable backup…
filimonov May 7, 2026
f622f1b
feat(cas): add CASBackupParams and BackupMetadata.CAS field
filimonov May 7, 2026
09e996b
feat(cas): Backend interface, storage adapter, and in-memory test fake
filimonov May 7, 2026
4bbb31a
feat(cas): blob path derivation and bucket layout helpers
filimonov May 7, 2026
ff40ae8
docs(cas): document why disk name is not TablePathEncode'd
filimonov May 7, 2026
c2ddeb8
feat(cas): config schema and validation
filimonov May 7, 2026
ac734fb
feat(cas): reject root_prefix containing '..' or starting with '/'
filimonov May 7, 2026
36d097a
feat(cas): inprogress and prune marker primitives
filimonov May 7, 2026
9b2f378
feat(cas): parallel cold-list of blob existence set
filimonov May 7, 2026
1a9d127
feat(cas): tar.zstd archive with path-traversal containment
filimonov May 7, 2026
dd8eb09
feat(cas): ValidateBackup precondition for every CAS command
filimonov May 7, 2026
4de4ca6
feat(cas): reject dot-only backup names ('.', '..', '...')
filimonov May 7, 2026
84dec21
feat(cas): object-disk table pre-flight detection
filimonov May 7, 2026
48b3747
feat(cas): cas-upload orchestrator (§6.4)
filimonov May 7, 2026
b212d3f
feat(cas): cas-download materializes v1-shaped local backup (§6.5)
filimonov May 7, 2026
d121dfc
feat(cas): cas-restore = cas-download + v1 restore handoff (§6.5)
filimonov May 7, 2026
e22646c
feat(cas): cas-delete with §6.6 ordering
filimonov May 7, 2026
9cab436
feat(cas): cas-verify HEAD + size check (§6.8)
filimonov May 7, 2026
a8cc945
feat(cas): cas-status bucket-health summary
filimonov May 7, 2026
ab64088
feat(cas): wire cas-* CLI commands
filimonov May 7, 2026
607d7d7
feat(cas): pidlock for cas-delete to match v1 delete
filimonov May 7, 2026
2b0cf2d
docs(cas): README section and cross-link from upload --help
filimonov May 7, 2026
b59ce42
test(cas): integration roundtrip and cross-mode guards (§10.4 Phase 1)
filimonov May 7, 2026
c25514e
feat(cas): exclude CAS prefix from v1 list/retention; cross-mode list…
filimonov May 7, 2026
73975f8
fix(cas): final-review fixes — restore handoff, upload stats, end-use…
filimonov May 7, 2026
ad661f2
docs(cas): follow-up plan covering correctness gaps + consistency rev…
filimonov May 7, 2026
06be8bf
fix(cas): merge Query/UUID/Size/TotalBytes from local v1 metadata int…
filimonov May 7, 2026
ea790ed
test(cas): mutation-dedup integration test (the headline value-prop)
filimonov May 7, 2026
d9d4577
fix(cas): cas-verify distinguishes stat-error from missing-blob
filimonov May 7, 2026
ff105e8
fix(cas): replace '???' sentinel with '(unknown)' in list remote (Fin…
filimonov May 7, 2026
99e4017
fix(cas): object-disk pre-flight reads the backup snapshot, not live …
filimonov May 7, 2026
b2bcd70
fix(cas): F9+F11+F12+F13+F14 — small follow-ups
filimonov May 7, 2026
a75be30
test(cas): integration test fixups (partial — 1/4 CAS tests passing)
filimonov May 7, 2026
1ca1961
chore: untrack docs/superpowers/ (working artifacts; kept locally via…
filimonov May 7, 2026
d225b00
fix(cas): T1+T3+T4+T8 — pre-merge fixes from external review
filimonov May 7, 2026
32ee188
fix(cas): T5 — duration fields are strings parseable as 24h via yaml.…
filimonov May 7, 2026
3ed1cc9
fix(cas): T2 — decode shadow-dir names before storing in tablePlan (B2)
filimonov May 7, 2026
57a2e40
fix(cas): T6 — validate disk/part names from remote metadata against …
filimonov May 7, 2026
5ef2887
fix(cas): T7 — cas-prune stub + accurate Phase 1 docs (M6)
filimonov May 7, 2026
3c81aac
fix(cas): proper cross-mode guard for v1 download/delete on CAS backu…
filimonov May 7, 2026
a6854fe
fix(cas): integration suite green — Walk prefix + cross-mode v1-names…
filimonov May 7, 2026
79bfcff
feat(cas): streaming on-disk mark set with external mergesort
filimonov May 7, 2026
990bfd8
feat(cas): parallel orphan sweep with grace cutoff
filimonov May 7, 2026
6903b3a
feat(cas): cas-prune mark-and-sweep with deferred marker release (§6.7)
filimonov May 7, 2026
abeaac1
feat(cas): cas-prune CLI binding
filimonov May 7, 2026
351283b
fix(cas): cas-prune --grace-blob/--abandon-threshold accept zero lite…
filimonov May 7, 2026
9575d8c
test(cas): integration coverage for cas-prune
filimonov May 7, 2026
65b9d62
docs(cas): mark Phase 1 + 2 shipped; add prune to README and runbook …
filimonov May 7, 2026
7da649a
fix(cas): wire --skip-object-disks to planUpload's exclusion filter
filimonov May 7, 2026
a707cde
fix(cas): T1 follow-up — actually exclude object-disk tables, Exclude…
filimonov May 7, 2026
634cd79
refactor(cas): T1 follow-up — end-to-end test, doc cleanup, naming
filimonov May 7, 2026
4f7f226
fix(cas): build bm.Tables from metadata-JSON enumeration, not parts walk
filimonov May 7, 2026
b3d45f8
fix(cas): T2 follow-up — empty tables don't break download
filimonov May 7, 2026
920e4b7
chore(cas): remove TablePathDecode — dead after planUpload refactor
filimonov May 7, 2026
87f3c9f
test(cas): testfixtures supports synthesizing parts with projections
filimonov May 7, 2026
840e00c
fix(cas): two-pass projection-aware part walker
filimonov May 7, 2026
f005801
fix(cas): recurse into .proj/ during download blob discovery
filimonov May 7, 2026
96c19da
fix(cas): T6 follow-up — validate filenames before .proj recursion
filimonov May 7, 2026
c71509f
test(cas): integration coverage for projections, empty tables, skip-o…
filimonov May 7, 2026
e451fb0
feat(storage): add PutFileAbsoluteIfAbsent interface + stubs
filimonov May 8, 2026
9005a75
feat(storage/s3): native PutFileAbsoluteIfAbsent via IfNoneMatch
filimonov May 8, 2026
5045490
feat(storage/sftp): native PutFileAbsoluteIfAbsent via O_EXCL
filimonov May 8, 2026
c80bd44
feat(storage/azblob): native PutFileAbsoluteIfAbsent via If-None-Match
filimonov May 8, 2026
6029f5d
feat(storage/gcs): native PutFileAbsoluteIfAbsent via DoesNotExist
filimonov May 8, 2026
143f9d8
feat(storage/cos): native PutFileAbsoluteIfAbsent via If-None-Match
filimonov May 8, 2026
994f283
feat(storage/ftp): PutFileAbsoluteIfAbsent — refuse-by-default + opt-in
filimonov May 8, 2026
bc63e91
fix(storage/ftp): cleanup tmp file on Stor failure + defer reorder
filimonov May 8, 2026
1a63573
feat(cas): PutFileIfAbsent on Backend + casstorage adapter + fakedst
filimonov May 8, 2026
9a0cd3f
fix(cas): atomic inprogress marker — refuse second concurrent upload
filimonov May 8, 2026
163135e
fix(cas): atomic prune marker + scoped cleanup defer
filimonov May 8, 2026
ab35b1e
fix(cas): filter bm.Tables in local metadata.json on partial download
filimonov May 8, 2026
db9e788
test(cas): integration coverage for concurrent upload + prune refusal
filimonov May 8, 2026
28a206c
test(cas): refactor casBootstrap to accept a base config name
filimonov May 8, 2026
94e43ca
test(cas): GCS smoke test against fake-gcs-server emulator
filimonov May 8, 2026
d0e2303
test(cas): Azure Blob smoke test against Azurite emulator
filimonov May 8, 2026
9b35346
test(cas): SFTP smoke test against OpenSSH-server (panubo/sshd)
filimonov May 8, 2026
d586739
test(cas): FTP refuse-by-default smoke test
filimonov May 8, 2026
beb2056
test(cas): FTP opt-in best-effort smoke test
filimonov May 8, 2026
72da8b4
docs(cas): document CI smoke-test coverage matrix
filimonov May 8, 2026
ba9f4a8
fix(cas): cleanup inprogress marker on upload step 11b error path
filimonov May 8, 2026
b0518a0
fix(cas): cleanup inprogress marker on metadata.json commit failure
filimonov May 8, 2026
43d11f4
fix(cas): --dry-run --unlock no longer deletes the real prune marker
filimonov May 8, 2026
20add86
fix(cas): cas-download/cas-restore --data-only refuses with ErrNotImp…
filimonov May 8, 2026
487d055
fix(cas): treat zero-ModTime markers as fresh and zero-ModTime blobs …
filimonov May 8, 2026
901bc29
fix(cas): metadata-JSON-driven object-disk pre-flight catches fully-r…
filimonov May 8, 2026
d7256e7
fix(cas): --skip-object-disks honors decoded names for special-charac…
filimonov May 8, 2026
0bd7fa3
docs(cas): clarify --unlock error message for fresh-inprogress refusal
filimonov May 8, 2026
b49918a
feat(cas): populate BlobsTotal and OrphansHeldByGrace in PruneReport
filimonov May 8, 2026
a42ddbe
feat(cas): render BytesReclaimed via FormatBytes in PrintPruneReport
filimonov May 8, 2026
0e3f93c
fix(cas): defensive cfg.Validate() at Prune entry
filimonov May 8, 2026
ac19d22
test(cas): explicit-zero --grace-blob/--abandon-threshold overrides n…
filimonov May 8, 2026
b28a849
fix(cas): re-validate cold-listed blobs before commit (closes ColdLis…
filimonov May 8, 2026
3f64fe5
test(storage): focused not-found classification tests per backend
filimonov May 8, 2026
9c76f2e
test(cas/casstorage): Walk key reconstruction is correct under variou…
filimonov May 8, 2026
3c12f7c
test(cas): cross-backup dedup — third backup reuses blobs from earlie…
filimonov May 8, 2026
bc359bd
feat(cas/config): add wait_for_prune duration knob
filimonov May 8, 2026
04b36e8
feat(cas): waitForPrune helper polls prune marker with deadline
filimonov May 8, 2026
45a8213
feat(cas): cas-upload honors WaitForPrune via shared poll helper
filimonov May 8, 2026
f32a9f7
feat(cas): cas-delete honors WaitForPrune via shared poll helper
filimonov May 8, 2026
6352183
feat(cas/cli): --wait-for-prune flag on cas-upload and cas-delete
filimonov May 8, 2026
a087b6d
test(cas): integration tests for --wait-for-prune
filimonov May 8, 2026
eecd41b
docs(cas): document wait_for_prune (no longer deferred)
filimonov May 8, 2026
4d09a77
scaffold(server): cas_handlers.go with shared async-ack shape
filimonov May 8, 2026
6554ccc
refactor(backup): unify CAS* method signatures with commandId parameter
filimonov May 8, 2026
11ff338
feat(server): POST /backup/cas-upload/{name} async handler
filimonov May 8, 2026
88cfcee
feat(server): POST /backup/cas-download and /backup/cas-restore handlers
filimonov May 8, 2026
748997e
feat(server): POST /backup/cas-delete sync handler with status mapping
filimonov May 8, 2026
234094b
feat(server): POST /backup/cas-verify and /backup/cas-prune async han…
filimonov May 8, 2026
ca1483f
feat(server): GET /backup/cas-status sync handler
filimonov May 8, 2026
c159ae3
feat(server): /backup/actions recognizes cas-* verbs
filimonov May 8, 2026
030eb05
feat(server): /backup/list response includes CAS backups with kind field
filimonov May 8, 2026
335f71b
test(cas): integration roundtrip via REST API
filimonov May 8, 2026
fcf064c
docs(cas): document REST API endpoints
filimonov May 8, 2026
298eb32
fix(server): cas-status uses NotFromAPI instead of bare 0 commandId
filimonov May 8, 2026
8b5daec
fix(cas): recognize legacy 'azure' disk type as object disk (parity w…
filimonov May 8, 2026
4427f3b
fix(cas): reject truncated blobs during download (compare bytes-copie…
filimonov May 8, 2026
4c9e4a0
feat(cas): default inline_threshold to 256 KiB; mark CAS as experimental
filimonov May 8, 2026
8bdd9e5
fix(cas): reject cold-listed blobs whose remote size differs from exp…
filimonov May 8, 2026
0f0eeb6
docs(cas): RemoteStorage compat note and experimental banner in README
filimonov May 8, 2026
7f44074
docs(runbook): add first-production-deployment walkthrough
filimonov May 8, 2026
d060318
feat(cas): probe conditional-put support on first CAS use; refuse on …
filimonov May 8, 2026
425a650
test(storage): not-found classifiers extracted as named helpers; test…
filimonov May 8, 2026
b865119
fix(cas): download materializes into staging dir + atomic rename
filimonov May 8, 2026
13ac41f
fix(cas): probe gracefully skips on backends that refuse conditional …
filimonov May 8, 2026
6ae2118
fix(storage/sftp): disambiguate O_EXCL FAILURE via Stat (proftpd/SSHv…
filimonov May 8, 2026
4857773
feat(cas): unsafe-mode startup WARN banner; defer atomic-FTP-via-MKD
filimonov May 8, 2026
26925c9
fix(cas): preflight detects encrypted-over-object disks (parity with …
filimonov May 8, 2026
146c1e1
fix(cas): object-disk preflight fails closed on disk-query errors (Al…
filimonov May 8, 2026
c603d6b
fix(storage/s3): conditional PUT preserves SSE-C / KMS encryption con…
filimonov May 8, 2026
4c167be
fix(cas): probe runs only on marker-writing ops; read-only CAS comman…
filimonov May 8, 2026
0a62836
fix(cas): verify blob upload bytes match checksums.txt size; reject c…
filimonov May 8, 2026
3b6f6f4
docs(cas): correct R4 hash-collision math (birthday-paradox bound)
filimonov May 8, 2026
c7bbcf9
docs(cas): SR2+SR3 — document new flags + known-limitations section
filimonov May 8, 2026
810c671
fix(cas): --tables filter supports glob patterns (parity with v1)
filimonov May 8, 2026
10ac913
fix(cas): reject multi-segment root_prefix to keep v1 skip protection…
filimonov May 8, 2026
108e6bf
docs(cas): refresh §7 LOC estimate and risk-register entries against …
filimonov May 8, 2026
54ee36b
fix(cas): per-process random probe key prevents concurrent probes sab…
filimonov May 8, 2026
29dd7a4
fix(cas): pidlock around cas-download phase prevents concurrent stagi…
filimonov May 8, 2026
08e1bd5
docs(cas): §9 — add deferred entries 4 (per-blob conditional create)
filimonov May 8, 2026
3406fbf
docs(changelog): vNEXT entry for CAS backups (Phases 1-8)
filimonov May 8, 2026
f0419db
fix(cas): observability + naming polish (F17, F18, F19, F20, F26)
filimonov May 8, 2026
8e3fc77
fix(storage/sftp): close file before Remove on error path (F22)
filimonov May 8, 2026
cd63913
fix(server): map cas-delete in-progress / exists errors to 409 (F27)
filimonov May 8, 2026
ae774b9
perf(cas): SweepOrphans shard iterator now uses container/heap (O(N l…
filimonov May 8, 2026
f906332
fix(cas): wave-A review fixups (F26 hint + F27 dead-code removal)
filimonov May 8, 2026
8ae89ae
feat(server/metrics): register CAS commands and add CAS backups gauge…
filimonov May 8, 2026
304fdc6
test(cas/prune): add BlobDeleteFailuresCounted test + fakedst delete …
filimonov May 8, 2026
126d636
feat(cas/status): JSON tags + age_seconds on StatusReport and friends…
filimonov May 8, 2026
1d953ed
fix(backup): connect ClickHouse for CAS remote-list macro expansion (N1)
filimonov May 8, 2026
a3f07cd
perf(cas): stream per-table archives via tempfile (avoid in-memory bu…
filimonov May 8, 2026
cedfa80
docs(runbook): binary-rollback warning + root_prefix-change risks (F2…
filimonov May 8, 2026
2638073
test(cas/prune): zero-live-backups behavior with grace=0 vs default (F7)
filimonov May 8, 2026
02ea659
fix(cas): preserve CAS metadata field on cas-download handoff so v1 o…
filimonov May 8, 2026
56037b5
perf(cas/upload): parallelize step-11c cold-list re-validation (F5)
filimonov May 8, 2026
6a05627
perf(cas/prune): parallelize mark-phase archive downloads (F6)
filimonov May 8, 2026
2c87818
fix(cas/prune): drop dead accumulateRefsForBackup (F6 review fixup)
filimonov May 8, 2026
f6c3a70
fix(cas): cas-delete writes inprogress marker to lock out same-name u…
filimonov May 8, 2026
ed4f24c
feat(server): cas-delete REST handler async (F13)
filimonov May 8, 2026
3b61c04
feat(cas/prune): dry-run candidates in PruneReport + structured loggi…
filimonov May 8, 2026
a2537c9
fix(cas): D1 review fixups (defensive DryRunCandidates copy + tighter…
filimonov May 8, 2026
fc85bc3
test(cas): integration test fixups for wave-5 F13/N2 changes
filimonov May 8, 2026
0561e6f
docs(cas): §9 — add genuinely-deferred wave-5 items (F16, F21, F23-F2…
filimonov May 8, 2026
97fc18c
fix(cas): observability/safety polish (#5, #11, #13, #18)
filimonov May 8, 2026
cc5aac8
fix(cas/delete): refuse unreadable marker, stale-marker defer, Enable…
filimonov May 8, 2026
2665603
chore(cas): drop dead accumulateRefsFromArchive (#24)
filimonov May 8, 2026
3b8ff55
test(cas): coverage for LimitReader, Enabled-guard, and unreadable-ma…
filimonov May 8, 2026
6572a82
fix(cas): all marker-cleanup defers use detached context (#2)
filimonov May 8, 2026
9d00a82
refactor(cas/upload): single defer for inprogress marker; remove 10 e…
filimonov May 8, 2026
25307c6
fix(cas): wave-6.B review fixups
filimonov May 8, 2026
8b8e890
perf(cas): cas-verify streams archives instead of io.ReadAll (#4)
filimonov May 8, 2026
71ae743
test(storage): TestBackupList_SkipPrefixesFiltering (#6)
filimonov May 8, 2026
d90565d
feat(cas): reject backup names matching CAS prefix; promote skip log …
filimonov May 8, 2026
d77960e
feat(cas): cas-upload --unlock <name> self-service stranded-marker re…
filimonov May 8, 2026
827f727
fix(cas): cfg.Validate() at entry of delete/download/verify/status (#15)
filimonov May 8, 2026
fec9fbd
feat(server): probe + unsafe-banner fire once per daemon lifetime, no…
filimonov May 8, 2026
c081fb3
test(cas): wave-6.C review fixups — name-collision boundary + unlock …
filimonov May 8, 2026
bb01105
docs(changelog): BREAKING CHANGES section for CAS rollback + interfac…
filimonov May 8, 2026
7bf9ed0
docs(cas): defer wave-6 minor items to §9 + runbook note for cas-stat…
filimonov May 8, 2026
86d95a8
test(testflows): refresh cli snapshots for CAS additions
filimonov May 8, 2026
df27af8
test: cross-test isolation fixes for CAS state and JSON 'kind' field
filimonov May 8, 2026
ac95863
test+server: cross-version CI fixes for CAS
filimonov May 8, 2026
772b822
test: fix CAS test cross-test state leakage and data race
filimonov May 8, 2026
b98de7e
test: detect system.projections via system.tables (24.4+ feature)
filimonov May 8, 2026
539f150
test: correct system.projections version note (24.9 not 24.4)
filimonov May 8, 2026
d69fd20
test: nuke entire backup/ tree for CAS tests in env.Cleanup
filimonov May 8, 2026
54d0e05
ci: rerun Testflows 22.3 (materializedpostgresql 600s flake)
filimonov May 9, 2026
f90c7af
test: scrub empty shadow/increment.txt left by TestShadowCleanup
filimonov May 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,5 @@ __pycache__/
*.py[cod]
.agents/
pyrightconfig.json
docs/clickhouse-backup-v2-design-state.md
docs/superpowers/
13 changes: 13 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# vNEXT (unreleased)

BREAKING CHANGES

- ⚠️ **DO NOT downgrade to a pre-CAS binary if CAS data exists in your bucket.** The pre-CAS binary has no knowledge of the `cas/` skip prefix and will treat the CAS namespace as a broken v1 backup. The next `clean remote_broken` run, or `BackupsToKeepRemote` retention cron, will silently DELETE all CAS data. Recovery procedures: see [docs/cas-operator-runbook.md](docs/cas-operator-runbook.md) "Binary rollback procedure".
- The `pkg/storage.RemoteStorage` interface gains two required methods: `PutFileAbsoluteIfAbsent(ctx, key, r, size) (created bool, err error)` and `PutFileIfAbsent(ctx, key, r, size) (created bool, err error)`. Any third-party `RemoteStorage` implementation must add these methods to compile. Implementors that don't support atomic create-only-if-absent should return `pkg/storage.ErrConditionalPutNotSupported`; CAS commands then refuse on those backends unless `cas.allow_unsafe_markers=true`.
- The `pkg/storage.BackupDestination.BackupList` signature gains a fourth `skipPrefixes []string` parameter. External callers must pass `nil` (or the result of `cas.Config.SkipPrefixes()`) to compile. Internal callers in this repo are updated.
- A v1 backup literally named `"cas"` will be silently filtered after upgrade (the default `cas.root_prefix` is `"cas/"`). Rename or move any such backup before upgrading. The new binary logs an ERROR for each skipped entry and rejects future creation of names that collide with the CAS skip-prefix.

NEW FEATURES

- add experimental Content-Addressable Storage (CAS) backups via new `cas-upload`, `cas-download`, `cas-restore`, `cas-delete`, `cas-verify`, `cas-prune`, `cas-status` commands. CAS deduplicates file content across backups (especially effective for mutated parts) and removes the incremental-chain dependency — every CAS backup is independently restorable. Available in CLI and REST API. Configure via new `cas:` config block; see [docs/cas-design.md](docs/cas-design.md) and [docs/cas-operator-runbook.md](docs/cas-operator-runbook.md). Object-disk and client-side-encryption tables not yet supported.

# v2.6.43

NEW FEATURES
Expand Down
44 changes: 44 additions & 0 deletions ReadMe.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,45 @@ For that reason, it's required to run `clickhouse-backup` on the same host or sa
- **Support for multi disks installations**
- **Support for custom remote storage types via `rclone`, `kopia`, `restic`, `rsync` etc**
- **Support for incremental backups on remote storage**
- **Smart deduplicating backups** with the `cas-*` commands — every backup is independent, only changed data is uploaded, and mutations don't blow up your storage bill (see below)

## Smart deduplicating backups (opt-in, ⚠️ EXPERIMENTAL)

> **EXPERIMENTAL.** The `cas-*` commands and on-disk layout are still under active development; future releases may bump `LayoutVersion` in a way that requires re-uploading existing CAS backups. Do not rely on CAS as the sole copy of production data yet — keep a parallel v1 backup (or a copy outside the CAS namespace) until the feature is marked stable. Evaluate it on non-critical workloads first; report issues. See [`docs/cas-design.md`](docs/cas-design.md) for the full design.

Most backup tools force a tradeoff: full backups eat storage and bandwidth, while incremental backups are smaller but chain together — losing or rotating the wrong base backup breaks every dependent restore. ClickHouse mutations make this worse: a single `ALTER TABLE ... UPDATE` can rewrite one column and rename the part, leaving 99% of the bytes identical to the previous version but invisible to chain-based dedup.

The `cas-*` commands (`cas-upload`, `cas-download`, `cas-restore`, `cas-delete`, `cas-verify`, `cas-status`, `cas-prune`) use **content-addressed storage** to solve both problems. Files are keyed by their content hash, so identical bytes are stored once and shared across every backup that contains them — across mutations, across days, across tables. The result:

- **Smaller uploads than incremental, no base-backup dependency.** Each `cas-upload` only transfers files whose content isn't already in the remote — typically a small fraction of a full backup. Unlike incremental backups, every CAS backup is independently restorable. Delete any backup at any time without affecting the others.
- **Mutation-friendly.** An `ALTER UPDATE` on one column reuses every other column's bytes; the second backup uploads only the changed column.
- **Storage grows with new data, not with the number of backups.** Keeping 30 daily snapshots of a slowly-changing dataset costs roughly the same as keeping one.

### Quick start

In `config.yml`:

```yaml
cas:
  enabled: true
  cluster_id: my-prod-cluster # required; identifies this source cluster
```

Then:

```sh
clickhouse-backup create my_backup # snapshot the data locally
clickhouse-backup cas-upload my_backup # push to remote (only new content)
clickhouse-backup cas-status # see counts, sizes, in-flight uploads
clickhouse-backup cas-restore my_backup # restore (any backup, any time)
clickhouse-backup cas-delete my_backup # remove the backup's metadata atomically
clickhouse-backup cas-prune # reclaim blob bytes left behind by deletes
clickhouse-backup cas-verify my_backup # cheap integrity check (HEAD + size)
```

`cas-delete` only removes the per-backup metadata; the blob bytes are reclaimed by the periodic `cas-prune` mark-and-sweep GC. See [`docs/cas-operator-runbook.md`](docs/cas-operator-runbook.md) for cadence, monitoring, and recovery from a stranded prune marker.

CAS backups live under their own prefix in the remote bucket and don't interfere with the existing `upload` / `download` / `restore` commands — you can mix both in the same bucket if needed.

## Limitations

Expand Down Expand Up @@ -637,6 +676,11 @@ Display a list of all operations from start of API server: `curl -s localhost:71
- Optional string query argument `filter` to filter actions on server side.
- Optional string query argument `last` to show only the last `N` actions.

### CAS endpoints

For CAS commands (`cas-upload`, `cas-restore`, etc.), see the corresponding
`/backup/cas-*` endpoints documented in [docs/cas-operator-runbook.md](docs/cas-operator-runbook.md).

## Examples

- [Simple cron script for daily backups and remote upload](Examples.md#simple-cron-script-for-daily-backups-and-remote-upload)
Expand Down
254 changes: 254 additions & 0 deletions cmd/clickhouse-backup/cas_commands.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
package main

import (
"fmt"
"time"

"github.com/urfave/cli"

"github.com/Altinity/clickhouse-backup/v2/pkg/backup"
"github.com/Altinity/clickhouse-backup/v2/pkg/config"
)

// resolveWaitForPrune picks the effective prune-wait budget for a command.
//
// A non-empty --wait-for-prune flag takes precedence and is parsed as a Go
// duration string; a parse failure is returned wrapped with the flag name.
// When the flag is empty, the value reported by cfg.CAS.WaitForPruneDuration()
// is used instead.
func resolveWaitForPrune(c *cli.Context, cfg *config.Config) (time.Duration, error) {
	raw := c.String("wait-for-prune")
	if raw == "" {
		// Flag not supplied: defer to the configured cas.wait_for_prune.
		return cfg.CAS.WaitForPruneDuration(), nil
	}

	wait, parseErr := time.ParseDuration(raw)
	if parseErr != nil {
		return 0, fmt.Errorf("--wait-for-prune: %w", parseErr)
	}
	return wait, nil
}

// casCommands returns the seven cas-* CLI subcommands: cas-upload,
// cas-download, cas-restore, cas-delete, cas-verify, cas-status and cas-prune.
//
// rootFlags is the slice of global flags from main.go. Every command receives
// its own freshly allocated flag slice (the global flags followed by the
// command-specific ones). The caller's slice is never appended to in place:
// plain append(rootFlags, ...) would let all commands write into the same
// backing array whenever rootFlags has spare capacity, so later commands would
// silently overwrite the flags of earlier ones.
func casCommands(rootFlags []cli.Flag) []cli.Command {
	// withRootFlags builds an independent flag list: a copy of rootFlags
	// followed by extra. It never aliases rootFlags' backing array.
	withRootFlags := func(extra ...cli.Flag) []cli.Flag {
		flags := make([]cli.Flag, 0, len(rootFlags)+len(extra))
		flags = append(flags, rootFlags...)
		return append(flags, extra...)
	}

	return []cli.Command{
		{
			Name:        "cas-upload",
			Usage:       "Upload a local backup using the content-addressable layout (see docs/cas-design.md)",
			UsageText:   "clickhouse-backup cas-upload [--skip-object-disks] [--dry-run] [--unlock] <backup_name>",
			Description: "Upload a backup created by 'clickhouse-backup create' using the CAS layout. Blobs are content-keyed via per-part checksums.txt; small files are packed into per-table tar.zstd archives. CAS dedupes across mutations and across backups; every backup is independently restorable. Requires cas.enabled=true and cas.cluster_id configured.\n\n --unlock removes a stranded inprogress marker for <backup_name> (left behind by SIGKILL/OOM) and exits immediately without uploading. Incompatible with --dry-run and --skip-object-disks.",
			Action: func(c *cli.Context) error {
				cfg := config.GetConfigFromCli(c)
				// CLI flag overrides cas.wait_for_prune; bail out on a malformed duration.
				wait, err := resolveWaitForPrune(c, cfg)
				if err != nil {
					return err
				}
				b := backup.NewBackuper(cfg)
				return b.CASUpload(c.Args().First(), c.Bool("skip-object-disks"), c.Bool("dry-run"), c.Bool("unlock"), version, c.Int("command-id"), wait)
			},
			Flags: withRootFlags(
				cli.BoolFlag{
					Name:  "skip-object-disks",
					Usage: "Exclude tables on object disks (s3/azure/hdfs/web) instead of refusing the upload",
				},
				cli.BoolFlag{
					Name:  "dry-run",
					Usage: "Plan the upload without writing anything to remote storage",
				},
				cli.StringFlag{
					Name:  "wait-for-prune",
					Usage: `If a prune is in progress, wait up to this duration (Go duration string, e.g. "5m") before giving up. Overrides cas.wait_for_prune. Empty = use config; "0s" = don't wait.`,
				},
				cli.BoolFlag{
					Name:  "unlock",
					Usage: "Remove a stranded inprogress marker for <backup_name> (self-service recovery after SIGKILL/OOM). Incompatible with --dry-run and --skip-object-disks. Does NOT perform an upload.",
				},
			),
		},
		{
			Name:        "cas-download",
			Usage:       "Materialize a CAS backup into the local data directory (does not load into ClickHouse)",
			UsageText:   "clickhouse-backup cas-download [-t, --tables=<db>.<table>] [--partitions=<part_names>] [-s, --schema] <backup_name>",
			Description: "Download a CAS-layout backup into <DefaultDataPath>/backup/<name>/. Use cas-restore (or v1 restore) to load tables into ClickHouse from the materialized directory.",
			Action: func(c *cli.Context) error {
				b := backup.NewBackuper(config.GetConfigFromCli(c))
				return b.CASDownload(c.Args().First(), c.String("tables"), c.StringSlice("partitions"), c.Bool("schema"), c.Bool("data"), version, c.Int("command-id"))
			},
			Flags: withRootFlags(
				cli.StringFlag{
					Name:  "table, tables, t",
					Usage: "Restrict to tables matching db.table (comma-separated, exact match in CAS v1)",
				},
				cli.StringSliceFlag{
					Name:  "partitions",
					Usage: "Restrict to part names (comma-separated)",
				},
				cli.BoolFlag{
					Name:  "schema, schema-only, s",
					Usage: "Schema-only: write JSON metadata locally and skip part archives + blobs",
				},
				cli.BoolFlag{
					Name:   "data, d",
					Hidden: true,
					Usage:  "Reserved (currently a no-op); will gate data-only download in a future version",
				},
			),
		},
		{
			Name:        "cas-restore",
			Usage:       "Download a CAS backup and restore tables into ClickHouse",
			UsageText:   "clickhouse-backup cas-restore [-t, --tables=<db>.<table>] [-m, --restore-database-mapping=<src>:<dst>[,...]] [--tm, --restore-table-mapping=<src>:<dst>[,...]] [--partitions=<part_names>] [-s, --schema] [-d, --data] [--rm, --drop] [--restore-schema-as-attach] [--replicated-copy-to-detached] [--skip-empty-tables] [--resume] <backup_name>",
			Description: "Pulls the named CAS backup into the local backup directory and runs the v1 restore flow against it. --ignore-dependencies is rejected: CAS backups have no dependency chain. RBAC/configs/named-collections are out of scope for CAS v1.",
			Action: func(c *cli.Context) error {
				b := backup.NewBackuper(config.GetConfigFromCli(c))
				return b.CASRestore(
					c.Args().First(),
					c.String("tables"),
					c.StringSlice("restore-database-mapping"),
					c.StringSlice("restore-table-mapping"),
					c.StringSlice("partitions"),
					c.StringSlice("skip-projections"),
					c.Bool("schema"),
					c.Bool("data"),
					c.Bool("drop"),
					c.Bool("ignore-dependencies"),
					c.Bool("restore-schema-as-attach"),
					c.Bool("replicated-copy-to-detached"),
					c.Bool("skip-empty-tables"),
					c.Bool("resume"),
					version,
					c.Int("command-id"),
				)
			},
			Flags: withRootFlags(
				cli.StringFlag{
					Name:  "table, tables, t",
					Usage: "Restrict to tables matching db.table (comma-separated, exact match in CAS v1)",
				},
				cli.StringSliceFlag{
					Name:  "restore-database-mapping, m",
					Usage: "Database rename rules at restore time, format <src>:<dst> (repeatable or comma-separated)",
				},
				cli.StringSliceFlag{
					Name:  "restore-table-mapping, tm",
					Usage: "Table rename rules at restore time, format <src>:<dst> (repeatable or comma-separated)",
				},
				cli.StringSliceFlag{
					Name:  "partitions",
					Usage: "Restrict to part names (comma-separated)",
				},
				cli.StringSliceFlag{
					Name:  "skip-projections",
					Usage: "Skip listed projections during restore, format `db_pattern.table_pattern:projections_pattern`",
				},
				cli.BoolFlag{
					Name:  "schema, s",
					Usage: "Restore schema only",
				},
				cli.BoolFlag{
					Name:  "data, d",
					Usage: "Restore data only",
				},
				cli.BoolFlag{
					Name:  "rm, drop",
					Usage: "Drop existing schema objects before restore",
				},
				cli.BoolFlag{
					Name:   "i, ignore-dependencies",
					Usage:  "(rejected for CAS backups; accepted for CLI parity with 'restore')",
					Hidden: true,
				},
				cli.BoolFlag{
					Name:  "restore-schema-as-attach",
					Usage: "Use DETACH/ATTACH instead of DROP/CREATE for schema restoration",
				},
				cli.BoolFlag{
					Name:  "replicated-copy-to-detached",
					Usage: "Copy data to detached folder for Replicated*MergeTree tables but skip ATTACH PART step",
				},
				cli.BoolFlag{
					Name:  "skip-empty-tables",
					Usage: "Skip restoring tables that have no data (empty tables with only schema)",
				},
				cli.BoolFlag{
					Name:  "resume, resumable",
					Usage: "Save intermediate state and resume restore on retry",
				},
			),
		},
		{
			// NOTE(review): the Usage/Description below still describe blob
			// reclamation as future "Phase 2" work, yet cas-prune is registered
			// as a working command further down in this function. The wording
			// is left untouched here because CLI help output may be pinned by
			// snapshot tests — confirm and refresh the text together with them.
			Name:        "cas-delete",
			Usage:       "Delete a CAS backup's metadata subtree (Phase 1: blobs are NOT reclaimed)",
			UsageText:   "clickhouse-backup cas-delete <backup_name>",
			Description: "Removes the named backup atomically by deleting metadata.json first, then the rest of the metadata subtree. Blob bytes are NOT reclaimed in Phase 1 — that ships with cas-prune in Phase 2; until then, deleted-backup blobs accumulate in remote storage.",
			Action: func(c *cli.Context) error {
				cfg := config.GetConfigFromCli(c)
				// CLI flag overrides cas.wait_for_prune; bail out on a malformed duration.
				wait, err := resolveWaitForPrune(c, cfg)
				if err != nil {
					return err
				}
				b := backup.NewBackuper(cfg)
				return b.CASDelete(c.Args().First(), c.Int("command-id"), wait)
			},
			Flags: withRootFlags(
				cli.StringFlag{
					Name:  "wait-for-prune",
					Usage: `If a prune is in progress, wait up to this duration (Go duration string, e.g. "5m") before giving up. Overrides cas.wait_for_prune. Empty = use config; "0s" = don't wait.`,
				},
			),
		},
		{
			Name:        "cas-verify",
			Usage:       "HEAD-check every blob referenced by a CAS backup",
			UsageText:   "clickhouse-backup cas-verify [--json] <backup_name>",
			Description: "Walks the per-table archives, parses every checksums.txt, and HEAD-checks each referenced blob's existence and size. Exits non-zero if any failures are detected.",
			Action: func(c *cli.Context) error {
				b := backup.NewBackuper(config.GetConfigFromCli(c))
				return b.CASVerify(c.Args().First(), c.Bool("json"), c.Int("command-id"))
			},
			Flags: withRootFlags(
				cli.BoolFlag{
					Name:  "json",
					Usage: "Emit one JSON object per failure instead of human-readable lines",
				},
			),
		},
		{
			Name:        "cas-status",
			Usage:       "Print a LIST-only health summary for the configured CAS cluster",
			UsageText:   "clickhouse-backup cas-status",
			Description: "Counts backups and blobs, reports the prune marker (if any), and lists fresh / abandoned in-progress upload markers. No object bodies are fetched.",
			Action: func(c *cli.Context) error {
				b := backup.NewBackuper(config.GetConfigFromCli(c))
				return b.CASStatus(c.Int("command-id"))
			},
			// No command-specific flags; still hand out a private copy so this
			// command does not share the caller's slice.
			Flags: withRootFlags(),
		},
		{
			Name:        "cas-prune",
			Usage:       "Garbage-collect orphan blobs (mark-and-sweep) for the configured CAS cluster",
			UsageText:   "clickhouse-backup cas-prune [--dry-run] [--grace-blob=<duration>] [--abandon-threshold=<duration>] [--unlock]",
			Description: "Mark-and-sweep GC: walks every live backup's per-table archives, builds a sorted on-disk reference set, then lists the blob store and deletes orphans older than cas.grace_blob. Holds an advisory cas/<cluster>/prune.marker — concurrent cas-upload and cas-delete refuse while it's held. See docs/cas-design.md §6.7 and docs/cas-operator-runbook.md.",
			Action: func(c *cli.Context) error {
				b := backup.NewBackuper(config.GetConfigFromCli(c))
				// Duration overrides are forwarded as raw strings; an empty
				// string means "use the configured value" per the flag help.
				return b.CASPrune(c.Bool("dry-run"), c.String("grace-blob"), c.String("abandon-threshold"), c.Bool("unlock"), c.Int("command-id"))
			},
			Flags: withRootFlags(
				cli.BoolFlag{
					Name:  "dry-run",
					Usage: "Print orphan candidates without deleting anything (no marker is written)",
				},
				cli.StringFlag{
					Name:  "grace-blob",
					Value: "",
					Usage: "Override cas.grace_blob — Go duration string (e.g. \"24h\", \"30m\", \"0s\"). Empty (default) uses the configured value.",
				},
				cli.StringFlag{
					Name:  "abandon-threshold",
					Value: "",
					Usage: "Override cas.abandon_threshold — Go duration string (e.g. \"168h\", \"0s\"). Empty (default) uses the configured value.",
				},
				cli.BoolFlag{
					Name:  "unlock",
					Usage: "Delete a stranded cas/<cluster>/prune.marker (escape hatch when SIGKILL/OOM left it behind). Refuses if no marker is present.",
				},
			),
		},
	}
}
Loading
Loading