Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/borg/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -1861,9 +1861,14 @@ def verify_data(self):
# we must decompress, so it'll call assert_id() in there:
self.repo_objs.parse(defect_chunk, encrypted_data, decompress=True, ro_type=ROBJ_DONTCARE)
except IntegrityErrorBase:
# failed twice -> get rid of this chunk
# failed twice -> get rid of this chunk by dropping the pack file it lives in.
# At N=1 a pack holds exactly this one object (pack_id == chunk_id), so this
# removes only the defect. N>1 TODO: a defect inside a shared pack needs
# copy-forward of the good neighbours (see
# ArchiveGarbageCollector._copy_forward_pack); until then we address by chunk id.
pack_id = self.chunks[defect_chunk].pack_id
del self.chunks[defect_chunk]
self.repository.delete(defect_chunk)
self.repository.delete_pack(pack_id)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The goal is to get rid of the N=1 limitation, so I suggest you don't add code that makes no sense and is even very harmful when N!=1.

logger.debug("chunk %s deleted.", bin_to_hex(defect_chunk))
else:
logger.warning("chunk %s not deleted, did not consistently fail.", bin_to_hex(defect_chunk))
Expand Down
114 changes: 90 additions & 24 deletions src/borg/archiver/compact_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from ..helpers import set_ec, EXIT_ERROR, format_file_size, bin_to_hex
from ..helpers import ProgressIndicatorPercent
from ..manifest import Manifest
from ..repository import Repository, repo_lister
from ..repository import Repository, PackWriter, repo_lister

from ..logger import create_logger

Expand Down Expand Up @@ -171,29 +171,7 @@ def report_and_delete(self):
logger.warning(f"Soft-deleted archive {name} {hex_id} not found.")

repo_size_before = self.repository_size
logger.info("Determining unused objects...")
unused = set()
for id, entry in self.chunks.iteritems():
if not (entry.flags & ChunkIndex.F_USED):
unused.add(id)
logger.info(f"Deleting {len(unused)} unused objects...")
if unused:
# Before deleting any repository object, invalidate all centrally cached chunk indexes.
# Otherwise, if we get interrupted within the deletion loop, the still-existing cache/chunks.*
# would claim that already-deleted objects are still present. A later "borg create" would then
# trust that stale index, not re-upload the affected chunks and silently create an archive with
# dangling object references (see issue #9748). By removing the cached indexes first, an
# interruption is conservative: clients must rebuild the index from actual repository contents
# and will re-upload any deleted data. save_chunk_index() writes a fresh, valid index afterwards.
delete_chunkindex_cache(self.repository)
pi = ProgressIndicatorPercent(
total=len(unused), msg="Deleting unused objects %3.1f%%", step=0.1, msgid="compact.report_and_delete"
)
for i, id in enumerate(unused):
pi.show(i)
self.repository.delete(id)
del self.chunks[id]
pi.finish()
self.compact_packs()
repo_size_after = self.repository_size

count = len(self.chunks)
Expand All @@ -214,6 +192,94 @@ def report_and_delete(self):
else:
logger.info(f"Repository has data stored in {count} objects.")

def compact_packs(self):
"""Reclaim space at the granularity the store supports: whole pack files.

The store is append-only and content-addressed, so we never delete a single object in

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

append-only is a term from the borg 1.x past, don't use it if it makes no sense anymore, that just adds confusion.

at the store level, full packs are just added usually. removed only by special borg operations like compact or check.

there is also no ordering anymore as in 1.x segment files 0..N, so when looking at the whole store, one maybe can say add-only, but append-only implies an "end" of the store that does not exist anymore.

place. Instead we look at each pack file as a unit, using the F_USED flags that
analyze_archives() set on the index:

- all objects unused -> drop the whole pack file.
- all objects used -> leave it untouched.
- mixed (some used) -> copy the still-used objects forward into a fresh pack, repoint
their index entries, then drop the old pack whole. The unused
objects are simply never copied -- that is how their space goes.

At N=1 a pack holds exactly one object, so a pack is always either all-used or all-unused
and the mixed branch is unreachable; the observable result is identical to deleting the
unused single-object packs. The mixed branch is the N>1 copy-forward path; it is proven
here by tests that force max_count > 1.
"""
logger.info("Determining unused objects...")
# Group every known object by the pack file it lives in. We collect into a plain dict
# first (fully draining the iterator) so we can mutate self.chunks afterwards.
packs: dict[bytes, dict[str, list[bytes]]] = {}
for id, entry in self.chunks.iteritems():
bucket = packs.setdefault(entry.pack_id, {"used": [], "unused": []})
Comment on lines +217 to +218

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Never do whole-chunkindex transformations, building new datastructures that contain most of the information from the chunks index. The chunks index is a highly optimized datastructure and even then, it can be many gigabytes in size. If you are transforming that into another arrangement using unoptimized python datastructures, you will likely use too much RAM.

if entry.flags & ChunkIndex.F_USED:
bucket["used"].append(id)
else:
bucket["unused"].append(id)

unused = [id for bucket in packs.values() for id in bucket["unused"]]
packs_to_drop = [pid for pid, b in packs.items() if b["unused"] and not b["used"]]
packs_to_rewrite = [pid for pid, b in packs.items() if b["unused"] and b["used"]]
Comment on lines +225 to +226

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Basically the right idea, just needs a more efficient implementation.


logger.info(f"Deleting {len(unused)} unused objects...")
if unused:
# Before mutating the store, invalidate all centrally cached chunk indexes.
# Otherwise, if we get interrupted partway through, the still-existing cache/chunks.*
# would claim that already-removed objects are still present. A later "borg create" would
# then trust that stale index, not re-upload the affected chunks and silently create an
# archive with dangling object references (see issue #9748). By removing the cached indexes
# first, an interruption is conservative: clients must rebuild the index from actual
# repository contents and will re-upload any removed data. save_chunk_index() writes a
# fresh, valid index afterwards.
delete_chunkindex_cache(self.repository)

pi = ProgressIndicatorPercent(
total=len(packs_to_drop) + len(packs_to_rewrite),
msg="Compacting packs %3.1f%%",
step=0.1,
msgid="compact.report_and_delete",
)
progress = 0
for pack_id in packs_to_drop:
pi.show(progress)
progress += 1
# every object in this pack is unused -> the file goes as a whole.
self.repository.delete_pack(pack_id)
for id in packs[pack_id]["unused"]:
del self.chunks[id]
for pack_id in packs_to_rewrite:
pi.show(progress)
progress += 1
# N>1 only: keep the referenced objects by rewriting them into a fresh pack.
self._copy_forward_pack(pack_id, packs[pack_id]["used"])
for id in packs[pack_id]["unused"]:
del self.chunks[id]
pi.finish()

def _copy_forward_pack(self, pack_id, used_ids):

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

usually this is named "compaction".

"""Rewrite a mixed pack's still-used objects into a fresh pack, then drop the old pack.

Only reached at N>1 (a pack holding more than one object). Reads each survivor's stored
bytes via the current index location, writes them through a PackWriter bound to this
collector's index (so the new pack location lands in the index we persist), then drops
the old pack whole. Reading/writing goes through the local store, which is why this path
is exercised by local tests; routing it through repository RPC for remote N>1 is left to
the N>1 PR (it never runs at N=1, the only mode that exists today).
"""
store = self.repository.store
writer = PackWriter(store, max_count=self.repository.pack_max_count, chunks=self.chunks)
for id in used_ids:
entry = self.chunks[id]
key = "packs/" + bin_to_hex(entry.pack_id)
cdata = store.load(key, offset=entry.obj_offset, size=entry.obj_size)
writer.add(id, cdata)
writer.flush()
Comment on lines +274 to +280

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use borgstore.Store.defrag method, that is way more efficient.

self.repository.delete_pack(pack_id)


class CompactMixIn:
@with_repository(exclusive=True, compatibility=(Manifest.Operation.DELETE,))
Expand Down
5 changes: 4 additions & 1 deletion src/borg/archiver/debug_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,10 @@ def do_debug_delete_obj(self, args, repository):
print("object id %s is invalid." % hex_id)
else:
try:
repository.delete(id)
# N=1: pack_id == chunk_id, so the given object id names its pack file and we
# drop that pack whole. N>1 TODO: this should mark the object deleted in the
# index instead, since a pack then holds more than one object.
repository.delete_pack(id)
print("object %s deleted." % hex_id)
except Repository.ObjectNotFound:
print("object %s not found." % hex_id)
Expand Down
29 changes: 22 additions & 7 deletions src/borg/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,15 @@ def is_chunk_index_loaded(self):
"""
return self._chunks is not None

@property
def pack_max_count(self):
"""How many objects a freshly written pack may bundle (the N in "N objects per pack").

N=1 today, so every pack holds one object; raising this later enables real packing.
compact reads this to size the pack writer it uses for copy-forward.
"""
return self._pack_writer.max_count

def flush(self):
"""Flush any buffered pack writer chunks."""
if self._pack_writer is not None:
Expand Down Expand Up @@ -799,24 +808,30 @@ def put(self, id, data, wait=True):
# PackWriter shares this repository's index, so add() triggers the lazy build itself.
return self._pack_writer.add(id, data)

def delete(self, id, wait=True):
"""delete a repo object
def delete_pack(self, pack_id, wait=True):

@ThomasWaldmann ThomasWaldmann Jun 15, 2026

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

guess you could just use .store_delete at the few places that need this.

"""delete a whole pack file from the store, addressed by its pack_id.

Note: when doing calls with wait=False this gets async and caller must
deal with async results / exceptions later.
This is the only space-reclaiming primitive: the store is append-only and
content-addressed, so the smallest unit it can remove is a whole pack file.
There is deliberately no per-object delete -- removing a single object out of a
pack that still holds referenced neighbours is done by copy-forward (compact
rewrites the survivors into a fresh pack, then drops the old pack as a whole).

At N=1 a pack holds exactly one object and pack_id == chunk_id, so dropping the
pack is the same as dropping that one object; callers that still think in object
ids pass the id as the pack_id and that stays correct until N>1 lands.
"""
self._lock_refresh()
pack_id = id # N=1: pack_id == chunk_id
key = "packs/" + bin_to_hex(pack_id)
try:
self.store.delete(key)
except StoreObjectNotFound:
raise self.ObjectNotFound(id, str(self._location)) from None
raise self.ObjectNotFound(pack_id, str(self._location)) from None

def async_response(self, wait=True):
"""Get one async result (only applies to remote repositories).

async commands (== calls with wait=False, e.g. delete and put) have no results,
async commands (== calls with wait=False, e.g. put) have no results,
but may raise exceptions. These async exceptions must get collected later via
async_response() calls. Repeat the call until it returns None.
The previous calls might either return one (non-None) result or raise an exception.
Expand Down
8 changes: 4 additions & 4 deletions src/borg/testsuite/archiver/check_cmd_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def test_missing_file_chunk(archivers, request):
if item.path.endswith(src_file):
valid_chunks = item.chunks
killed_chunk = valid_chunks[-1]
repository.delete(killed_chunk.id)
repository.delete_pack(killed_chunk.id) # N=1: pack_id == chunk_id
break
else:
pytest.fail("should not happen") # convert 'fail'
Expand Down Expand Up @@ -198,7 +198,7 @@ def test_missing_archive_item_chunk(archivers, request):
check_cmd_setup(archiver)
archive, repository = open_archive(archiver.repository_path, "archive1")
with repository:
repository.delete(archive.metadata.items[0])
repository.delete_pack(archive.metadata.items[0]) # N=1: pack_id == chunk_id
cmd(archiver, "check", exit_code=1)
cmd(archiver, "check", "--repair", exit_code=0)
cmd(archiver, "check", exit_code=0)
Expand All @@ -209,7 +209,7 @@ def test_missing_archive_metadata(archivers, request):
check_cmd_setup(archiver)
archive, repository = open_archive(archiver.repository_path, "archive1")
with repository:
repository.delete(archive.id)
repository.delete_pack(archive.id) # N=1: pack_id == chunk_id
cmd(archiver, "check", exit_code=1)
cmd(archiver, "check", "--repair", exit_code=0)
cmd(archiver, "check", exit_code=0)
Expand Down Expand Up @@ -446,5 +446,5 @@ def test_empty_repository(archivers, request):
check_cmd_setup(archiver)
with Repository(archiver.repository_location, exclusive=True) as repository:
for id, _ in repository.list():
repository.delete(id)
repository.delete_pack(id) # N=1: pack_id == chunk_id
cmd(archiver, "check", exit_code=1)
91 changes: 90 additions & 1 deletion src/borg/testsuite/archiver/compact_cmd_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,16 @@
import pytest

from ...constants import * # NOQA
from ...helpers import get_cache_dir
from ...archiver.compact_cmd import ArchiveGarbageCollector
from ...hashindex import ChunkIndex
from ...helpers import get_cache_dir, bin_to_hex
from ...cache import files_cache_name, discover_files_cache_names, list_chunkindex_hashes
from ...manifest import Manifest
from ...repository import Repository
from . import cmd, create_regular_file, create_src_archive, generate_archiver_tests, open_repository, RK_ENCRYPTION
from . import changedir
from ..hashindex_test import H
from ..repository_test import fchunk, pdchunk

pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA

Expand Down Expand Up @@ -193,3 +198,87 @@ def test_compact_files_cache_cleanup(archivers, request):
# Get expected cache files for remaining archives
expected_cache_files = {files_cache_name(name) for name in ["archive1", "archive3"]}
assert expected_cache_files == remaining_cache_files, "Unexpected cache files found"


def _pack_names(repo):
# set of pack file names (hex pack_ids) currently present in the store.
return {info.name for info in repo.store.list("packs")}


def _load_obj(repo, index, chunk_id):
# read an object's stored bytes via its current index location (same range-load get() does).
entry = index[chunk_id]
return repo.store.load("packs/" + bin_to_hex(entry.pack_id), offset=entry.obj_offset, size=entry.obj_size)


def test_compact_drops_whole_unused_packs(tmp_path):
"""N=1: each pack holds exactly one object, so compact reclaims space by dropping whole
unused pack files -- there is no per-object delete. The used object's pack stays; the
unused ones are removed as a whole."""
repo_location = os.fspath(tmp_path / "repo")
with Repository(repo_location, exclusive=True, create=True) as repo:
# default max_count == 1: one object per pack.
for i in range(3):
repo.put(H(i), fchunk(b"data%d" % i))
repo.flush()
index = repo.chunks
assert len(_pack_names(repo)) == 3

# simulate analyze_archives(): start from "unused" (put() marks new chunks F_USED), then
# mark only H(0) as referenced; H(1) and H(2) stay unused.
gc = ArchiveGarbageCollector(repo, manifest=None, stats=False, iec=False)
gc.chunks = index
for i in range(3):
entry = index[H(i)]
flags = ChunkIndex.F_USED if i == 0 else ChunkIndex.F_NONE
index[H(i)] = entry._replace(flags=flags)

gc.compact_packs()

# only the used object's pack survives; the two unused single-object packs are gone.
assert _pack_names(repo) == {bin_to_hex(index[H(0)].pack_id)}
assert H(0) in index
assert H(1) not in index and H(2) not in index
assert pdchunk(_load_obj(repo, index, H(0))) == b"data0"


def test_compact_copy_forward_mixed_pack(tmp_path):
"""N>1: a single pack can hold both used and unused objects. compact must keep the used
ones by copying them into a fresh pack and drop the old pack as a whole -- never delete a
single object in place. We force max_count > 1 to build such a mixed pack, since the N=1
production path never produces one. This proves the copy-forward design ahead of N>1."""
repo_location = os.fspath(tmp_path / "repo")
with Repository(repo_location, exclusive=True, create=True) as repo:
# build ONE pack holding three objects by bundling (max_count = 3).
repo._pack_writer.max_count = 3
repo.put(H(0), fchunk(b"data0"))
repo.put(H(1), fchunk(b"data1"))
repo.put(H(2), fchunk(b"data2")) # third add fills the pack and flushes it
repo.flush()

index = repo.chunks
old_pack_id = index[H(0)].pack_id
assert {index[H(i)].pack_id for i in range(3)} == {old_pack_id} # all three share one pack
assert _pack_names(repo) == {bin_to_hex(old_pack_id)}

# simulate analyze_archives(): start from "unused" (put() marks new chunks F_USED), then
# mark H(0) and H(2) as referenced; H(1) stays unused.
gc = ArchiveGarbageCollector(repo, manifest=None, stats=False, iec=False)
gc.chunks = index
for i in range(3):
entry = index[H(i)]
flags = ChunkIndex.F_USED if i in (0, 2) else ChunkIndex.F_NONE
index[H(i)] = entry._replace(flags=flags)

gc.compact_packs()

# the unused object is gone from the index; the survivors moved into a new pack.
assert H(1) not in index
new_pack_id = index[H(0)].pack_id
assert new_pack_id != old_pack_id
assert index[H(2)].pack_id == new_pack_id

# old pack dropped whole, new pack present, survivors still hold their original bytes.
assert _pack_names(repo) == {bin_to_hex(new_pack_id)}
assert pdchunk(_load_obj(repo, index, H(0))) == b"data0"
assert pdchunk(_load_obj(repo, index, H(2))) == b"data2"
2 changes: 1 addition & 1 deletion src/borg/testsuite/archiver/extract_cmd_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,7 +800,7 @@ def test_extract_file_with_missing_chunk(archivers, request):
for item in archive.iter_items():
if item.path.endswith(src_file):
chunk = item.chunks[-1]
repository.delete(chunk.id)
repository.delete_pack(chunk.id) # N=1: pack_id == chunk_id
break
else:
assert False # missed the file
Expand Down
2 changes: 1 addition & 1 deletion src/borg/testsuite/archiver/mount_cmds_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def test_fuse_allow_damaged_files(archivers, request):
with repository:
for item in archive.iter_items():
if item.path.endswith(src_file):
repository.delete(item.chunks[-1].id)
repository.delete_pack(item.chunks[-1].id) # N=1: pack_id == chunk_id
path = item.path # store full path for later
break
else:
Expand Down
Loading
Loading