
Commit cf98347

feat: add circle archiving in the BagIt (RFC 8493) format

1 parent f338e36

11 files changed: 680 additions & 6 deletions


CHANGELOG

Lines changed: 2 additions & 1 deletion
```diff
@@ -1,4 +1,5 @@
-1.0.1
+1.1.0
+ - feat: add circle archiving in the BagIt (RFC 8493) format
  - fix: handle resource downloads whose target directory got deleted
  - fix: check for existence of persistent job list in upload
 1.0.0
```

dcoraid/bagit/__init__.py

Lines changed: 1 addition & 0 deletions
```python
from .archive import bag_circle  # noqa: F401
```

dcoraid/bagit/archive.py

Lines changed: 198 additions & 0 deletions
```python
import hashlib
import json
import pathlib
import shutil
import threading
from typing import Callable

from ..api import CKANAPI
from ..dbmodel import APIInterrogator
from ..download import DownloadJob
from . import info, manifest


def bag_circle(api: CKANAPI,
               circle_name: str,
               target_path: pathlib.Path,
               abort_event: threading.Event = None,
               callback: Callable = None):
    """Download an entire circle to a target directory in BagIt format

    The format follows RFC 8493 "The BagIt File Packaging Format (V1.0)".

    To validate the BagIt bags:

        pip install bagit
        bagit.py --validate --quiet target_path/*

    Parameters
    ----------
    api
        CKANAPI for connecting to the DCOR instance
    circle_name
        Name of the circle to archive
    target_path
        Download location
    abort_event
        Specify a `threading.Event` to be able to abort archiving;
        when you wish to abort, `.set()` the event.
    callback
        Method for progress tracking; it is called with a float
        between 0 and 1
    """
    # fetch the full list of active datasets
    ai = APIInterrogator(api)
    dataset_dicts = ai.search_dataset_via_api(circles=[circle_name],
                                              limit=0,
                                              ret_db_extract=False)

    num_datasets = len(dataset_dicts)

    # sort datasets according to creation date
    dataset_dicts = sorted(dataset_dicts, key=lambda x: x["metadata_created"])

    # compute the SHA256 hash of all dataset IDs
    hasher = hashlib.sha256()
    for ds_dict in dataset_dicts:
        hasher.update(ds_dict["id"].encode(encoding="utf-8"))
    sha256_hash = hasher.hexdigest()

    # Check whether there is already a list of dataset IDs in the directory,
    # and if yes, compute its SHA256 hash and compare it. If the comparison
    # fails, then the user has to choose a different `target_path`, because
    # we cannot guarantee data integrity.
    circle_jsonlines_path = target_path / "circle.jsonlines"
    if circle_jsonlines_path.exists():
        # splitlines avoids the empty string after the trailing newline
        lines = circle_jsonlines_path.read_text().splitlines()
        hasher2 = hashlib.sha256()
        for line in lines:
            hasher2.update(json.loads(line)["id"].encode(encoding="utf-8"))
        sha256_hash2 = hasher2.hexdigest()
        if sha256_hash != sha256_hash2:
            raise ValueError(
                f"A previous attempt to archive circle {circle_name} was made "
                f"in directory {target_path}. However, the list of datasets "
                f"changed since then. Therefore, it is not possible to "
                f"archive this circle to that directory.")
    else:
        # save list of datasets as jsonlines
        with circle_jsonlines_path.open("w", encoding="utf-8") as f:
            for ds_dict in dataset_dicts:
                f.write(json.dumps(ds_dict) + "\n")

    # number of digits for enumeration
    max_digits = len(str(num_datasets))

    # bag all datasets
    for ii, ds_dict in enumerate(dataset_dicts):
        if callback:
            callback(ii / num_datasets)

        dataset_index = ii + 1
        prefix = str(dataset_index).zfill(max_digits)
        bag_path = target_path / f"{prefix}_{ds_dict['name']}"

        if not manifest.is_bagged(bag_path):
            bag_dataset(api=api,
                        ds_dict=ds_dict,
                        dataset_index=dataset_index,
                        num_datasets=num_datasets,
                        bag_path=bag_path,
                        abort_event=abort_event)
        if abort_event is not None and abort_event.is_set():
            return
    if callback:
        callback(1)


def bag_dataset(api: CKANAPI,
                ds_dict: dict,
                bag_path: pathlib.Path,
                abort_event: threading.Event = None,
                dataset_index: int = 1,
                num_datasets: int = 1,
                ):
    """Download a dataset to a target directory in BagIt format

    Parameters
    ----------
    api
        CKANAPI for connecting to the DCOR instance
    ds_dict
        CKAN dataset dictionary
    bag_path
        Path of the bag
    abort_event
        Event for aborting (see :func:`bag_circle`)
    dataset_index
        Index of this dataset in the circle
    num_datasets
        Total number of datasets in the circle
    """
    # clear/create download directory
    if bag_path.exists():
        shutil.rmtree(bag_path)
    bag_path.mkdir(parents=True, exist_ok=True)
    data_path = bag_path / "data"
    data_path.mkdir(parents=True, exist_ok=True)

    # dataset dictionary
    meta = json.dumps(ds_dict, indent=2, sort_keys=True)
    (data_path / "dataset.json").write_text(meta)

    # download all resources
    for res_dict in ds_dict["resources"]:
        if abort_event is not None and abort_event.is_set():
            return

        # resource
        download_resource(api=api,
                          bag_path=bag_path,
                          res_dict=res_dict,
                          condensed=False)

        # condensed resource
        if res_dict["name"].endswith(".rtdc"):
            download_resource(api=api,
                              bag_path=bag_path,
                              res_dict=res_dict,
                              condensed=True)

    # create BagIt files
    info.write_bag_info(bag_path=bag_path,
                        bag_index=dataset_index,
                        num_bags=num_datasets,
                        ds_dict=ds_dict)

    # create BagIt manifest files
    manifest.write_manifest(bag_path=bag_path,
                            ds_dict=ds_dict)


def download_resource(api: CKANAPI,
                      bag_path: pathlib.Path,
                      res_dict: dict,
                      condensed: bool):
    """Download and verify a resource from DCOR

    Parameters
    ----------
    api
        CKANAPI for connecting to the DCOR instance
    bag_path
        Path of the bag
    res_dict
        CKAN resource dictionary
    condensed
        Whether to download the condensed resource (or the original resource)
    """
    data_path = bag_path / "data"
    data_path.mkdir(parents=True, exist_ok=True)
    dl_path = data_path / res_dict["name"]
    if condensed:
        dl_path = dl_path.with_name(dl_path.stem + "_condensed.rtdc")
    dj = DownloadJob(api=api,
                     resource_id=res_dict["id"],
                     download_path=dl_path,
                     condensed=condensed)
    dj.task_download_resource()
    dj.task_verify_resource()
```
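For orientation, here is a minimal usage sketch of the new `bag_circle` entry point. The `CKANAPI` constructor arguments (`server`, `api_key`) and the server URL are illustrative assumptions, not part of this commit:

```python
import pathlib
import threading

from dcoraid.api import CKANAPI
from dcoraid.bagit import bag_circle

# hypothetical connection details; replace with your DCOR server and token
api = CKANAPI(server="https://dcor.example.org", api_key="xxxx-xxxx")

abort_event = threading.Event()  # .set() from another thread to abort

bag_circle(api=api,
           circle_name="my-circle",
           target_path=pathlib.Path("/archives/my-circle"),
           abort_event=abort_event,
           callback=lambda frac: print(f"{frac:.1%} archived"))
```

Since `bag_circle` skips datasets for which `manifest.is_bagged` already succeeds, re-running the same call resumes an interrupted archive; the `circle.jsonlines` hash check guards against resuming into a directory that belongs to a different snapshot of the circle.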

dcoraid/bagit/info.py

Lines changed: 60 additions & 0 deletions
```python
import pathlib
import time

from .._version import version


def get_payload_oxum(bag_path: pathlib.Path) -> str:
    """Return the Payload-Oxum

    The "octetstream sum" of the payload, which is
    intended for the purpose of quickly detecting incomplete bags
    before performing checksum validation. This is strictly an
    optimization, and implementations MUST perform the standard
    checksum validation process before proclaiming a bag to be valid.
    This element MUST NOT be present more than once and, if present,
    MUST be in the form "_OctetCount_._StreamCount_", where
    _OctetCount_ is the total number of octets (8-bit bytes) across
    all payload file content and _StreamCount_ is the total number of
    payload files.
    """
    num_files = 0
    size_files = 0
    for pp in (bag_path / "data").rglob("*"):
        if pp.is_file():
            num_files += 1
            size_files += pp.stat().st_size
    return f"{size_files}.{num_files}"


def write_bag_info(bag_path: pathlib.Path,
                   ds_dict: dict,
                   bag_index: int = 1,
                   num_bags: int = 1,
                   ):
    """Write the bag-info.txt file for a bag

    The content of the info file looks like this:

        Bag-Software-Agent: dcoraid 1.1 <https://github.com/DCOR-dev/DCOR-Aid>
        Bagging-Date: 2026-02-12
        Payload-Oxum: 2307.10
        Bag-Group-Identifier: Name of the Circle
        Bag-Count: 23 of 8472
        External-Identifier: DATASET IDENTIFIER
        Internal-Sender-Identifier: DATASET TITLE
    """
    webloc = "https://github.com/DCOR-dev/DCOR-Aid"
    lines = [
        f"Bag-Software-Agent: dcoraid {version} <{webloc}>",
        time.strftime("Bagging-Date: %Y-%m-%d"),
        f"Payload-Oxum: {get_payload_oxum(bag_path)}",
        f"Bag-Group-Identifier: {ds_dict['organization']['name']}",
        f"Bag-Count: {bag_index} of {num_bags}",
        f"External-Identifier: {ds_dict['name']}",
        f"Internal-Sender-Identifier: {ds_dict['title']}",
    ]
    (bag_path / "bag-info.txt").write_text("\n".join(lines),
                                           encoding="utf-8")
```
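A small self-contained check of `get_payload_oxum` (file names and sizes are made up); it reproduces the `OctetCount.StreamCount` form from the docstring above:

```python
import pathlib
import tempfile

from dcoraid.bagit.info import get_payload_oxum

with tempfile.TemporaryDirectory() as td:
    bag_path = pathlib.Path(td)
    data_path = bag_path / "data"
    data_path.mkdir()
    (data_path / "file_1.rtdc").write_bytes(b"x" * 2000)
    (data_path / "file_2.txt").write_bytes(b"y" * 307)
    # 2307 octets across 2 payload files
    print(get_payload_oxum(bag_path))  # prints "2307.2"
```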

dcoraid/bagit/manifest.py

Lines changed: 109 additions & 0 deletions
```python
import hashlib
import warnings


class ManifestValidationError(Exception):
    pass


def is_bagged(bag_path):
    """Check whether a dataset has been bagged

    This checks for the existence of the BagIt metadata files
    and validates the tagmanifest file.
    """
    for name in ["bagit.txt", "bag-info.txt", "manifest-sha256.txt"]:
        if not (bag_path / name).exists():
            return False

    tagmanifest_path = bag_path / "tagmanifest-sha256.txt"
    if not tagmanifest_path.exists():
        return False
    try:
        validate_tag_manifest(bag_path)
    except ManifestValidationError:
        return False
    return True


def hash_file(path):
    """Return the SHA256 hexdigest of a file, reading it in 1 MiB chunks"""
    hasher = hashlib.sha256()
    with path.open("rb") as fd:
        while chunk := fd.read(1024 ** 2):
            hasher.update(chunk)
    return hasher.hexdigest()


def write_manifest(bag_path, ds_dict):
    """Write a manifest-sha256.txt to the bag directory

    Manifest files look like this:

        e91f941be5973ff71f1dccb743da38b1689 data/file_1.rtdc
        ee792190d28d8a7abdb0ec805a2618e4573 data/file_2.txt
    """
    manifest = []
    for res_dict in ds_dict["resources"]:
        manifest.append(f"{res_dict['sha256']} data/{res_dict['name']}")
        if res_dict.get("mimetype") == "RT-DC":
            pp_res = bag_path / "data" / res_dict["name"]
            pp_cond = pp_res.with_name(f"{pp_res.stem}_condensed.rtdc")
            # compute the hash for the condensed file
            if pp_cond.exists():
                manifest.append(f"{hash_file(pp_cond)} data/{pp_cond.name}")
            else:
                warnings.warn(f"No condensed file found for {pp_res}")

    # add dataset.json
    json_hash = hash_file(bag_path / "data" / "dataset.json")
    manifest.append(f"{json_hash} data/dataset.json")

    (bag_path / "manifest-sha256.txt").write_text("\n".join(manifest),
                                                  encoding="utf-8")

    # write bag declaration
    (bag_path / "bagit.txt").write_text(
        "\n".join(["BagIt-Version: 0.97",
                   "Tag-File-Character-Encoding: UTF-8"]),
        encoding="utf-8")

    write_tag_manifest(bag_path)


def write_tag_manifest(bag_path):
    """Write a tagmanifest-sha256.txt file

    This is just a manifest file for the other txt metadata files.
    """
    tag_manifest = []
    for pp in bag_path.glob("*.txt"):
        pp_hash = hashlib.sha256(pp.read_bytes()).hexdigest()
        tag_manifest.append(f"{pp_hash} {pp.name}")
    (bag_path / "tagmanifest-sha256.txt").write_text("\n".join(tag_manifest),
                                                     encoding="utf-8")


def validate_tag_manifest(bag_path):
    """Validate the tagmanifest-sha256.txt file

    Raises
    ------
    ManifestValidationError
        When a hash is missing or a hash does not match
    """
    # read hashes
    hash_dict = {}
    lines = (bag_path / "tagmanifest-sha256.txt").read_text().splitlines()
    for line in lines:
        pp_hash, pp_name = line.split(" ", 1)
        hash_dict[pp_name] = pp_hash

    for pp in bag_path.glob("*.txt"):
        if pp.name == "tagmanifest-sha256.txt":
            # the tagmanifest does not list a hash for itself
            continue
        if pp.name not in hash_dict:
            raise ManifestValidationError(f"No hash associated with {pp}")
        if hash_dict[pp.name] != hash_file(pp):
            raise ManifestValidationError(f"Hash mismatch for {pp}")
```
