Skip to content

Commit 9e9d52f

Browse files
committed
implement archive chunking
Teach pkgfile how to break a single tar file into multiple cpio files. Doing this puts an upper bound on how much work a single thread has to do and makes queries on huge repos like [extra] significantly faster. On my machine, --search operations see about a 75% reduction in latency, whereas --list operations see closer to a 50% reduction in latency. Implements a suggestion from #71.
1 parent a830066 commit 9e9d52f

24 files changed

Lines changed: 382 additions & 127 deletions

build-aux/ci-test

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,15 @@ endgroup() {
2121
echo "::endgroup::"
2222
}
2323

24-
debug "Running with buildmode=$buildmode"
25-
2624
group 'setup'
2725

2826
buildmode=$1
2927
setup_flags=()
3028
builddir=build-$buildmode
3129
test_install=0
3230

31+
debug "Running with buildmode=$buildmode"
32+
3333
case $buildmode in
3434
plain)
3535
test_install=1
@@ -60,7 +60,7 @@ endgroup
6060

6161
# test
6262
group 'test'
63-
meson test -C "$builddir"
63+
meson test -C "$builddir" --print-errorlogs -v
6464
endgroup
6565

6666
# install

build-aux/test-san

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@ sanitizer=${1:-address}
66
dir=sanbuild$$
77

88
meson setup "$dir" -Db_sanitize="$sanitizer" --buildtype=debugoptimized
9-
meson test -C "$dir"
9+
ninja -C "$dir" test

meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ libcommon = static_library(
5353
src/repo.cc src/repo.hh
5454
src/result.cc src/result.hh
5555
src/update.cc src/update.hh
56+
src/queue.hh
5657
'''.split(),
5758
),
5859
dependencies: [libpcre, libarchive, libcurl, pthreads, stdcppfs],

src/archive_converter.cc

Lines changed: 74 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
#include <sys/time.h>
55

66
#include <filesystem>
7+
#include <format>
8+
#include <iostream>
79
#include <sstream>
810

911
namespace fs = std::filesystem;
@@ -12,10 +14,8 @@ namespace pkgfile {
1214

1315
// static
1416
std::unique_ptr<ArchiveConverter> ArchiveConverter::New(
15-
const std::string& reponame, int fd_in, const std::string& filename_out,
16-
int compress) {
17-
std::string tmpfile_path = filename_out;
18-
tmpfile_path.append("~");
17+
const std::string& reponame, int fd_in, std::string base_filename_out,
18+
int compress, int repo_chunk_bytes) {
1919
const char* error;
2020

2121
auto reader = ReadArchive::New(fd_in, &error);
@@ -25,15 +25,44 @@ std::unique_ptr<ArchiveConverter> ArchiveConverter::New(
2525
return nullptr;
2626
}
2727

28-
auto writer = WriteArchive::New(tmpfile_path, compress, &error);
28+
auto writer = WriteArchive::New(
29+
MakeArchiveChunkFilename(base_filename_out, 0, true), compress, &error);
2930
if (writer == nullptr) {
3031
fprintf(stderr, "error: failed to open file for writing: %s: %s\n",
31-
tmpfile_path.c_str(), error);
32+
base_filename_out.c_str(), error);
3233
return nullptr;
3334
}
3435

35-
return std::make_unique<ArchiveConverter>(reponame, std::move(reader),
36-
std::move(writer));
36+
return std::make_unique<ArchiveConverter>(
37+
reponame, std::move(base_filename_out), compress, repo_chunk_bytes,
38+
std::move(reader), std::move(writer));
39+
}
40+
41+
std::string ArchiveConverter::MakeArchiveChunkFilename(
42+
const std::string& base_filename, int chunk_number, bool tempfile) {
43+
return std::format("{}.{:03d}{}", base_filename, chunk_number,
44+
tempfile ? "~" : "");
45+
}
46+
47+
bool ArchiveConverter::NextArchiveChunk() {
48+
if (!out_->Close()) {
49+
return false;
50+
}
51+
52+
const char* error;
53+
54+
auto writer = WriteArchive::New(
55+
MakeArchiveChunkFilename(base_filename_out_, ++chunk_number_, true),
56+
compress_, &error);
57+
if (writer == nullptr) {
58+
fprintf(stderr, "error: failed to open file for writing: %s: %s\n",
59+
base_filename_out_.c_str(), error);
60+
return false;
61+
}
62+
63+
out_ = std::move(writer);
64+
65+
return true;
3766
}
3867

3968
int ArchiveConverter::WriteCpioEntry(archive_entry* ae,
@@ -50,7 +79,7 @@ int ArchiveConverter::WriteCpioEntry(archive_entry* ae,
5079
entry_data << '/' << line << '\n';
5180
}
5281

53-
const auto entry = entry_data.str();
82+
const std::string entry = entry_data.str();
5483

5584
// adjust the entry size for removing the first line and adding slashes
5685
archive_entry_set_size(ae, entry.size());
@@ -74,7 +103,7 @@ int ArchiveConverter::WriteCpioEntry(archive_entry* ae,
74103
return -errno;
75104
}
76105

77-
return 0;
106+
return entry.size();
78107
}
79108

80109
bool ArchiveConverter::Finalize() {
@@ -92,32 +121,57 @@ bool ArchiveConverter::Finalize() {
92121
{st.st_mtim.tv_sec, 0},
93122
};
94123

95-
if (utimes(out_->path().c_str(), times) < 0) {
96-
fprintf(stderr, "warning: failed to set filetimes on %s: %s\n",
97-
out_->path().c_str(), strerror(errno));
124+
for (int i = 0; i <= chunk_number_; ++i) {
125+
std::string path = MakeArchiveChunkFilename(base_filename_out_, i, true);
126+
127+
if (utimes(path.c_str(), times) < 0) {
128+
fprintf(stderr, "warning: failed to set filetimes on %s: %s\n",
129+
out_->path().c_str(), strerror(errno));
130+
}
131+
132+
const fs::path dest = path.substr(0, path.size() - 1);
133+
134+
std::error_code ec;
135+
if (fs::rename(path, dest, ec); ec.value() != 0) {
136+
fprintf(stderr, "error: renaming tmpfile to %s failed: %s\n",
137+
dest.c_str(), ec.message().c_str());
138+
}
98139
}
99140

100-
auto dest = out_->path().substr(0, out_->path().size() - 1);
141+
for (int i = chunk_number_ + 1;; ++i) {
142+
std::string path = MakeArchiveChunkFilename(base_filename_out_, i, false);
101143

102-
std::error_code ec;
103-
if (fs::rename(out_->path(), dest, ec); ec.value() != 0) {
104-
fprintf(stderr, "error: renaming tmpfile to %s failed: %s\n", dest.c_str(),
105-
ec.message().c_str());
106-
return false;
144+
if (unlink(path.c_str()) != 0) {
145+
break;
146+
}
107147
}
108148

109149
return true;
110150
}
111151

112152
bool ArchiveConverter::RewriteArchive() {
113153
archive_entry* ae;
154+
int chunk_size = 0;
155+
114156
while (archive_read_next_header(in_->read_archive(), &ae) == ARCHIVE_OK) {
157+
if (chunk_size > repo_chunk_bytes_) {
158+
if (!NextArchiveChunk()) {
159+
return false;
160+
}
161+
162+
chunk_size = 0;
163+
}
164+
115165
fs::path entryname = archive_entry_pathname(ae);
166+
116167
// ignore everything but the /files metadata
117168
if (entryname.filename() == "files") {
118-
if (WriteCpioEntry(ae, entryname) < 0) {
169+
const int bytes_written = WriteCpioEntry(ae, entryname);
170+
if (bytes_written < 0) {
119171
return false;
120172
}
173+
174+
chunk_size += bytes_written;
121175
}
122176
}
123177

src/archive_converter.hh

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,15 @@ namespace pkgfile {
1717
// packed in CPIO).
1818
class ArchiveConverter {
1919
public:
20-
ArchiveConverter(std::string reponame, std::unique_ptr<ReadArchive> in,
20+
ArchiveConverter(std::string reponame, std::string base_filename_out,
21+
int compress, int repo_chunk_bytes,
22+
std::unique_ptr<ReadArchive> in,
2123
std::unique_ptr<WriteArchive> out)
2224
: reponame_(std::move(reponame)),
25+
base_filename_out_(std::move(base_filename_out)),
26+
compress_(compress),
27+
repo_chunk_bytes_(repo_chunk_bytes <= 0 ? kDefaultRepoChunkMax
28+
: repo_chunk_bytes),
2329
in_(std::move(in)),
2430
out_(std::move(out)) {}
2531

@@ -31,19 +37,32 @@ class ArchiveConverter {
3137

3238
static std::unique_ptr<ArchiveConverter> New(const std::string& reponame,
3339
int fd_in,
34-
const std::string& filename_out,
35-
int compress);
40+
std::string base_filename_out,
41+
int compress,
42+
int repo_chunk_bytes);
3643

3744
bool RewriteArchive();
3845

3946
private:
47+
static constexpr int kDefaultRepoChunkMax = 40 * (1 << 20);
48+
4049
int WriteCpioEntry(archive_entry* ae, const std::filesystem::path& entryname);
4150
bool Finalize();
4251

52+
static std::string MakeArchiveChunkFilename(const std::string& base_filename,
53+
int chunk_number, bool tempfile);
54+
55+
bool NextArchiveChunk();
56+
4357
std::string reponame_;
44-
std::string destfile_;
58+
std::string base_filename_out_;
59+
int compress_;
60+
int repo_chunk_bytes_;
61+
4562
std::unique_ptr<ReadArchive> in_;
4663
std::unique_ptr<WriteArchive> out_;
64+
65+
int chunk_number_ = 0;
4766
};
4867

4968
} // namespace pkgfile

0 commit comments

Comments
 (0)