Skip to content

Commit 416eaca

Browse files
disagg: limit S3 read amplification on compute nodes (#10771) (#10789)
Close #10752. Add a node-level S3 byte limiter and wire it into both direct reads and FileCache downloads. Avoid duplicate S3 downloads for the same key by adding a bounded wait on in-flight FileCache entries. Add phase-2 remote cache observability for the bounded wait and background download stages. Refine the FileCache and S3 read paths for maintainability, comments, failure handling, and reload consistency. Add an English design document for the latest disagg S3 backpressure plan. Signed-off-by: ti-chi-bot <ti-community-prow-bot@tidb.io> Signed-off-by: JaySon-Huang <tshent@qq.com> Co-authored-by: JaySon <tshent@qq.com> Co-authored-by: JaySon-Huang <tshent@qq.com>
1 parent 2265aeb commit 416eaca

34 files changed

+3892
-460
lines changed

dbms/src/Common/FailPoint.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ namespace DB
115115
M(exception_when_fetch_disagg_pages) \
116116
M(cop_send_failure) \
117117
M(file_cache_fg_download_fail) \
118+
M(file_cache_bg_download_fail) \
119+
M(file_cache_bg_download_schedule_fail) \
118120
M(force_set_parallel_prehandle_threshold) \
119121
M(force_raise_prehandle_exception) \
120122
M(force_agg_on_partial_block) \
@@ -135,6 +137,7 @@ namespace DB
135137
M(force_join_v2_probe_disable_lm) \
136138
M(force_s3_random_access_file_init_fail) \
137139
M(force_s3_random_access_file_read_fail) \
140+
M(force_s3_random_access_file_seek_chunked) \
138141
M(force_release_snap_meet_null_storage)
139142

140143
#define APPLY_FOR_PAUSEABLE_FAILPOINTS_ONCE(M) \

dbms/src/Common/TiFlashMetrics.cpp

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,27 @@
2222

2323
namespace DB
2424
{
25+
namespace
26+
{
27+
constexpr std::array remote_cache_file_type_labels = {"merged", "coldata", "other"};
28+
constexpr std::array remote_cache_wait_result_labels = {"hit", "timeout", "failed"};
29+
constexpr std::array remote_cache_reject_reason_labels = {"too_many_download"};
30+
constexpr std::array remote_cache_download_stage_labels = {"queue_wait", "download"};
31+
constexpr auto remote_cache_wait_on_downloading_buckets = ExpBuckets{0.0001, 2, 20};
32+
constexpr auto remote_cache_bg_download_stage_buckets = ExpBuckets{0.0001, 2, 20};
33+
34+
static_assert(
35+
remote_cache_file_type_labels.size() == static_cast<size_t>(TiFlashMetrics::RemoteCacheFileTypeMetric::Count));
36+
static_assert(
37+
remote_cache_wait_result_labels.size() == static_cast<size_t>(TiFlashMetrics::RemoteCacheWaitResultMetric::Count));
38+
static_assert(
39+
remote_cache_reject_reason_labels.size()
40+
== static_cast<size_t>(TiFlashMetrics::RemoteCacheRejectReasonMetric::Count));
41+
static_assert(
42+
remote_cache_download_stage_labels.size()
43+
== static_cast<size_t>(TiFlashMetrics::RemoteCacheDownloadStageMetric::Count));
44+
} // namespace
45+
2546
TiFlashMetrics & TiFlashMetrics::instance()
2647
{
2748
static TiFlashMetrics inst; // Instantiated on first use.
@@ -78,6 +99,82 @@ TiFlashMetrics::TiFlashMetrics()
7899
.Name("tiflash_storage_s3_store_summary_bytes")
79100
.Help("S3 storage summary bytes by store and file type")
80101
.Register(*registry);
102+
103+
registered_remote_cache_wait_on_downloading_result_family
104+
= &prometheus::BuildCounter()
105+
.Name("tiflash_storage_remote_cache_wait_on_downloading_result")
106+
.Help("Bounded wait result of remote cache downloading")
107+
.Register(*registry);
108+
registered_remote_cache_wait_on_downloading_bytes_family
109+
= &prometheus::BuildCounter()
110+
.Name("tiflash_storage_remote_cache_wait_on_downloading_bytes")
111+
.Help("Bytes covered by remote cache bounded wait")
112+
.Register(*registry);
113+
// Timeline for one cache miss with possible follower requests:
114+
//
115+
// req A: miss -> create Empty -> enqueue bg task ---- queue_wait ---- download ---- Complete/Failed
116+
// req B: sees Empty -> -------- wait_on_downloading_seconds --------> hit/timeout/failed
117+
// req C: sees Empty -> --- wait_on_downloading_seconds ---> hit/timeout/failed
118+
//
119+
// `tiflash_storage_remote_cache_bg_download_stage_seconds`
120+
// - downloader-task view
121+
// - measures how long the background download itself spent in `queue_wait` and `download`
122+
registered_remote_cache_bg_download_stage_seconds_family
123+
= &prometheus::BuildHistogram()
124+
.Name("tiflash_storage_remote_cache_bg_download_stage_seconds")
125+
.Help("Remote cache background download stage duration")
126+
.Register(*registry);
127+
// `tiflash_storage_remote_cache_wait_on_downloading_seconds`
128+
// - follower-request view
129+
// - measures how long a request waited on an existing `Empty` segment before ending as hit/timeout/failed
130+
registered_remote_cache_wait_on_downloading_seconds_family
131+
= &prometheus::BuildHistogram()
132+
.Name("tiflash_storage_remote_cache_wait_on_downloading_seconds")
133+
.Help("Bounded wait duration of remote cache downloading")
134+
.Register(*registry);
135+
registered_remote_cache_reject_family = &prometheus::BuildCounter()
136+
.Name("tiflash_storage_remote_cache_reject")
137+
.Help("Remote cache admission rejection by reason and file type")
138+
.Register(*registry);
139+
140+
for (size_t file_type_idx = 0; file_type_idx < remote_cache_file_type_labels.size(); ++file_type_idx)
141+
{
142+
for (size_t result_idx = 0; result_idx < remote_cache_wait_result_labels.size(); ++result_idx)
143+
{
144+
auto labels = prometheus::Labels{
145+
{"result", std::string(remote_cache_wait_result_labels[result_idx])},
146+
{"file_type", std::string(remote_cache_file_type_labels[file_type_idx])},
147+
};
148+
remote_cache_wait_on_downloading_result_metrics[file_type_idx][result_idx]
149+
= &registered_remote_cache_wait_on_downloading_result_family->Add(labels);
150+
remote_cache_wait_on_downloading_bytes_metrics[file_type_idx][result_idx]
151+
= &registered_remote_cache_wait_on_downloading_bytes_family->Add(labels);
152+
prometheus::Histogram::BucketBoundaries wait_buckets = ExpBuckets{
153+
remote_cache_wait_on_downloading_buckets.start,
154+
remote_cache_wait_on_downloading_buckets.base,
155+
remote_cache_wait_on_downloading_buckets.size};
156+
remote_cache_wait_on_downloading_seconds_metrics[file_type_idx][result_idx]
157+
= &registered_remote_cache_wait_on_downloading_seconds_family->Add(labels, wait_buckets);
158+
}
159+
for (size_t reason_idx = 0; reason_idx < remote_cache_reject_reason_labels.size(); ++reason_idx)
160+
{
161+
remote_cache_reject_metrics[file_type_idx][reason_idx] = &registered_remote_cache_reject_family->Add(
162+
{{"reason", std::string(remote_cache_reject_reason_labels[reason_idx])},
163+
{"file_type", std::string(remote_cache_file_type_labels[file_type_idx])}});
164+
}
165+
for (size_t stage_idx = 0; stage_idx < remote_cache_download_stage_labels.size(); ++stage_idx)
166+
{
167+
prometheus::Histogram::BucketBoundaries buckets = ExpBuckets{
168+
remote_cache_bg_download_stage_buckets.start,
169+
remote_cache_bg_download_stage_buckets.base,
170+
remote_cache_bg_download_stage_buckets.size};
171+
remote_cache_bg_download_stage_seconds_metrics[file_type_idx][stage_idx]
172+
= &registered_remote_cache_bg_download_stage_seconds_family->Add(
173+
{{"stage", std::string(remote_cache_download_stage_labels[stage_idx])},
174+
{"file_type", std::string(remote_cache_file_type_labels[file_type_idx])}},
175+
buckets);
176+
}
177+
}
81178
}
82179

83180
void TiFlashMetrics::addReplicaSyncRU(UInt32 keyspace_id, UInt64 ru)
@@ -287,4 +384,41 @@ void TiFlashMetrics::setS3StoreSummaryBytes(UInt64 store_id, UInt64 data_file_by
287384
it->second.data_file_bytes->Set(data_file_bytes);
288385
it->second.dt_file_bytes->Set(dt_file_bytes);
289386
}
387+
388+
/// Returns the `tiflash_storage_remote_cache_wait_on_downloading_result` counter
/// registered for the given (file_type, result) label pair.
/// Both enum values are used directly as array indices; no range check is done here.
prometheus::Counter & TiFlashMetrics::getRemoteCacheWaitOnDownloadingResultCounter(
    TiFlashMetrics::RemoteCacheFileTypeMetric file_type,
    TiFlashMetrics::RemoteCacheWaitResultMetric result)
{
    const auto file_type_idx = static_cast<size_t>(file_type);
    const auto result_idx = static_cast<size_t>(result);
    return *remote_cache_wait_on_downloading_result_metrics[file_type_idx][result_idx];
}
395+
396+
/// Returns the `tiflash_storage_remote_cache_wait_on_downloading_bytes` counter
/// registered for the given (file_type, result) label pair.
/// Both enum values are used directly as array indices; no range check is done here.
prometheus::Counter & TiFlashMetrics::getRemoteCacheWaitOnDownloadingBytesCounter(
    TiFlashMetrics::RemoteCacheFileTypeMetric file_type,
    TiFlashMetrics::RemoteCacheWaitResultMetric result)
{
    const auto file_type_idx = static_cast<size_t>(file_type);
    const auto result_idx = static_cast<size_t>(result);
    return *remote_cache_wait_on_downloading_bytes_metrics[file_type_idx][result_idx];
}
402+
403+
/// Returns the `tiflash_storage_remote_cache_wait_on_downloading_seconds` histogram
/// registered for the given (file_type, result) label pair.
/// Both enum values are used directly as array indices; no range check is done here.
prometheus::Histogram & TiFlashMetrics::getRemoteCacheWaitOnDownloadingSecondsHistogram(
    TiFlashMetrics::RemoteCacheFileTypeMetric file_type,
    TiFlashMetrics::RemoteCacheWaitResultMetric result)
{
    const auto file_type_idx = static_cast<size_t>(file_type);
    const auto result_idx = static_cast<size_t>(result);
    return *remote_cache_wait_on_downloading_seconds_metrics[file_type_idx][result_idx];
}
410+
411+
/// Returns the `tiflash_storage_remote_cache_bg_download_stage_seconds` histogram
/// registered for the given (file_type, stage) label pair.
/// Both enum values are used directly as array indices; no range check is done here.
prometheus::Histogram & TiFlashMetrics::getRemoteCacheBgDownloadStageSecondsHistogram(
    TiFlashMetrics::RemoteCacheFileTypeMetric file_type,
    TiFlashMetrics::RemoteCacheDownloadStageMetric stage)
{
    const auto file_type_idx = static_cast<size_t>(file_type);
    const auto stage_idx = static_cast<size_t>(stage);
    return *remote_cache_bg_download_stage_seconds_metrics[file_type_idx][stage_idx];
}
417+
418+
/// Returns the `tiflash_storage_remote_cache_reject` counter registered for the
/// given (file_type, reason) label pair.
/// Both enum values are used directly as array indices; no range check is done here.
prometheus::Counter & TiFlashMetrics::getRemoteCacheRejectCounter(
    TiFlashMetrics::RemoteCacheFileTypeMetric file_type,
    TiFlashMetrics::RemoteCacheRejectReasonMetric reason)
{
    const auto file_type_idx = static_cast<size_t>(file_type);
    const auto reason_idx = static_cast<size_t>(reason);
    return *remote_cache_reject_metrics[file_type_idx][reason_idx];
}
290424
} // namespace DB

0 commit comments

Comments
 (0)