From d38c5488fdb8d1bb092089de08381bcd5595252d Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Tue, 24 Mar 2026 23:43:07 +0800 Subject: [PATCH 01/36] disagg: add configurable node-level S3 read limiter --- dbms/src/Common/CurrentMetrics.cpp | 1 + dbms/src/Common/TiFlashMetrics.h | 18 ++ dbms/src/IO/BaseFile/IORateLimitConfig.cpp | 14 +- dbms/src/IO/BaseFile/IORateLimitConfig.h | 4 + dbms/src/IO/BaseFile/RateLimiter.cpp | 21 ++ dbms/src/IO/BaseFile/RateLimiter.h | 6 + dbms/src/Server/Server.cpp | 2 + .../src/Server/tests/gtest_storage_config.cpp | 22 ++ dbms/src/Storages/S3/MockS3Client.h | 12 +- dbms/src/Storages/S3/S3Common.cpp | 21 +- dbms/src/Storages/S3/S3Common.h | 34 ++- dbms/src/Storages/S3/S3ReadLimiter.cpp | 240 ++++++++++++++++++ dbms/src/Storages/S3/S3ReadLimiter.h | 122 +++++++++ dbms/src/Storages/S3/tests/gtest_s3client.cpp | 22 ++ 14 files changed, 527 insertions(+), 12 deletions(-) create mode 100644 dbms/src/Storages/S3/S3ReadLimiter.cpp create mode 100644 dbms/src/Storages/S3/S3ReadLimiter.h diff --git a/dbms/src/Common/CurrentMetrics.cpp b/dbms/src/Common/CurrentMetrics.cpp index 5cb62909ae1..97fd4987f9a 100644 --- a/dbms/src/Common/CurrentMetrics.cpp +++ b/dbms/src/Common/CurrentMetrics.cpp @@ -86,6 +86,7 @@ M(RegionPersisterRunMode) \ M(S3Requests) \ M(S3RandomAccessFile) \ + M(S3ActiveGetObjectStreams) \ M(GlobalStorageRunMode) \ M(GlobalThread) \ M(GlobalThreadActive) \ diff --git a/dbms/src/Common/TiFlashMetrics.h b/dbms/src/Common/TiFlashMetrics.h index 26740735c4a..6dba9f1735b 100644 --- a/dbms/src/Common/TiFlashMetrics.h +++ b/dbms/src/Common/TiFlashMetrics.h @@ -791,6 +791,24 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_head_object, {{"type", "head_object"}}, ExpBuckets{0.001, 2, 20}), \ F(type_read_stream, {{"type", "read_stream"}}, ExpBuckets{0.0001, 2, 20}), \ F(type_read_stream_err, {{"type", "read_stream_err"}}, ExpBuckets{0.0001, 2, 20})) \ + M(tiflash_storage_s3_read_limiter, \ + "S3 
read limiter counters", \ + Counter, \ + F(type_stream_wait_count, {{"type", "stream_wait_count"}}), \ + F(type_byte_wait_count, {{"type", "byte_wait_count"}}), \ + F(type_direct_read_bytes, {{"type", "direct_read_bytes"}}), \ + F(type_filecache_download_bytes, {{"type", "filecache_download_bytes"}})) \ + M(tiflash_storage_s3_read_limiter_wait_seconds, \ + "S3 read limiter wait duration in seconds", \ + Histogram, \ + F(type_stream_wait, {{"type", "stream_wait"}}, ExpBuckets{0.0001, 2, 20}), \ + F(type_byte_wait, {{"type", "byte_wait"}}, ExpBuckets{0.0001, 2, 20})) \ + M(tiflash_storage_s3_read_limiter_status, \ + "S3 read limiter status", \ + Gauge, \ + F(type_active_get_object_streams, {{"type", "active_get_object_streams"}}), \ + F(type_max_get_object_streams, {{"type", "max_get_object_streams"}}), \ + F(type_max_read_bytes_per_sec, {{"type", "max_read_bytes_per_sec"}})) \ M(tiflash_storage_s3_http_request_seconds, \ "S3 request duration breakdown in seconds", \ Histogram, \ diff --git a/dbms/src/IO/BaseFile/IORateLimitConfig.cpp b/dbms/src/IO/BaseFile/IORateLimitConfig.cpp index 90bc9abd5e4..7427e83322a 100644 --- a/dbms/src/IO/BaseFile/IORateLimitConfig.cpp +++ b/dbms/src/IO/BaseFile/IORateLimitConfig.cpp @@ -52,6 +52,8 @@ void IORateLimitConfig::parse(const String & storage_io_rate_limit, const Logger readConfig(config, "max_bytes_per_sec", max_bytes_per_sec); readConfig(config, "max_read_bytes_per_sec", max_read_bytes_per_sec); readConfig(config, "max_write_bytes_per_sec", max_write_bytes_per_sec); + readConfig(config, "s3_max_read_bytes_per_sec", s3_max_read_bytes_per_sec); + readConfig(config, "s3_max_get_object_streams", s3_max_get_object_streams); readConfig(config, "foreground_write_weight", fg_write_weight); readConfig(config, "background_write_weight", bg_write_weight); readConfig(config, "foreground_read_weight", fg_read_weight); @@ -72,6 +74,7 @@ std::string IORateLimitConfig::toString() const { return fmt::format( 
"IORateLimitConfig{{max_bytes_per_sec={} max_read_bytes_per_sec={} max_write_bytes_per_sec={} " + "s3_max_read_bytes_per_sec={} s3_max_get_object_streams={} " "use_max_bytes_per_sec={} " "fg_write_weight={} bg_write_weight={} fg_read_weight={} bg_read_weight={} " "fg_write_max_bytes_per_sec={} bg_write_max_bytes_per_sec={} " @@ -80,6 +83,8 @@ std::string IORateLimitConfig::toString() const max_bytes_per_sec, max_read_bytes_per_sec, max_write_bytes_per_sec, + s3_max_read_bytes_per_sec, + s3_max_get_object_streams, use_max_bytes_per_sec, fg_write_weight, bg_write_weight, @@ -165,9 +170,12 @@ UInt64 IORateLimitConfig::getReadMaxBytesPerSec() const bool IORateLimitConfig::operator==(const IORateLimitConfig & config) const { return config.max_bytes_per_sec == max_bytes_per_sec && config.max_read_bytes_per_sec == max_read_bytes_per_sec - && config.max_write_bytes_per_sec == max_write_bytes_per_sec && config.bg_write_weight == bg_write_weight - && config.fg_write_weight == fg_write_weight && config.bg_read_weight == bg_read_weight - && config.fg_read_weight == fg_read_weight && config.emergency_pct == emergency_pct + && config.max_write_bytes_per_sec == max_write_bytes_per_sec + && config.s3_max_read_bytes_per_sec == s3_max_read_bytes_per_sec + && config.s3_max_get_object_streams == s3_max_get_object_streams + && config.bg_write_weight == bg_write_weight && config.fg_write_weight == fg_write_weight + && config.bg_read_weight == bg_read_weight && config.fg_read_weight == fg_read_weight + && config.emergency_pct == emergency_pct && config.high_pct == high_pct && config.medium_pct == medium_pct && config.tune_base == tune_base && config.min_bytes_per_sec == min_bytes_per_sec && config.auto_tune_sec == auto_tune_sec; } diff --git a/dbms/src/IO/BaseFile/IORateLimitConfig.h b/dbms/src/IO/BaseFile/IORateLimitConfig.h index f8901e776ce..45e1fc0e685 100644 --- a/dbms/src/IO/BaseFile/IORateLimitConfig.h +++ b/dbms/src/IO/BaseFile/IORateLimitConfig.h @@ -28,6 +28,8 @@ struct 
IORateLimitConfig // For disk that read bandwidth and write bandwith are calculated separately, such as GCP's persistent disks. UInt64 max_read_bytes_per_sec; UInt64 max_write_bytes_per_sec; + UInt64 s3_max_read_bytes_per_sec; + UInt64 s3_max_get_object_streams; // only true when both max_read_bytes_per_sec and max_write_bytes_per_sec are 0 bool use_max_bytes_per_sec; @@ -54,6 +56,8 @@ struct IORateLimitConfig : max_bytes_per_sec(0) , max_read_bytes_per_sec(0) , max_write_bytes_per_sec(0) + , s3_max_read_bytes_per_sec(0) + , s3_max_get_object_streams(0) , use_max_bytes_per_sec(true) // only limit background write by default , fg_write_weight(0) diff --git a/dbms/src/IO/BaseFile/RateLimiter.cpp b/dbms/src/IO/BaseFile/RateLimiter.cpp index b36df903586..c3ae14b4872 100644 --- a/dbms/src/IO/BaseFile/RateLimiter.cpp +++ b/dbms/src/IO/BaseFile/RateLimiter.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -482,6 +483,12 @@ ReadLimiterPtr IORateLimiter::getReadLimiter() return is_background_thread ? 
bg_read_limiter : fg_read_limiter; } +std::shared_ptr IORateLimiter::getS3ReadLimiter() +{ + std::lock_guard lock(limiter_mtx); + return s3_read_limiter; +} + void IORateLimiter::updateConfig(Poco::Util::AbstractConfiguration & config_) { if (!reloadConfig(config_)) @@ -518,6 +525,20 @@ void IORateLimiter::updateLimiterByConfig(const IORateLimitConfig & cfg) std::lock_guard lock(limiter_mtx); updateReadLimiter(cfg.getBgReadMaxBytesPerSec(), cfg.getFgReadMaxBytesPerSec()); updateWriteLimiter(cfg.getBgWriteMaxBytesPerSec(), cfg.getFgWriteMaxBytesPerSec()); + if (cfg.s3_max_read_bytes_per_sec == 0 && cfg.s3_max_get_object_streams == 0) + { + s3_read_limiter = nullptr; + } + else if (s3_read_limiter == nullptr) + { + s3_read_limiter = std::make_shared( + cfg.s3_max_read_bytes_per_sec, + cfg.s3_max_get_object_streams); + } + else + { + s3_read_limiter->updateConfig(cfg.s3_max_read_bytes_per_sec, cfg.s3_max_get_object_streams); + } } void IORateLimiter::updateReadLimiter(Int64 bg_bytes, Int64 fg_bytes) diff --git a/dbms/src/IO/BaseFile/RateLimiter.h b/dbms/src/IO/BaseFile/RateLimiter.h index 3fce18e6e6e..96037169108 100644 --- a/dbms/src/IO/BaseFile/RateLimiter.h +++ b/dbms/src/IO/BaseFile/RateLimiter.h @@ -33,6 +33,10 @@ namespace DB { class LimiterStat; class IOLimitTuner; +namespace S3 +{ +class S3ReadLimiter; +} enum class LimiterType { @@ -216,6 +220,7 @@ class IORateLimiter WriteLimiterPtr getBgWriteLimiter(); ReadLimiterPtr getReadLimiter(); + std::shared_ptr getS3ReadLimiter(); void init(Poco::Util::AbstractConfiguration & config_); void updateConfig(Poco::Util::AbstractConfiguration & config_); @@ -250,6 +255,7 @@ class IORateLimiter // Background read and foreground read ReadLimiterPtr bg_read_limiter; ReadLimiterPtr fg_read_limiter; + std::shared_ptr s3_read_limiter; std::mutex bg_thread_ids_mtx; std::vector bg_thread_ids; diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp index 1fcb9f1d0e9..8f08296c604 100644 --- a/dbms/src/Server/Server.cpp 
+++ b/dbms/src/Server/Server.cpp @@ -942,6 +942,7 @@ try /// Initialize RateLimiter. global_context->initializeRateLimiter(config(), bg_pool, blockable_bg_pool); + S3::ClientFactory::instance().setS3ReadLimiter(global_context->getIORateLimiter().getS3ReadLimiter()); global_context->setServerInfo(server_info); if (server_info.memory_info.capacity == 0) @@ -971,6 +972,7 @@ try buildLoggers(*config); global_context->getTMTContext().reloadConfig(*config); global_context->getIORateLimiter().updateConfig(*config); + S3::ClientFactory::instance().setS3ReadLimiter(global_context->getIORateLimiter().getS3ReadLimiter()); global_context->reloadDeltaTreeConfig(*config); DM::SegmentReadTaskScheduler::instance().updateConfig(global_context->getSettingsRef()); if (FileCache::instance() != nullptr) diff --git a/dbms/src/Server/tests/gtest_storage_config.cpp b/dbms/src/Server/tests/gtest_storage_config.cpp index 4d4561069cc..fefc45e3a31 100644 --- a/dbms/src/Server/tests/gtest_storage_config.cpp +++ b/dbms/src/Server/tests/gtest_storage_config.cpp @@ -595,6 +595,8 @@ try max_bytes_per_sec=0 max_read_bytes_per_sec=0 max_write_bytes_per_sec=0 +s3_max_read_bytes_per_sec=0 +s3_max_get_object_streams=0 foreground_write_weight=1 background_write_weight=2 foreground_read_weight=5 @@ -606,6 +608,8 @@ background_read_weight=2 max_bytes_per_sec=1024000 max_read_bytes_per_sec=0 max_write_bytes_per_sec=0 +s3_max_read_bytes_per_sec=2048000 +s3_max_get_object_streams=256 foreground_write_weight=1 background_write_weight=2 foreground_read_weight=5 @@ -617,6 +621,8 @@ background_read_weight=2 max_bytes_per_sec=0 max_read_bytes_per_sec=1024000 max_write_bytes_per_sec=1024000 +s3_max_read_bytes_per_sec=1024 +s3_max_get_object_streams=8 foreground_write_weight=1 background_write_weight=2 foreground_read_weight=5 @@ -628,6 +634,8 @@ background_read_weight=2 max_bytes_per_sec=1024000 max_read_bytes_per_sec=1024000 max_write_bytes_per_sec=1024000 +s3_max_read_bytes_per_sec=4096 
+s3_max_get_object_streams=16 foreground_write_weight=1 background_write_weight=2 foreground_read_weight=5 @@ -638,6 +646,8 @@ background_read_weight=2 [storage] [storage.io_rate_limit] max_bytes_per_sec=1024000 + s3_max_read_bytes_per_sec=8192 + s3_max_get_object_streams=32 foreground_write_weight=80 background_write_weight=20 foreground_read_weight=0 @@ -651,6 +661,8 @@ background_read_weight=2 ASSERT_EQ(io_config.max_bytes_per_sec, 0); ASSERT_EQ(io_config.max_read_bytes_per_sec, 0); ASSERT_EQ(io_config.max_write_bytes_per_sec, 0); + ASSERT_EQ(io_config.s3_max_read_bytes_per_sec, 0); + ASSERT_EQ(io_config.s3_max_get_object_streams, 0); ASSERT_TRUE(io_config.use_max_bytes_per_sec); ASSERT_EQ(io_config.fg_write_weight, 0); ASSERT_EQ(io_config.bg_write_weight, 100); @@ -669,6 +681,8 @@ background_read_weight=2 ASSERT_EQ(io_config.max_bytes_per_sec, 0); ASSERT_EQ(io_config.max_read_bytes_per_sec, 0); ASSERT_EQ(io_config.max_write_bytes_per_sec, 0); + ASSERT_EQ(io_config.s3_max_read_bytes_per_sec, 0); + ASSERT_EQ(io_config.s3_max_get_object_streams, 0); ASSERT_TRUE(io_config.use_max_bytes_per_sec); ASSERT_EQ(io_config.fg_write_weight, 1); ASSERT_EQ(io_config.bg_write_weight, 2); @@ -687,6 +701,8 @@ background_read_weight=2 ASSERT_EQ(io_config.max_bytes_per_sec, 1024000); ASSERT_EQ(io_config.max_read_bytes_per_sec, 0); ASSERT_EQ(io_config.max_write_bytes_per_sec, 0); + ASSERT_EQ(io_config.s3_max_read_bytes_per_sec, 2048000); + ASSERT_EQ(io_config.s3_max_get_object_streams, 256); ASSERT_TRUE(io_config.use_max_bytes_per_sec); ASSERT_EQ(io_config.fg_write_weight, 1); ASSERT_EQ(io_config.bg_write_weight, 2); @@ -705,6 +721,8 @@ background_read_weight=2 ASSERT_EQ(io_config.max_bytes_per_sec, 0); // ignored ASSERT_EQ(io_config.max_read_bytes_per_sec, 1024000); ASSERT_EQ(io_config.max_write_bytes_per_sec, 1024000); + ASSERT_EQ(io_config.s3_max_read_bytes_per_sec, 1024); + ASSERT_EQ(io_config.s3_max_get_object_streams, 8); ASSERT_FALSE(io_config.use_max_bytes_per_sec); // use 
max_read_bytes_per_sec and max_write_bytes_per_sec ASSERT_EQ(io_config.fg_write_weight, 1); ASSERT_EQ(io_config.bg_write_weight, 2); @@ -725,6 +743,8 @@ background_read_weight=2 ASSERT_EQ(io_config.max_bytes_per_sec, 1024000); // ignored ASSERT_EQ(io_config.max_read_bytes_per_sec, 1024000); ASSERT_EQ(io_config.max_write_bytes_per_sec, 1024000); + ASSERT_EQ(io_config.s3_max_read_bytes_per_sec, 4096); + ASSERT_EQ(io_config.s3_max_get_object_streams, 16); ASSERT_FALSE(io_config.use_max_bytes_per_sec); // use max_read_bytes_per_sec and max_write_bytes_per_sec ASSERT_EQ(io_config.fg_write_weight, 1); ASSERT_EQ(io_config.bg_write_weight, 2); @@ -745,6 +765,8 @@ background_read_weight=2 ASSERT_EQ(io_config.max_bytes_per_sec, 1024000); ASSERT_EQ(io_config.max_read_bytes_per_sec, 0); ASSERT_EQ(io_config.max_write_bytes_per_sec, 0); + ASSERT_EQ(io_config.s3_max_read_bytes_per_sec, 8192); + ASSERT_EQ(io_config.s3_max_get_object_streams, 32); ASSERT_TRUE(io_config.use_max_bytes_per_sec); ASSERT_EQ(io_config.fg_write_weight, 80); ASSERT_EQ(io_config.bg_write_weight, 20); diff --git a/dbms/src/Storages/S3/MockS3Client.h b/dbms/src/Storages/S3/MockS3Client.h index fcd4f17e67b..2a22168a4c8 100644 --- a/dbms/src/Storages/S3/MockS3Client.h +++ b/dbms/src/Storages/S3/MockS3Client.h @@ -29,8 +29,16 @@ class MockS3Client final : public S3::TiFlashS3Client const String & bucket, const String & root, const Aws::Auth::AWSCredentials & cred, - const Aws::Client::ClientConfiguration & cfg) - : TiFlashS3Client(bucket, root, cred, cfg, Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, false) + const Aws::Client::ClientConfiguration & cfg, + std::shared_ptr s3_read_limiter = nullptr) + : TiFlashS3Client( + bucket, + root, + cred, + cfg, + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, + false, + std::move(s3_read_limiter)) {} ~MockS3Client() override = default; diff --git a/dbms/src/Storages/S3/S3Common.cpp b/dbms/src/Storages/S3/S3Common.cpp index 3543f2129b2..79677bd0607 
100644 --- a/dbms/src/Storages/S3/S3Common.cpp +++ b/dbms/src/Storages/S3/S3Common.cpp @@ -182,20 +182,24 @@ TiFlashS3Client::TiFlashS3Client( const Aws::Auth::AWSCredentials & credentials, const Aws::Client::ClientConfiguration & clientConfiguration, Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy signPayloads, - bool useVirtualAddressing) + bool useVirtualAddressing, + std::shared_ptr s3_read_limiter_) : Aws::S3::S3Client(credentials, clientConfiguration, signPayloads, useVirtualAddressing) , bucket_name(bucket_name_) , key_root(normalizedRoot(root_)) + , s3_read_limiter(std::move(s3_read_limiter_)) , log(Logger::get(fmt::format("bucket={} root={}", bucket_name, key_root))) {} TiFlashS3Client::TiFlashS3Client( const String & bucket_name_, const String & root_, - std::unique_ptr && raw_client) + std::unique_ptr && raw_client, + std::shared_ptr s3_read_limiter_) : Aws::S3::S3Client(std::move(*raw_client)) , bucket_name(bucket_name_) , key_root(normalizedRoot(root_)) + , s3_read_limiter(std::move(s3_read_limiter_)) , log(Logger::get(fmt::format("bucket={} root={}", bucket_name, key_root))) {} @@ -345,7 +349,8 @@ void ClientFactory::init(const StorageS3Config & config_, bool mock_s3_) { auto [s3_client, vendor] = create(config, log); cloud_vendor = vendor; - shared_tiflash_client = std::make_shared(config.bucket, config.root, std::move(s3_client)); + shared_tiflash_client + = std::make_shared(config.bucket, config.root, std::move(s3_client), shared_s3_read_limiter); } else { @@ -353,7 +358,12 @@ void ClientFactory::init(const StorageS3Config & config_, bool mock_s3_) Aws::Client::ClientConfiguration cfg(true, /*defaultMode=*/"standard", /*shouldDisableIMDS=*/true); cfg.region = Aws::Region::US_EAST_1; // default region Aws::Auth::AWSCredentials cred("mock_access_key", "mock_secret_key"); - shared_tiflash_client = std::make_unique(config.bucket, config.root, cred, cfg); + shared_tiflash_client = std::make_unique( + config.bucket, + config.root, + cred, + cfg, + 
shared_s3_read_limiter); } client_is_inited = true; // init finish } @@ -383,7 +393,8 @@ std::shared_ptr ClientFactory::initClientFromWriteNode() auto [s3_client, vendor] = create(config, log); cloud_vendor = vendor; - shared_tiflash_client = std::make_shared(config.bucket, config.root, std::move(s3_client)); + shared_tiflash_client + = std::make_shared(config.bucket, config.root, std::move(s3_client), shared_s3_read_limiter); client_is_inited = true; // init finish return shared_tiflash_client; } diff --git a/dbms/src/Storages/S3/S3Common.h b/dbms/src/Storages/S3/S3Common.h index 456d8c002c2..b5e2c21ce4c 100644 --- a/dbms/src/Storages/S3/S3Common.h +++ b/dbms/src/Storages/S3/S3Common.h @@ -28,6 +28,8 @@ #include #include +#include +#include namespace pingcap::kv { @@ -41,6 +43,7 @@ extern const int S3_ERROR; namespace DB::S3 { +class S3ReadLimiter; inline String S3ErrorMessage(const Aws::S3::S3Error & e) { @@ -70,12 +73,14 @@ class TiFlashS3Client : public Aws::S3::S3Client const Aws::Auth::AWSCredentials & credentials, const Aws::Client::ClientConfiguration & clientConfiguration, Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy signPayloads, - bool useVirtualAddressing); + bool useVirtualAddressing, + std::shared_ptr s3_read_limiter_ = nullptr); TiFlashS3Client( const String & bucket_name_, const String & root_, - std::unique_ptr && raw_client); + std::unique_ptr && raw_client, + std::shared_ptr s3_read_limiter_ = nullptr); const String & bucket() const { return bucket_name; } @@ -89,9 +94,25 @@ class TiFlashS3Client : public Aws::S3::S3Client req.WithBucket(bucket_name).WithKey(is_root_single_slash ? key : key_root + key); } + /// Returns the shared node-level limiter for S3 remote reads. + std::shared_ptr getS3ReadLimiter() const + { + std::lock_guard lock(s3_read_limiter_mutex); + return s3_read_limiter; + } + + /// Publish a new node-level limiter to this client. Existing and future readers share the same object. 
+ void setS3ReadLimiter(std::shared_ptr limiter) + { + std::lock_guard lock(s3_read_limiter_mutex); + s3_read_limiter = std::move(limiter); + } + private: const String bucket_name; String key_root; + mutable std::mutex s3_read_limiter_mutex; + std::shared_ptr s3_read_limiter; public: LoggerPtr log; @@ -146,6 +167,14 @@ class ClientFactory std::shared_ptr sharedTiFlashClient(); + void setS3ReadLimiter(const std::shared_ptr & limiter) + { + std::unique_lock lock_init(mtx_init); + shared_s3_read_limiter = limiter; + if (shared_tiflash_client != nullptr) + shared_tiflash_client->setS3ReadLimiter(shared_s3_read_limiter); + } + S3GCMethod gc_method = S3GCMethod::Lifecycle; CloudVendor cloud_vendor = CloudVendor::Unknown; @@ -171,6 +200,7 @@ class ClientFactory mutable std::mutex mtx_init; // protect `config` `shared_tiflash_client` `kv_cluster` StorageS3Config config; std::shared_ptr shared_tiflash_client; + std::shared_ptr shared_s3_read_limiter; pingcap::kv::Cluster * kv_cluster = nullptr; LoggerPtr log; diff --git a/dbms/src/Storages/S3/S3ReadLimiter.cpp b/dbms/src/Storages/S3/S3ReadLimiter.cpp new file mode 100644 index 00000000000..9a9b412b7e5 --- /dev/null +++ b/dbms/src/Storages/S3/S3ReadLimiter.cpp @@ -0,0 +1,240 @@ +// Copyright 2023 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include + +#include + +namespace CurrentMetrics +{ +extern const Metric S3ActiveGetObjectStreams; +} // namespace CurrentMetrics + +namespace DB::S3 +{ +namespace +{ +template +void recordWaitIfNeeded(bool waited, const Stopwatch & sw, F && observe) +{ + if (!waited) + return; + observe(sw.elapsedSeconds()); +} +} // namespace + +S3ReadLimiter::StreamToken::~StreamToken() +{ + reset(); +} + +void S3ReadLimiter::StreamToken::reset() +{ + if (owner == nullptr) + return; + owner->releaseStream(); + owner = nullptr; +} + +S3ReadLimiter::S3ReadLimiter(UInt64 max_read_bytes_per_sec_, UInt64 max_streams_, UInt64 refill_period_ms_) + : refill_period_ms(refill_period_ms_) + , max_read_bytes_per_sec(max_read_bytes_per_sec_) + , max_streams(max_streams_) + , active_streams(0) + , available_bytes(static_cast(burstBytesPerPeriod(max_read_bytes_per_sec_))) + , last_refill_time(Clock::now()) + , stop(false) + , log(Logger::get("S3ReadLimiter")) +{ + GET_METRIC(tiflash_storage_s3_read_limiter_status, type_max_read_bytes_per_sec).Set(max_read_bytes_per_sec_); + GET_METRIC(tiflash_storage_s3_read_limiter_status, type_max_get_object_streams).Set(max_streams_); + GET_METRIC(tiflash_storage_s3_read_limiter_status, type_active_get_object_streams).Set(0); +} + +S3ReadLimiter::~S3ReadLimiter() +{ + setStop(); +} + +void S3ReadLimiter::updateConfig(UInt64 max_read_bytes_per_sec_, UInt64 max_streams_) +{ + { + std::lock_guard lock(bytes_mutex); + max_read_bytes_per_sec.store(max_read_bytes_per_sec_, std::memory_order_relaxed); + available_bytes = std::min(available_bytes, static_cast(burstBytesPerPeriod(max_read_bytes_per_sec_))); + if (max_read_bytes_per_sec_ == 0) + available_bytes = 0; + last_refill_time = Clock::now(); + } + { + std::lock_guard lock(stream_mutex); + max_streams.store(max_streams_, std::memory_order_relaxed); + } + GET_METRIC(tiflash_storage_s3_read_limiter_status, type_max_read_bytes_per_sec).Set(max_read_bytes_per_sec_); + 
GET_METRIC(tiflash_storage_s3_read_limiter_status, type_max_get_object_streams).Set(max_streams_); + bytes_cv.notify_all(); + stream_cv.notify_all(); +} + +std::unique_ptr S3ReadLimiter::acquireStream() +{ + const auto limit = max_streams.load(std::memory_order_relaxed); + if (limit == 0) + return nullptr; + + Stopwatch sw; + bool waited = false; + std::unique_lock lock(stream_mutex); + while (!stop && max_streams.load(std::memory_order_relaxed) != 0 + && active_streams.load(std::memory_order_relaxed) >= max_streams.load(std::memory_order_relaxed)) + { + if (!waited) + { + GET_METRIC(tiflash_storage_s3_read_limiter, type_stream_wait_count).Increment(); + waited = true; + } + stream_cv.wait(lock); + } + + recordWaitIfNeeded(waited, sw, [](double seconds) { + GET_METRIC(tiflash_storage_s3_read_limiter_wait_seconds, type_stream_wait).Observe(seconds); + }); + + if (stop || max_streams.load(std::memory_order_relaxed) == 0) + return nullptr; + + auto cur = active_streams.fetch_add(1, std::memory_order_relaxed) + 1; + CurrentMetrics::add(CurrentMetrics::S3ActiveGetObjectStreams); + GET_METRIC(tiflash_storage_s3_read_limiter_status, type_active_get_object_streams).Set(cur); + return std::make_unique(this); +} + +void S3ReadLimiter::requestBytes(UInt64 bytes, S3ReadSource source) +{ + if (bytes == 0) + return; + + switch (source) + { + case S3ReadSource::DirectRead: + GET_METRIC(tiflash_storage_s3_read_limiter, type_direct_read_bytes).Increment(bytes); + break; + case S3ReadSource::FileCacheDownload: + GET_METRIC(tiflash_storage_s3_read_limiter, type_filecache_download_bytes).Increment(bytes); + break; + } + + const auto limit = max_read_bytes_per_sec.load(std::memory_order_relaxed); + if (limit == 0) + return; + + Stopwatch sw; + bool waited = false; + std::unique_lock lock(bytes_mutex); + SCOPE_EXIT({ + recordWaitIfNeeded(waited, sw, [](double seconds) { + GET_METRIC(tiflash_storage_s3_read_limiter_wait_seconds, type_byte_wait).Observe(seconds); + }); + }); + while 
(!stop) + { + const auto current_limit = max_read_bytes_per_sec.load(std::memory_order_relaxed); + if (current_limit == 0) + return; + + const auto now = Clock::now(); + refillBytesLocked(now); + if (available_bytes >= static_cast(bytes)) + { + available_bytes -= static_cast(bytes); + return; + } + + if (!waited) + { + GET_METRIC(tiflash_storage_s3_read_limiter, type_byte_wait_count).Increment(); + waited = true; + } + + const auto missing = static_cast(bytes) - available_bytes; + const auto wait_us = std::max( + 1, + static_cast(missing * 1000000.0 / static_cast(current_limit))); + bytes_cv.wait_for(lock, std::chrono::microseconds(wait_us)); + } +} + +UInt64 S3ReadLimiter::getSuggestedChunkSize(UInt64 preferred_chunk_size) const +{ + const auto limit = max_read_bytes_per_sec.load(std::memory_order_relaxed); + if (limit == 0) + return preferred_chunk_size; + return std::max(1, std::min(preferred_chunk_size, burstBytesPerPeriod(limit))); +} + +void S3ReadLimiter::setStop() +{ + { + std::lock_guard lock_stream(stream_mutex); + std::lock_guard lock_bytes(bytes_mutex); + if (stop) + return; + stop = true; + } + stream_cv.notify_all(); + bytes_cv.notify_all(); +} + +void S3ReadLimiter::releaseStream() +{ + auto cur = active_streams.fetch_sub(1, std::memory_order_relaxed) - 1; + CurrentMetrics::sub(CurrentMetrics::S3ActiveGetObjectStreams); + GET_METRIC(tiflash_storage_s3_read_limiter_status, type_active_get_object_streams).Set(cur); + stream_cv.notify_one(); +} + +void S3ReadLimiter::refillBytesLocked(Clock::time_point now) +{ + const auto current_limit = max_read_bytes_per_sec.load(std::memory_order_relaxed); + if (current_limit == 0) + { + available_bytes = 0; + last_refill_time = now; + return; + } + + const auto elapsed_ns + = std::chrono::duration_cast(now - last_refill_time).count(); + if (elapsed_ns <= 0) + return; + + const auto burst_bytes = static_cast(burstBytesPerPeriod(current_limit)); + available_bytes = std::min( + burst_bytes, + available_bytes + 
static_cast(current_limit) * static_cast(elapsed_ns) / 1000000000.0); + last_refill_time = now; +} + +UInt64 S3ReadLimiter::burstBytesPerPeriod(UInt64 max_read_bytes_per_sec_) const +{ + if (max_read_bytes_per_sec_ == 0) + return 0; + return std::max(1, max_read_bytes_per_sec_ * refill_period_ms / 1000); +} +} // namespace DB::S3 diff --git a/dbms/src/Storages/S3/S3ReadLimiter.h b/dbms/src/Storages/S3/S3ReadLimiter.h new file mode 100644 index 00000000000..077fd4be8ed --- /dev/null +++ b/dbms/src/Storages/S3/S3ReadLimiter.h @@ -0,0 +1,122 @@ +// Copyright 2023 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace DB::S3 +{ +enum class S3ReadSource +{ + DirectRead, + FileCacheDownload, +}; + +class S3ReadLimiter : public std::enable_shared_from_this +{ +public: + class StreamToken + { + public: + explicit StreamToken(S3ReadLimiter * owner_) + : owner(owner_) + {} + + ~StreamToken(); + + StreamToken(const StreamToken &) = delete; + StreamToken & operator=(const StreamToken &) = delete; + + StreamToken(StreamToken && other) noexcept + : owner(other.owner) + { + other.owner = nullptr; + } + + StreamToken & operator=(StreamToken && other) noexcept + { + if (this == &other) + return *this; + reset(); + owner = other.owner; + other.owner = nullptr; + return *this; + } + + void reset(); + + private: + S3ReadLimiter * owner; + }; + + /// A lightweight node-level limiter for S3 remote reads. + /// + /// It limits two dimensions together: + /// - concurrently active `GetObject` body streams + /// - total remote-read bytes consumed by direct reads and FileCache downloads + explicit S3ReadLimiter(UInt64 max_read_bytes_per_sec_ = 0, UInt64 max_streams_ = 0, UInt64 refill_period_ms_ = 100); + + ~S3ReadLimiter(); + + /// Update both byte-rate and stream limits. `0` disables the corresponding limit. + void updateConfig(UInt64 max_read_bytes_per_sec_, UInt64 max_streams_); + + /// Acquire a token that must live as long as the `GetObject` body stream remains active. + /// Returns `nullptr` when the stream limit is disabled. + [[nodiscard]] std::unique_ptr acquireStream(); + + /// Charge remote-read bytes. The call blocks when the current node-level budget is exhausted. + void requestBytes(UInt64 bytes, S3ReadSource source); + + /// Suggest a chunk size that keeps limiter-enabled readers from creating large bursts. 
+ UInt64 getSuggestedChunkSize(UInt64 preferred_chunk_size) const; + + UInt64 maxReadBytesPerSec() const { return max_read_bytes_per_sec.load(std::memory_order_relaxed); } + UInt64 maxStreams() const { return max_streams.load(std::memory_order_relaxed); } + UInt64 activeStreams() const { return active_streams.load(std::memory_order_relaxed); } + + void setStop(); + +private: + using Clock = std::chrono::steady_clock; + + void releaseStream(); + void refillBytesLocked(Clock::time_point now); + UInt64 burstBytesPerPeriod(UInt64 max_read_bytes_per_sec_) const; + + const UInt64 refill_period_ms; + std::atomic max_read_bytes_per_sec; + std::atomic max_streams; + std::atomic active_streams; + + mutable std::mutex stream_mutex; + std::condition_variable stream_cv; + + mutable std::mutex bytes_mutex; + std::condition_variable bytes_cv; + double available_bytes; + Clock::time_point last_refill_time; + bool stop; + + LoggerPtr log; +}; +} // namespace DB::S3 diff --git a/dbms/src/Storages/S3/tests/gtest_s3client.cpp b/dbms/src/Storages/S3/tests/gtest_s3client.cpp index 61ac7dbc27b..66bbad412ba 100644 --- a/dbms/src/Storages/S3/tests/gtest_s3client.cpp +++ b/dbms/src/Storages/S3/tests/gtest_s3client.cpp @@ -15,8 +15,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -252,6 +254,26 @@ try } CATCH +TEST_F(S3ClientTest, PublishS3ReadLimiter) +{ + auto limiter = std::make_shared(4096, 7); + ClientFactory::instance().setS3ReadLimiter(limiter); + ASSERT_EQ(client->getS3ReadLimiter(), limiter); + + IORateLimiter io_rate_limiter; + IORateLimitConfig cfg; + cfg.s3_max_read_bytes_per_sec = 8192; + cfg.s3_max_get_object_streams = 9; + io_rate_limiter.updateLimiterByConfig(cfg); + + auto published = io_rate_limiter.getS3ReadLimiter(); + ASSERT_NE(published, nullptr); + ClientFactory::instance().setS3ReadLimiter(published); + ASSERT_EQ(ClientFactory::instance().sharedTiFlashClient()->getS3ReadLimiter(), published); + 
ASSERT_EQ(published->maxReadBytesPerSec(), 8192); + ASSERT_EQ(published->maxStreams(), 9); +} + TEST_F(S3ClientTest, ListPrefixEarlyStopOnTruncatedResult) try { From a4b9e53e4a5008909c8d34306784518ade16c1c1 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Wed, 25 Mar 2026 00:27:35 +0800 Subject: [PATCH 02/36] disagg: throttle S3 readers and deduplicate cache misses --- dbms/src/Common/TiFlashMetrics.h | 8 + .../IO/BaseFile/tests/gtest_rate_limiter.cpp | 29 +++ dbms/src/Interpreters/Settings.h | 1 + dbms/src/Storages/S3/FileCache.cpp | 223 ++++++++++++++---- dbms/src/Storages/S3/FileCache.h | 8 +- dbms/src/Storages/S3/S3RandomAccessFile.cpp | 152 ++++++++++++ dbms/src/Storages/S3/S3RandomAccessFile.h | 7 + .../src/Storages/S3/tests/gtest_filecache.cpp | 96 ++++++++ dbms/src/Storages/S3/tests/gtest_s3file.cpp | 30 +++ 9 files changed, 504 insertions(+), 50 deletions(-) diff --git a/dbms/src/Common/TiFlashMetrics.h b/dbms/src/Common/TiFlashMetrics.h index 6dba9f1735b..d49d3017cb2 100644 --- a/dbms/src/Common/TiFlashMetrics.h +++ b/dbms/src/Common/TiFlashMetrics.h @@ -901,6 +901,10 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_dtfile_full, {"type", "dtfile_full"}), \ F(type_dtfile_download, {"type", "dtfile_download"}), \ F(type_dtfile_download_failed, {"type", "dtfile_download_failed"}), \ + F(type_wait_on_downloading, {"type", "wait_on_downloading"}), \ + F(type_wait_on_downloading_hit, {"type", "wait_on_downloading_hit"}), \ + F(type_wait_on_downloading_timeout, {"type", "wait_on_downloading_timeout"}), \ + F(type_wait_on_downloading_failed, {"type", "wait_on_downloading_failed"}), \ F(type_page_hit, {"type", "page_hit"}), \ F(type_page_miss, {"type", "page_miss"}), \ F(type_page_evict, {"type", "page_evict"}), \ @@ -915,6 +919,10 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_page_evict_bytes, {"type", "page_evict_bytes"}), \ F(type_page_download_bytes, {"type", 
"page_download_bytes"}), \ F(type_page_read_bytes, {"type", "page_read_bytes"})) \ + M(tiflash_storage_remote_cache_status, \ + "Remote cache status", \ + Gauge, \ + F(type_bg_downloading_count, {{"type", "bg_downloading_count"}})) \ M(tiflash_storage_io_limiter_pending_seconds, \ "I/O limiter pending duration in seconds", \ Histogram, \ diff --git a/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp b/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp index a50dcf880d6..376c39f5ac7 100644 --- a/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp +++ b/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp @@ -14,12 +14,14 @@ #include #include +#include #include #include #include #include #include +#include #include #include @@ -374,6 +376,33 @@ TEST(ReadLimiterTest, ReadMany) ASSERT_EQ(read_limiter.alloc_bytes, 100); } +TEST(S3ReadLimiterTest, StreamTokenBlocksUntilRelease) +{ + auto limiter = std::make_shared(0, 1); + auto token1 = limiter->acquireStream(); + ASSERT_NE(token1, nullptr); + ASSERT_EQ(limiter->activeStreams(), 1); + + auto future = std::async(std::launch::async, [&]() { return limiter->acquireStream(); }); + ASSERT_EQ(future.wait_for(50ms), std::future_status::timeout); + + token1.reset(); + auto token2 = future.get(); + ASSERT_NE(token2, nullptr); + ASSERT_EQ(limiter->activeStreams(), 1); + token2.reset(); + ASSERT_EQ(limiter->activeStreams(), 0); +} + +TEST(S3ReadLimiterTest, ByteRequestsWaitForRefill) +{ + S3::S3ReadLimiter limiter(1000, 0, 100); + limiter.requestBytes(100, S3::S3ReadSource::DirectRead); + AtomicStopwatch watch; + limiter.requestBytes(100, S3::S3ReadSource::DirectRead); + ASSERT_GE(watch.elapsedMilliseconds(), 80); +} + #ifdef __linux__ TEST(IORateLimiterTest, IOStat) { diff --git a/dbms/src/Interpreters/Settings.h b/dbms/src/Interpreters/Settings.h index 054a97ac0b5..fb9472131b2 100644 --- a/dbms/src/Interpreters/Settings.h +++ b/dbms/src/Interpreters/Settings.h @@ -250,6 +250,7 @@ struct Settings M(SettingDouble, 
dt_filecache_downloading_count_scale, 2.0, "Max concurrency of download task count of FileCache = number of logical cpu cores * dt_filecache_downloading_count_scale.") \ M(SettingDouble, dt_filecache_max_downloading_count_scale, 10.0, "Max queue size of download task count of FileCache = number of logical cpu cores * dt_filecache_max_downloading_count_scale.") \ M(SettingUInt64, dt_filecache_min_age_seconds, 1800, "Files of the same priority can only be evicted from files that were not accessed within `dt_filecache_min_age_seconds` seconds.") \ + M(SettingUInt64, dt_filecache_wait_on_downloading_ms, 0, "When a remote cache lookup sees the same key is already being downloaded, wait up to this many milliseconds for that download to finish. 0 disables the bounded wait.") \ M(SettingBool, dt_enable_fetch_memtableset, true, "Whether fetching delta cache in FetchDisaggPages") \ M(SettingUInt64, dt_fetch_pages_packet_limit_size, 512 * 1024, "Response packet bytes limit of FetchDisaggPages, 0 means one page per packet") \ M(SettingDouble, dt_fetch_page_concurrency_scale, 4.0, "Concurrency of fetching pages of one query equals to num_streams * dt_fetch_page_concurrency_scale.") \ diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 3f953639d8f..08ceb49b767 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,41 @@ namespace DB { using FileType = FileSegment::FileType; +namespace +{ +constexpr UInt64 default_wait_log_interval_seconds = 30; +constexpr UInt64 wait_ready_timeout_seconds = 300; +constexpr size_t s3_download_limiter_buffer_size = 128 * 1024; + +enum class WaitResult +{ + Hit, + Timeout, + Failed, +}; + +void observeWaitOnDownloadingMetrics(FileType file_type, WaitResult result, UInt64 bytes, double wait_seconds) +{ + UNUSED(file_type); + UNUSED(bytes); + UNUSED(wait_seconds); + 
GET_METRIC(tiflash_storage_remote_cache, type_wait_on_downloading).Increment(); + switch (result) + { + case WaitResult::Hit: + GET_METRIC(tiflash_storage_remote_cache, type_wait_on_downloading_hit).Increment(); + break; + case WaitResult::Timeout: + GET_METRIC(tiflash_storage_remote_cache, type_wait_on_downloading_timeout).Increment(); + break; + case WaitResult::Failed: + GET_METRIC(tiflash_storage_remote_cache, type_wait_on_downloading_failed).Increment(); + break; + } +} + +} // namespace + std::unique_ptr FileCache::global_file_cache_instance; FileSegment::Status FileSegment::waitForNotEmpty() @@ -98,7 +134,9 @@ FileSegment::Status FileSegment::waitForNotEmpty() { SYNC_FOR("before_FileSegment::waitForNotEmpty_wait"); // just before actual waiting... - auto is_done = cv_ready.wait_for(lock, std::chrono::seconds(30), [&] { return status != Status::Empty; }); + auto is_done = cv_ready.wait_for(lock, std::chrono::seconds(default_wait_log_interval_seconds), [&] { + return status != Status::Empty; + }); if (is_done) break; @@ -110,7 +148,7 @@ FileSegment::Status FileSegment::waitForNotEmpty() elapsed_secs); // Snapshot time is 300s - if (elapsed_secs > 300) + if (elapsed_secs > wait_ready_timeout_seconds) { throw Exception( ErrorCodes::S3_ERROR, @@ -123,6 +161,15 @@ FileSegment::Status FileSegment::waitForNotEmpty() return status; } +FileSegment::Status FileSegment::waitForNotEmptyFor(std::chrono::milliseconds timeout) +{ + std::unique_lock lock(mtx); + if (status != Status::Empty) + return status; + cv_ready.wait_for(lock, timeout, [&] { return status != Status::Empty; }); + return status; +} + void CacheSizeHistogram::addFileSegment(const FileSegmentPtr & file_seg) { if (!file_seg) @@ -346,6 +393,7 @@ FileSegmentPtr FileCache::get(const S3::S3FilenameView & s3_fname, const std::op auto & table = tables[static_cast(file_type)]; FileSegmentPtr file_seg; + UInt64 wait_ms = 0; { std::unique_lock lock(mtx); if (auto f = table.get(s3_key); f != nullptr) @@ -358,50 
+406,82 @@ FileSegmentPtr FileCache::get(const S3::S3FilenameView & s3_fname, const std::op } else { - GET_METRIC(tiflash_storage_remote_cache, type_dtfile_miss).Increment(); - return nullptr; + wait_ms = wait_on_downloading_ms.load(std::memory_order_relaxed); + if (wait_ms == 0) + { + GET_METRIC(tiflash_storage_remote_cache, type_dtfile_miss).Increment(); + return nullptr; + } + file_seg = f; } } - - GET_METRIC(tiflash_storage_remote_cache, type_dtfile_miss).Increment(); - switch (canCache(file_type)) + else { - case ShouldCacheRes::RejectTypeNotMatch: - GET_METRIC(tiflash_storage_remote_cache, type_dtfile_not_cache_type).Increment(); - return nullptr; - case ShouldCacheRes::RejectTooManyDownloading: - GET_METRIC(tiflash_storage_remote_cache, type_dtfile_too_many_download).Increment(); - return nullptr; - case ShouldCacheRes::Cache: - break; - } + GET_METRIC(tiflash_storage_remote_cache, type_dtfile_miss).Increment(); + switch (canCache(file_type)) + { + case ShouldCacheRes::RejectTypeNotMatch: + GET_METRIC(tiflash_storage_remote_cache, type_dtfile_not_cache_type).Increment(); + return nullptr; + case ShouldCacheRes::RejectTooManyDownloading: + GET_METRIC(tiflash_storage_remote_cache, type_dtfile_too_many_download).Increment(); + return nullptr; + case ShouldCacheRes::Cache: + break; + } + + // File not exists, try to download and cache it in background. + + // We don't know the exact size of a object/file, but we need reserve space to save the object/file. + // A certain amount of space is reserved for each file type. + auto estimated_size = filesize ? *filesize : getEstimatedSizeOfFileType(file_type); + if (!reserveSpaceImpl(file_type, estimated_size, EvictMode::TryEvict, lock)) + { + // Space still not enough after eviction. 
+ GET_METRIC(tiflash_storage_remote_cache, type_dtfile_full).Increment(); + LOG_DEBUG( + log, + "s3_key={} space not enough(capacity={} used={} estimated_size={}), skip cache", + s3_key, + cache_capacity, + cache_used, + estimated_size); + return nullptr; + } - // File not exists, try to download and cache it in background. + file_seg = std::make_shared( + toLocalFilename(s3_key), + FileSegment::Status::Empty, + estimated_size, + file_type); + table.set(s3_key, file_seg); + } + } // Release the lock before submiting bg download task. Because bgDownload may be blocked when the queue is full. - // We don't know the exact size of a object/file, but we need reserve space to save the object/file. - // A certain amount of space is reserved for each file type. - auto estimated_size = filesize ? *filesize : getEstimatedSizeOfFileType(file_type); - if (!reserveSpaceImpl(file_type, estimated_size, EvictMode::TryEvict, lock)) + if (wait_ms != 0) + { + Stopwatch wait_watch; + auto status = file_seg->waitForNotEmptyFor(std::chrono::milliseconds(wait_ms)); + const auto waited_bytes = filesize.value_or(file_seg->getSize()); + if (status == FileSegment::Status::Complete) { - // Space still not enough after eviction. - GET_METRIC(tiflash_storage_remote_cache, type_dtfile_full).Increment(); - LOG_DEBUG( - log, - "s3_key={} space not enough(capacity={} used={} estimated_size={}), skip cache", - s3_key, - cache_capacity, - cache_used, - estimated_size); - return nullptr; + observeWaitOnDownloadingMetrics( + file_type, + WaitResult::Hit, + waited_bytes, + wait_watch.elapsedSeconds()); + GET_METRIC(tiflash_storage_remote_cache, type_dtfile_hit).Increment(); + return file_seg; } - file_seg = std::make_shared( - toLocalFilename(s3_key), - FileSegment::Status::Empty, - estimated_size, - file_type); - table.set(s3_key, file_seg); - } // Release the lock before submiting bg download task. Because bgDownload may be blocked when the queue is full. 
+ observeWaitOnDownloadingMetrics( + file_type, + status == FileSegment::Status::Failed ? WaitResult::Failed : WaitResult::Timeout, + waited_bytes, + wait_watch.elapsedSeconds()); + GET_METRIC(tiflash_storage_remote_cache, type_dtfile_miss).Increment(); + return nullptr; + } bgDownload(s3_key, file_seg); @@ -990,7 +1070,8 @@ void downloadToLocal( Aws::IOStream & istr, const String & fname, Int64 content_length, - const WriteLimiterPtr & write_limiter) + const WriteLimiterPtr & write_limiter, + const std::shared_ptr & read_limiter) { // create an empty file with write_limiter // each time `ofile.write` is called, the write speed will be controlled by the write_limiter. @@ -1000,20 +1081,46 @@ void downloadToLocal( return; GET_METRIC(tiflash_storage_remote_cache_bytes, type_dtfile_download_bytes).Increment(content_length); - static const Int64 MAX_BUFFER_SIZE = 128 * 1024; // 128k - ReadBufferFromIStream rbuf(istr, std::min(content_length, MAX_BUFFER_SIZE)); - WriteBufferFromWritableFile wbuf(ofile, std::min(content_length, MAX_BUFFER_SIZE)); - copyData(rbuf, wbuf, content_length); - wbuf.sync(); + if (read_limiter == nullptr) + { + static const Int64 MAX_BUFFER_SIZE = 128 * 1024; // 128k + ReadBufferFromIStream rbuf(istr, std::min(content_length, MAX_BUFFER_SIZE)); + WriteBufferFromWritableFile wbuf(ofile, std::min(content_length, MAX_BUFFER_SIZE)); + copyData(rbuf, wbuf, content_length); + wbuf.sync(); + return; + } + + std::array buffer{}; + Int64 remaining = content_length; + while (remaining > 0) + { + auto to_read = std::min(remaining, static_cast(buffer.size())); + read_limiter->requestBytes(to_read, S3::S3ReadSource::FileCacheDownload); + istr.read(buffer.data(), to_read); + auto gcount = istr.gcount(); + RUNTIME_CHECK_MSG(gcount >= 0, "negative gcount for remote download"); + if (gcount == 0) + break; + auto written = ofile->write(buffer.data(), gcount); + RUNTIME_CHECK(written == gcount, fname, written, gcount); + remaining -= gcount; + if (gcount < 
to_read) + break; + } + RUNTIME_CHECK_MSG(remaining == 0, "download {} incomplete, remaining={} content_length={}", fname, remaining, content_length); + ofile->fsync(); } void FileCache::downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, const WriteLimiterPtr & write_limiter) { Stopwatch sw; auto client = S3::ClientFactory::instance().sharedTiFlashClient(); + auto read_limiter = client->getS3ReadLimiter(); Aws::S3::Model::GetObjectRequest req; client->setBucketAndKeyWithRoot(req, s3_key); ProfileEvents::increment(ProfileEvents::S3GetObject); + auto stream_token = read_limiter != nullptr ? read_limiter->acquireStream() : nullptr; auto outcome = client->GetObject(req); if (!outcome.IsSuccess()) { @@ -1041,7 +1148,8 @@ void FileCache::downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, c // download as a temp file then rename to a formal file prepareParentDir(local_fname); auto temp_fname = toTemporaryFilename(local_fname); - downloadToLocal(result.GetBody(), temp_fname, content_length, write_limiter); + SYNC_FOR("before_FileCache::downloadImpl_download_to_local"); + downloadToLocal(result.GetBody(), temp_fname, content_length, write_limiter, read_limiter); std::filesystem::rename(temp_fname, local_fname); #ifndef NDEBUG @@ -1070,8 +1178,10 @@ void FileCache::downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, c void FileCache::bgDownloadExecutor( const String & s3_key, FileSegmentPtr & file_seg, - const WriteLimiterPtr & write_limiter) + const WriteLimiterPtr & write_limiter, + std::chrono::steady_clock::time_point enqueue_time) { + UNUSED(enqueue_time); try { GET_METRIC(tiflash_storage_remote_cache, type_dtfile_download).Increment(); @@ -1082,7 +1192,6 @@ void FileCache::bgDownloadExecutor( // ignore the exception here, and log as warning. 
tryLogCurrentWarningException(log, fmt::format("Download s3_key={} failed", s3_key)); } - if (!file_seg->isReadyToRead()) { file_seg->setStatus(FileSegment::Status::Failed); @@ -1096,6 +1205,8 @@ void FileCache::bgDownloadExecutor( bg_download_succ_count.fetch_add(1, std::memory_order_relaxed); } bg_downloading_count.fetch_sub(1, std::memory_order_relaxed); + GET_METRIC(tiflash_storage_remote_cache_status, type_bg_downloading_count) + .Set(bg_downloading_count.load(std::memory_order_relaxed)); LOG_DEBUG( log, "downloading count {} => s3_key {} finished", @@ -1106,15 +1217,18 @@ void FileCache::bgDownloadExecutor( void FileCache::bgDownload(const String & s3_key, FileSegmentPtr & file_seg) { bg_downloading_count.fetch_add(1, std::memory_order_relaxed); + GET_METRIC(tiflash_storage_remote_cache_status, type_bg_downloading_count) + .Set(bg_downloading_count.load(std::memory_order_relaxed)); LOG_DEBUG( log, "downloading count {} => s3_key {} start", bg_downloading_count.load(std::memory_order_relaxed), s3_key); auto write_limiter = rate_limiter.getBgWriteLimiter(); + auto enqueue_time = std::chrono::steady_clock::now(); S3FileCachePool::get().scheduleOrThrowOnError( - [this, s3_key = s3_key, file_seg = file_seg, limiter = std::move(write_limiter)]() mutable { - bgDownloadExecutor(s3_key, file_seg, limiter); + [this, s3_key = s3_key, file_seg = file_seg, limiter = std::move(write_limiter), enqueue_time]() mutable { + bgDownloadExecutor(s3_key, file_seg, limiter, enqueue_time); }); } @@ -1374,6 +1488,17 @@ void FileCache::updateConfig(const Settings & settings) cache_min_age); cache_min_age_seconds.store(cache_min_age, std::memory_order_relaxed); } + + UInt64 new_wait_ms = settings.dt_filecache_wait_on_downloading_ms; + if (new_wait_ms != wait_on_downloading_ms.load(std::memory_order_relaxed)) + { + LOG_INFO( + log, + "Update S3FileCache bounded wait config: wait_on_downloading_ms {} => {}", + wait_on_downloading_ms.load(std::memory_order_relaxed), + new_wait_ms); + 
wait_on_downloading_ms.store(new_wait_ms, std::memory_order_relaxed); + } } // Evict the cached files until no file of >= `file_type` is in cache. diff --git a/dbms/src/Storages/S3/FileCache.h b/dbms/src/Storages/S3/FileCache.h index 6e1ef0fbb74..2fa089408bf 100644 --- a/dbms/src/Storages/S3/FileCache.h +++ b/dbms/src/Storages/S3/FileCache.h @@ -88,6 +88,7 @@ class FileSegment } Status waitForNotEmpty(); + Status waitForNotEmptyFor(std::chrono::milliseconds timeout); void setComplete(UInt64 size_) { @@ -351,7 +352,11 @@ class FileCache void bgDownload(const String & s3_key, FileSegmentPtr & file_seg); void fgDownload(const String & s3_key, FileSegmentPtr & file_seg); - void bgDownloadExecutor(const String & s3_key, FileSegmentPtr & file_seg, const WriteLimiterPtr & write_limiter); + void bgDownloadExecutor( + const String & s3_key, + FileSegmentPtr & file_seg, + const WriteLimiterPtr & write_limiter, + std::chrono::steady_clock::time_point enqueue_time); void downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, const WriteLimiterPtr & write_limiter); static String toTemporaryFilename(const String & fname); @@ -465,6 +470,7 @@ class FileCache const UInt16 logical_cores; IORateLimiter & rate_limiter; std::atomic cache_min_age_seconds = 1800; + std::atomic wait_on_downloading_ms = 0; std::atomic download_count_scale = 2.0; std::atomic max_downloading_count_scale = 10.0; // the on-going background download count diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.cpp b/dbms/src/Storages/S3/S3RandomAccessFile.cpp index e9bfa3f8208..35661c0ac96 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.cpp +++ b/dbms/src/Storages/S3/S3RandomAccessFile.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,11 @@ extern const char force_s3_random_access_file_read_fail[]; namespace DB::S3 { +namespace +{ +constexpr size_t s3_read_limiter_preferred_chunk_size = 128 * 1024; +} + String S3RandomAccessFile::summary() const { 
return fmt::format("remote_fname={} cur_offset={} cur_retry={}", remote_fname, cur_offset, cur_retry); @@ -69,6 +75,7 @@ S3RandomAccessFile::S3RandomAccessFile( : client_ptr(std::move(client_ptr_)) , remote_fname(remote_fname_) , cur_offset(0) + , read_limiter(client_ptr->getS3ReadLimiter()) , log(Logger::get(remote_fname)) , scan_context(scan_context_) { @@ -79,6 +86,7 @@ S3RandomAccessFile::S3RandomAccessFile( S3RandomAccessFile::~S3RandomAccessFile() { + resetReadStreamToken(); CurrentMetrics::sub(CurrentMetrics::S3RandomAccessFile); } @@ -120,6 +128,9 @@ ssize_t S3RandomAccessFile::read(char * buf, size_t size) ssize_t S3RandomAccessFile::readImpl(char * buf, size_t size) { + if (read_limiter != nullptr) + return readChunked(buf, size); + Stopwatch sw; ProfileEvents::increment(ProfileEvents::S3IORead, 1); auto & istr = read_result.GetBody(); @@ -177,6 +188,74 @@ ssize_t S3RandomAccessFile::readImpl(char * buf, size_t size) return gcount; } +ssize_t S3RandomAccessFile::readChunked(char * buf, size_t size) +{ + Stopwatch sw; + ProfileEvents::increment(ProfileEvents::S3IORead, 1); + + auto & istr = read_result.GetBody(); + const auto chunk_size = read_limiter->getSuggestedChunkSize(s3_read_limiter_preferred_chunk_size); + size_t total_gcount = 0; + while (total_gcount < size) + { + auto to_read = std::min(size - total_gcount, static_cast(chunk_size)); + read_limiter->requestBytes(to_read, S3ReadSource::DirectRead); + istr.read(buf + total_gcount, to_read); + auto gcount = istr.gcount(); + total_gcount += gcount; + if (static_cast(gcount) < to_read) + break; + } + + fiu_do_on(FailPoints::force_s3_random_access_file_read_fail, { + LOG_WARNING(log, "failpoint force_s3_random_access_file_read_fail is triggered, return S3StreamError"); + return S3StreamError; + }); + + if (total_gcount < size && (!istr.eof() || cur_offset + total_gcount != static_cast(content_length))) + { + ProfileEvents::increment(ProfileEvents::S3IOReadError); + auto state = istr.rdstate(); + auto 
elapsed_secs = sw.elapsedSeconds(); + GET_METRIC(tiflash_storage_s3_request_seconds, type_read_stream_err).Observe(elapsed_secs); + LOG_WARNING( + log, + "Cannot read from istream, size={} gcount={} state=0x{:02X} cur_offset={} content_length={} " + "errno={} errmsg={} cost={:.6f}s", + size, + total_gcount, + state, + cur_offset, + content_length, + errno, + strerror(errno), + elapsed_secs); + return (state & std::ios_base::failbit || state & std::ios_base::badbit) ? S3StreamError : S3UnknownError; + } + + auto elapsed_secs = sw.elapsedSeconds(); + if (scan_context) + { + scan_context->disagg_s3file_read_time_ms += elapsed_secs * 1000; + scan_context->disagg_s3file_read_count += 1; + scan_context->disagg_s3file_read_bytes += total_gcount; + } + GET_METRIC(tiflash_storage_s3_request_seconds, type_read_stream).Observe(elapsed_secs); + if (elapsed_secs > 0.01) + { + LOG_DEBUG( + log, + "gcount={} cur_offset={} content_length={} cost={:.3f}s", + total_gcount, + cur_offset, + content_length, + elapsed_secs); + } + cur_offset += total_gcount; + ProfileEvents::increment(ProfileEvents::S3ReadBytes, total_gcount); + return total_gcount; +} + off_t S3RandomAccessFile::seek(off_t offset_, int whence) { while (true) @@ -211,12 +290,16 @@ off_t S3RandomAccessFile::seekImpl(off_t offset_, int whence) { ProfileEvents::increment(ProfileEvents::S3IOSeekBackward, 1); // Backward seek, need to reset the retry count and re-initialize + resetReadStreamToken(); cur_offset = offset_; cur_retry = 0; initialize("seek backward"); return cur_offset; } + if (read_limiter != nullptr) + return seekChunked(offset_); + // Forward seek Stopwatch sw; ProfileEvents::increment(ProfileEvents::S3IOSeek, 1); @@ -259,6 +342,66 @@ off_t S3RandomAccessFile::seekImpl(off_t offset_, int whence) cur_offset = offset_; return cur_offset; } + +off_t S3RandomAccessFile::seekChunked(off_t offset) +{ + Stopwatch sw; + ProfileEvents::increment(ProfileEvents::S3IOSeek, 1); + auto & istr = read_result.GetBody(); + 
const auto chunk_size = read_limiter->getSuggestedChunkSize(s3_read_limiter_preferred_chunk_size); + size_t total_ignored = 0; + const auto bytes_to_ignore = static_cast(offset - cur_offset); + while (total_ignored < bytes_to_ignore) + { + auto to_ignore = std::min(bytes_to_ignore - total_ignored, static_cast(chunk_size)); + read_limiter->requestBytes(to_ignore, S3ReadSource::DirectRead); + istr.ignore(to_ignore); + auto ignored = istr.gcount(); + total_ignored += ignored; + if (static_cast(ignored) < to_ignore) + break; + } + + if (total_ignored < bytes_to_ignore) + { + ProfileEvents::increment(ProfileEvents::S3IOSeekError); + auto state = istr.rdstate(); + auto elapsed_secs = sw.elapsedSeconds(); + GET_METRIC(tiflash_storage_s3_request_seconds, type_read_stream_err).Observe(elapsed_secs); + LOG_WARNING( + log, + "Cannot ignore from istream, state=0x{:02X}, ignored={} expected={} errno={} errmsg={} cost={:.6f}s", + state, + total_ignored, + bytes_to_ignore, + errno, + strerror(errno), + elapsed_secs); + return (state & std::ios_base::failbit || state & std::ios_base::badbit) ? 
S3StreamError : S3UnknownError; + } + + auto elapsed_secs = sw.elapsedSeconds(); + if (scan_context) + { + scan_context->disagg_s3file_seek_time_ms += elapsed_secs * 1000; + scan_context->disagg_s3file_seek_count += 1; + scan_context->disagg_s3file_seek_bytes += bytes_to_ignore; + } + GET_METRIC(tiflash_storage_s3_request_seconds, type_read_stream).Observe(elapsed_secs); + if (elapsed_secs > 0.01) + { + LOG_DEBUG( + log, + "ignore_count={} cur_offset={} content_length={} cost={:.3f}s", + bytes_to_ignore, + cur_offset, + content_length, + elapsed_secs); + } + ProfileEvents::increment(ProfileEvents::S3ReadBytes, bytes_to_ignore); + cur_offset = offset; + return cur_offset; +} String S3RandomAccessFile::readRangeOfObject() { return fmt::format("bytes={}-", cur_offset); @@ -279,6 +422,7 @@ void S3RandomAccessFile::initialize(std::string_view action) { while (cur_retry < max_retry) { + auto next_stream_token = read_limiter != nullptr ? read_limiter->acquireStream() : nullptr; Stopwatch sw_get_object; SCOPE_EXIT({ auto elapsed_secs = sw_get_object.elapsedSeconds(); @@ -309,6 +453,7 @@ void S3RandomAccessFile::initialize(std::string_view action) }); if (!outcome.IsSuccess()) { + next_stream_token.reset(); Int64 delay_ms = details::calculateDelayForNextRetry(cur_retry); cur_retry += 1; auto el = sw_get_object.elapsedSeconds(); @@ -334,6 +479,7 @@ void S3RandomAccessFile::initialize(std::string_view action) } read_result = outcome.GetResultWithOwnership(); RUNTIME_CHECK(read_result.GetBody(), remote_fname, strerror(errno)); + read_stream_token = std::move(next_stream_token); return; // init successfully } // exceed max retry times @@ -344,6 +490,12 @@ void S3RandomAccessFile::initialize(std::string_view action) remote_fname); } +void S3RandomAccessFile::resetReadStreamToken() +{ + if (read_stream_token != nullptr) + read_stream_token.reset(); +} + inline static RandomAccessFilePtr tryOpenCachedFile(const String & remote_fname, std::optional filesize) { try diff --git 
a/dbms/src/Storages/S3/S3RandomAccessFile.h b/dbms/src/Storages/S3/S3RandomAccessFile.h index e700c7491dd..2b59317a4a6 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.h +++ b/dbms/src/Storages/S3/S3RandomAccessFile.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,7 @@ namespace DB::S3 { class TiFlashS3Client; +class S3ReadLimiter; } namespace DB::ErrorCodes @@ -93,6 +95,9 @@ class S3RandomAccessFile final : public RandomAccessFile off_t seekImpl(off_t offset, int whence); ssize_t readImpl(char * buf, size_t size); String readRangeOfObject(); + ssize_t readChunked(char * buf, size_t size); + off_t seekChunked(off_t offset); + void resetReadStreamToken(); // When reading, it is necessary to pass the extra information of file, such file size, to S3RandomAccessFile::create. // It is troublesome to pass parameters layer by layer. So currently, use thread_local global variable to pass parameters. @@ -105,6 +110,8 @@ class S3RandomAccessFile final : public RandomAccessFile off_t cur_offset; Aws::S3::Model::GetObjectResult read_result; Int64 content_length = 0; + std::shared_ptr read_limiter; + std::unique_ptr read_stream_token; DB::LoggerPtr log; bool is_close = false; diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index a69149e97bd..69fbfefe7c5 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1126,13 +1127,108 @@ TEST_F(FileCacheTest, UpdateConfig) // small dt_filecache_max_downloading_count_scale, the queue size should be at least vcores * concurrency settings.dt_filecache_downloading_count_scale = 2.0; settings.dt_filecache_max_downloading_count_scale = 0.1; + settings.dt_filecache_wait_on_downloading_ms = 7; file_cache.updateConfig(settings); ASSERT_DOUBLE_EQ(file_cache.download_count_scale, 2.0); 
ASSERT_DOUBLE_EQ(file_cache.max_downloading_count_scale, 0.1); + ASSERT_EQ(file_cache.wait_on_downloading_ms.load(std::memory_order_relaxed), 7); ASSERT_EQ(S3FileCachePool::get().getMaxThreads(), vcores * 2.0); ASSERT_EQ(S3FileCachePool::get().getQueueSize(), vcores * 2.0); } +TEST_F(FileCacheTest, FileSegmentWaitForNotEmptyFor) +{ + auto file_seg = std::make_shared("/tmp/test", FileSegment::Status::Empty, 128, FileType::Merged); + ASSERT_EQ(file_seg->waitForNotEmptyFor(10ms), FileSegment::Status::Empty); + + auto complete_future = std::async(std::launch::async, [&]() { return file_seg->waitForNotEmptyFor(200ms); }); + std::this_thread::sleep_for(20ms); + file_seg->setComplete(256); + ASSERT_EQ(complete_future.get(), FileSegment::Status::Complete); + + auto failed_seg = std::make_shared("/tmp/test_failed", FileSegment::Status::Empty, 128, FileType::Meta); + auto failed_future = std::async(std::launch::async, [&]() { return failed_seg->waitForNotEmptyFor(200ms); }); + std::this_thread::sleep_for(20ms); + failed_seg->setStatus(FileSegment::Status::Failed); + ASSERT_EQ(failed_future.get(), FileSegment::Status::Failed); +} + +TEST_F(FileCacheTest, GetWaitOnDownloadingHitAndTimeout) +{ + auto cache_dir = fmt::format("{}/wait_on_downloading", tmp_dir); + StorageRemoteCacheConfig cache_config{.dir = cache_dir, .capacity = cache_capacity, .dtfile_level = 100}; + + UInt16 vcores = 2; + IORateLimiter rate_limiter; + FileCache file_cache(capacity_metrics, cache_config, vcores, rate_limiter); + + Settings settings; + settings.dt_filecache_downloading_count_scale = 2.0; + settings.dt_filecache_max_downloading_count_scale = 2.0; + settings.dt_filecache_wait_on_downloading_ms = 200; + file_cache.updateConfig(settings); + + auto objects = genObjects(/*store_count*/ 1, /*table_count*/ 1, /*file_count*/ 1, {"1.merged", "2.merged"}); + auto sp_download = SyncPointCtl::enableInScope("before_FileCache::downloadImpl_download_to_local"); + + auto first_key = 
S3FilenameView::fromKey(objects[0].key); + ASSERT_EQ(file_cache.get(first_key, objects[0].size), nullptr); + sp_download.waitAndPause(); + + auto wait_hit = std::async(std::launch::async, [&]() { return file_cache.get(first_key, objects[0].size); }); + std::this_thread::sleep_for(20ms); + sp_download.next(); + auto hit_seg = wait_hit.get(); + ASSERT_NE(hit_seg, nullptr); + ASSERT_TRUE(hit_seg->isReadyToRead()); + + settings.dt_filecache_wait_on_downloading_ms = 30; + file_cache.updateConfig(settings); + auto second_key = S3FilenameView::fromKey(objects[1].key); + ASSERT_EQ(file_cache.get(second_key, objects[1].size), nullptr); + sp_download.waitAndPause(); + auto wait_timeout = std::async(std::launch::async, [&]() { return file_cache.get(second_key, objects[1].size); }); + ASSERT_EQ(wait_timeout.get(), nullptr); + sp_download.next(); + sp_download.disable(); + + waitForBgDownload(file_cache); +} + +TEST_F(FileCacheTest, BgDownloadRespectsS3StreamLimiter) +{ + auto cache_dir = fmt::format("{}/bg_download_limiter", tmp_dir); + StorageRemoteCacheConfig cache_config{.dir = cache_dir, .capacity = cache_capacity, .dtfile_level = 100}; + + UInt16 vcores = 2; + IORateLimiter rate_limiter; + FileCache file_cache(capacity_metrics, cache_config, vcores, rate_limiter); + Settings settings; + settings.dt_filecache_downloading_count_scale = 2.0; + settings.dt_filecache_max_downloading_count_scale = 2.0; + file_cache.updateConfig(settings); + + auto limiter = std::make_shared(0, 1); + s3_client->setS3ReadLimiter(limiter); + SCOPE_EXIT({ s3_client->setS3ReadLimiter(nullptr); }); + + auto objects = genObjects(/*store_count*/ 1, /*table_count*/ 1, /*file_count*/ 1, {"3.merged", "4.merged"}); + auto sp_download = SyncPointCtl::enableInScope("before_FileCache::downloadImpl_download_to_local"); + + ASSERT_EQ(file_cache.get(S3FilenameView::fromKey(objects[0].key), objects[0].size), nullptr); + sp_download.waitAndPause(); + ASSERT_EQ(limiter->activeStreams(), 1); + + 
ASSERT_EQ(file_cache.get(S3FilenameView::fromKey(objects[1].key), objects[1].size), nullptr); + std::this_thread::sleep_for(50ms); + ASSERT_EQ(limiter->activeStreams(), 1); + + sp_download.next(); + sp_download.disable(); + waitForBgDownload(file_cache); + ASSERT_EQ(limiter->activeStreams(), 0); +} + TEST_F(FileCacheTest, GetBeingBlock) { auto cache_dir = fmt::format("{}/update_config", tmp_dir); diff --git a/dbms/src/Storages/S3/tests/gtest_s3file.cpp b/dbms/src/Storages/S3/tests/gtest_s3file.cpp index a2bf0edc7a9..2e9ad7b127d 100644 --- a/dbms/src/Storages/S3/tests/gtest_s3file.cpp +++ b/dbms/src/Storages/S3/tests/gtest_s3file.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -354,6 +355,35 @@ try } CATCH +TEST_P(S3FileTest, StreamLimiterBlocksSecondDirectReader) +try +{ + const String key1 = "/a/b/c/stream_limit_1"; + const String key2 = "/a/b/c/stream_limit_2"; + writeFile(key1, 4096, WriteSettings{}); + writeFile(key2, 4096, WriteSettings{}); + + auto limiter = std::make_shared(0, 1); + s3_client->setS3ReadLimiter(limiter); + SCOPE_EXIT({ s3_client->setS3ReadLimiter(nullptr); }); + + auto file1 = std::make_shared(s3_client, key1, nullptr); + ASSERT_EQ(limiter->activeStreams(), 1); + + auto future = std::async(std::launch::async, [&]() { + return std::make_shared(s3_client, key2, nullptr); + }); + ASSERT_EQ(future.wait_for(50ms), std::future_status::timeout); + + file1.reset(); + auto file2 = future.get(); + ASSERT_NE(file2, nullptr); + ASSERT_EQ(limiter->activeStreams(), 1); + file2.reset(); + ASSERT_EQ(limiter->activeStreams(), 0); +} +CATCH + TEST_P(S3FileTest, WriteRead) try { From df14a20649550107b145491222ebca0041bf553b Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Fri, 3 Apr 2026 14:39:26 +0800 Subject: [PATCH 03/36] Refine metrics and comments Signed-off-by: JaySon-Huang --- dbms/src/Common/TiFlashMetrics.h | 48 ++++++++----------- dbms/src/IO/BaseFile/IORateLimitConfig.cpp | 8 ++-- 
dbms/src/IO/BaseFile/IORateLimitConfig.h | 2 + dbms/src/IO/BaseFile/RateLimiter.cpp | 7 +-- dbms/src/Server/Server.cpp | 3 ++ dbms/src/Storages/S3/FileCache.cpp | 31 +++++++----- dbms/src/Storages/S3/S3Common.cpp | 15 +++--- dbms/src/Storages/S3/S3RandomAccessFile.cpp | 8 +++- dbms/src/Storages/S3/S3RandomAccessFile.h | 2 +- dbms/src/Storages/S3/S3ReadLimiter.cpp | 31 ++++++------ dbms/src/Storages/S3/S3ReadLimiter.h | 10 ++++ .../src/Storages/S3/tests/gtest_filecache.cpp | 3 +- 12 files changed, 97 insertions(+), 71 deletions(-) diff --git a/dbms/src/Common/TiFlashMetrics.h b/dbms/src/Common/TiFlashMetrics.h index d49d3017cb2..fd9646bb07a 100644 --- a/dbms/src/Common/TiFlashMetrics.h +++ b/dbms/src/Common/TiFlashMetrics.h @@ -374,7 +374,7 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva Histogram, \ F(type_read, {{"type", "read"}}, EqualWidthBuckets{1 * 1024 * 1024, 60, 50 * 1024 * 1024})) \ M(tiflash_storage_io_limiter, \ - "Storage I/O limiter metrics", \ + "Storage I/O limiter byte flow", \ Counter, \ F(type_fg_read_req_bytes, {"type", "fg_read_req_bytes"}), \ F(type_fg_read_alloc_bytes, {"type", "fg_read_alloc_bytes"}), \ @@ -383,14 +383,17 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_fg_write_req_bytes, {"type", "fg_write_req_bytes"}), \ F(type_fg_write_alloc_bytes, {"type", "fg_write_alloc_bytes"}), \ F(type_bg_write_req_bytes, {"type", "bg_write_req_bytes"}), \ - F(type_bg_write_alloc_bytes, {"type", "bg_write_alloc_bytes"})) \ + F(type_bg_write_alloc_bytes, {"type", "bg_write_alloc_bytes"}), \ + F(type_s3_direct_read_bytes, {{"type", "s3_direct_read_bytes"}}), \ + F(type_s3_filecache_download_bytes, {{"type", "s3_filecache_download_bytes"}})) \ M(tiflash_storage_io_limiter_curr, \ "Current limit bytes per second of Storage I/O limiter", \ Gauge, \ F(type_fg_read_bytes, {"type", "fg_read_bytes"}), \ F(type_bg_read_bytes, {"type", "bg_read_bytes"}), \ F(type_fg_write_bytes, 
{"type", "fg_write_bytes"}), \ - F(type_bg_write_bytes, {"type", "bg_write_bytes"})) \ + F(type_bg_write_bytes, {"type", "bg_write_bytes"}), \ + F(type_s3_read_bytes, {"type", "s3_read_bytes"})) \ M(tiflash_storage_rough_set_filter_rate, \ "Bucketed histogram of rough set filter rate", \ Histogram, \ @@ -791,24 +794,11 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_head_object, {{"type", "head_object"}}, ExpBuckets{0.001, 2, 20}), \ F(type_read_stream, {{"type", "read_stream"}}, ExpBuckets{0.0001, 2, 20}), \ F(type_read_stream_err, {{"type", "read_stream_err"}}, ExpBuckets{0.0001, 2, 20})) \ - M(tiflash_storage_s3_read_limiter, \ - "S3 read limiter counters", \ - Counter, \ - F(type_stream_wait_count, {{"type", "stream_wait_count"}}), \ - F(type_byte_wait_count, {{"type", "byte_wait_count"}}), \ - F(type_direct_read_bytes, {{"type", "direct_read_bytes"}}), \ - F(type_filecache_download_bytes, {{"type", "filecache_download_bytes"}})) \ - M(tiflash_storage_s3_read_limiter_wait_seconds, \ - "S3 read limiter wait duration in seconds", \ - Histogram, \ - F(type_stream_wait, {{"type", "stream_wait"}}, ExpBuckets{0.0001, 2, 20}), \ - F(type_byte_wait, {{"type", "byte_wait"}}, ExpBuckets{0.0001, 2, 20})) \ M(tiflash_storage_s3_read_limiter_status, \ "S3 read limiter status", \ Gauge, \ F(type_active_get_object_streams, {{"type", "active_get_object_streams"}}), \ - F(type_max_get_object_streams, {{"type", "max_get_object_streams"}}), \ - F(type_max_read_bytes_per_sec, {{"type", "max_read_bytes_per_sec"}})) \ + F(type_max_get_object_streams, {{"type", "max_get_object_streams"}})) \ M(tiflash_storage_s3_http_request_seconds, \ "S3 request duration breakdown in seconds", \ Histogram, \ @@ -919,17 +909,28 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_page_evict_bytes, {"type", "page_evict_bytes"}), \ F(type_page_download_bytes, {"type", "page_download_bytes"}), \ F(type_page_read_bytes, 
{"type", "page_read_bytes"})) \ - M(tiflash_storage_remote_cache_status, \ + M(tiflash_storage_remote_cache_status, \ "Remote cache status", \ Gauge, \ - F(type_bg_downloading_count, {{"type", "bg_downloading_count"}})) \ + F(type_bg_downloading_count, {{"type", "bg_downloading_count"}})) \ M(tiflash_storage_io_limiter_pending_seconds, \ "I/O limiter pending duration in seconds", \ Histogram, \ F(type_fg_read, {{"type", "fg_read"}}, ExpBuckets{0.001, 2, 20}), \ F(type_bg_read, {{"type", "bg_read"}}, ExpBuckets{0.001, 2, 20}), \ F(type_fg_write, {{"type", "fg_write"}}, ExpBuckets{0.001, 2, 20}), \ - F(type_bg_write, {{"type", "bg_write"}}, ExpBuckets{0.001, 2, 20})) \ + F(type_bg_write, {{"type", "bg_write"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_s3_read_stream, {{"type", "s3_read_stream"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_s3_read_byte, {{"type", "s3_read_byte"}}, ExpBuckets{0.001, 2, 20})) \ + M(tiflash_storage_io_limiter_pending_count, \ + "I/O limiter pending count", \ + Counter, \ + F(type_fg_read, {"type", "fg_read"}), \ + F(type_bg_read, {"type", "bg_read"}), \ + F(type_fg_write, {"type", "fg_write"}), \ + F(type_bg_write, {"type", "bg_write"}), \ + F(type_s3_read_byte, {"type", "s3_read_byte"}), \ + F(type_s3_read_stream, {"type", "s3_read_stream"})) \ M(tiflash_system_seconds, \ "system calls duration in seconds", \ Histogram, \ @@ -1021,13 +1022,6 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_load_dmfile_local, {{"type", "load_dmfile_local"}}, ExpBuckets{0.001, 2, 20}), \ F(type_load_dmfile_s3, {{"type", "load_dmfile_s3"}}, ExpBuckets{0.001, 2, 20}), \ F(type_search, {{"type", "search"}}, ExpBuckets{0.001, 2, 20})) \ - M(tiflash_storage_io_limiter_pending_count, \ - "I/O limiter pending count", \ - Counter, \ - F(type_fg_read, {"type", "fg_read"}), \ - F(type_bg_read, {"type", "bg_read"}), \ - F(type_fg_write, {"type", "fg_write"}), \ - F(type_bg_write, {"type", "bg_write"})) \ 
M(tiflash_read_thread_internal_us, \ "Durations of read thread internal components", \ Histogram, \ diff --git a/dbms/src/IO/BaseFile/IORateLimitConfig.cpp b/dbms/src/IO/BaseFile/IORateLimitConfig.cpp index 7427e83322a..bbbe45d5b21 100644 --- a/dbms/src/IO/BaseFile/IORateLimitConfig.cpp +++ b/dbms/src/IO/BaseFile/IORateLimitConfig.cpp @@ -172,12 +172,12 @@ bool IORateLimitConfig::operator==(const IORateLimitConfig & config) const return config.max_bytes_per_sec == max_bytes_per_sec && config.max_read_bytes_per_sec == max_read_bytes_per_sec && config.max_write_bytes_per_sec == max_write_bytes_per_sec && config.s3_max_read_bytes_per_sec == s3_max_read_bytes_per_sec - && config.s3_max_get_object_streams == s3_max_get_object_streams + && config.s3_max_get_object_streams == s3_max_get_object_streams // && config.bg_write_weight == bg_write_weight && config.fg_write_weight == fg_write_weight && config.bg_read_weight == bg_read_weight && config.fg_read_weight == fg_read_weight - && config.emergency_pct == emergency_pct - && config.high_pct == high_pct && config.medium_pct == medium_pct && config.tune_base == tune_base - && config.min_bytes_per_sec == min_bytes_per_sec && config.auto_tune_sec == auto_tune_sec; + && config.emergency_pct == emergency_pct && config.high_pct == high_pct && config.medium_pct == medium_pct + && config.tune_base == tune_base && config.min_bytes_per_sec == min_bytes_per_sec + && config.auto_tune_sec == auto_tune_sec; } } // namespace DB diff --git a/dbms/src/IO/BaseFile/IORateLimitConfig.h b/dbms/src/IO/BaseFile/IORateLimitConfig.h index 45e1fc0e685..6a75a3bf00c 100644 --- a/dbms/src/IO/BaseFile/IORateLimitConfig.h +++ b/dbms/src/IO/BaseFile/IORateLimitConfig.h @@ -28,7 +28,9 @@ struct IORateLimitConfig // For disk that read bandwidth and write bandwith are calculated separately, such as GCP's persistent disks. 
UInt64 max_read_bytes_per_sec; UInt64 max_write_bytes_per_sec; + // Node-level byte budget shared by all S3 direct reads and FileCache downloads. `0` disables byte throttling. UInt64 s3_max_read_bytes_per_sec; + // Node-level cap for concurrently active `GetObject` response bodies. `0` disables stream throttling. UInt64 s3_max_get_object_streams; // only true when both max_read_bytes_per_sec and max_write_bytes_per_sec are 0 diff --git a/dbms/src/IO/BaseFile/RateLimiter.cpp b/dbms/src/IO/BaseFile/RateLimiter.cpp index c3ae14b4872..f992de746eb 100644 --- a/dbms/src/IO/BaseFile/RateLimiter.cpp +++ b/dbms/src/IO/BaseFile/RateLimiter.cpp @@ -525,15 +525,16 @@ void IORateLimiter::updateLimiterByConfig(const IORateLimitConfig & cfg) std::lock_guard lock(limiter_mtx); updateReadLimiter(cfg.getBgReadMaxBytesPerSec(), cfg.getFgReadMaxBytesPerSec()); updateWriteLimiter(cfg.getBgWriteMaxBytesPerSec(), cfg.getFgWriteMaxBytesPerSec()); + + // updateS3ReadLimiter if (cfg.s3_max_read_bytes_per_sec == 0 && cfg.s3_max_get_object_streams == 0) { s3_read_limiter = nullptr; } else if (s3_read_limiter == nullptr) { - s3_read_limiter = std::make_shared( - cfg.s3_max_read_bytes_per_sec, - cfg.s3_max_get_object_streams); + s3_read_limiter + = std::make_shared(cfg.s3_max_read_bytes_per_sec, cfg.s3_max_get_object_streams); } else { diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp index 8f08296c604..1b4ed5d65d6 100644 --- a/dbms/src/Server/Server.cpp +++ b/dbms/src/Server/Server.cpp @@ -942,6 +942,8 @@ try /// Initialize RateLimiter. global_context->initializeRateLimiter(config(), bg_pool, blockable_bg_pool); + // ClientFactory keeps the process-wide shared S3 client. Publish the latest limiter explicitly so + // every existing and future `TiFlashS3Client` observes the same node-level S3 read budget. 
S3::ClientFactory::instance().setS3ReadLimiter(global_context->getIORateLimiter().getS3ReadLimiter()); global_context->setServerInfo(server_info); @@ -972,6 +974,7 @@ try buildLoggers(*config); global_context->getTMTContext().reloadConfig(*config); global_context->getIORateLimiter().updateConfig(*config); + // Config reload may replace the limiter instance or disable it. Re-publish it to the shared S3 client. S3::ClientFactory::instance().setS3ReadLimiter(global_context->getIORateLimiter().getS3ReadLimiter()); global_context->reloadDeltaTreeConfig(*config); DM::SegmentReadTaskScheduler::instance().updateConfig(global_context->getSettingsRef()); diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 08ceb49b767..746c27e7da1 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -47,6 +47,7 @@ #include #include #include +#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" @@ -406,6 +407,8 @@ FileSegmentPtr FileCache::get(const S3::S3FilenameView & s3_fname, const std::op } else { + // Another thread is already downloading the same object. Optionally wait for a bounded time and + // reuse that result instead of opening one more `GetObject` stream for the same key. 
wait_ms = wait_on_downloading_ms.load(std::memory_order_relaxed); if (wait_ms == 0) { @@ -465,11 +468,7 @@ FileSegmentPtr FileCache::get(const S3::S3FilenameView & s3_fname, const std::op const auto waited_bytes = filesize.value_or(file_seg->getSize()); if (status == FileSegment::Status::Complete) { - observeWaitOnDownloadingMetrics( - file_type, - WaitResult::Hit, - waited_bytes, - wait_watch.elapsedSeconds()); + observeWaitOnDownloadingMetrics(file_type, WaitResult::Hit, waited_bytes, wait_watch.elapsedSeconds()); GET_METRIC(tiflash_storage_remote_cache, type_dtfile_hit).Increment(); return file_seg; } @@ -747,10 +746,10 @@ std::vector FileCache::getEvictFileTypes(FileType evict_for, bool evic std::vector evict_types; evict_types.push_back(evict_for); // First, try evict with the same file type. // Second, try evict from the lower priority file type. - for (auto itr = std::rbegin(all_file_types); itr != std::rend(all_file_types); ++itr) + for (auto file_type : all_file_types | std::views::reverse) { - if (*itr != evict_for) - evict_types.push_back(*itr); + if (file_type != evict_for) + evict_types.push_back(file_type); } return evict_types; } @@ -758,10 +757,10 @@ std::vector FileCache::getEvictFileTypes(FileType evict_for, bool evic { std::vector evict_types; // Evict from the lower priority file type first. - for (auto itr = std::rbegin(all_file_types); itr != std::rend(all_file_types); ++itr) + for (auto file_type : all_file_types | std::views::reverse) { - evict_types.push_back(*itr); - if (*itr == evict_for) + evict_types.push_back(file_type); + if (file_type == evict_for) { // Do not evict higher priority file type break; @@ -1095,6 +1094,8 @@ void downloadToLocal( Int64 remaining = content_length; while (remaining > 0) { + // Avoid `ReadBufferFromIStream` here so FileCache downloads can charge the shared S3 byte limiter per chunk + // without introducing extra heap allocation on the hot path. 
auto to_read = std::min(remaining, static_cast(buffer.size())); read_limiter->requestBytes(to_read, S3::S3ReadSource::FileCacheDownload); istr.read(buffer.data(), to_read); @@ -1108,7 +1109,12 @@ void downloadToLocal( if (gcount < to_read) break; } - RUNTIME_CHECK_MSG(remaining == 0, "download {} incomplete, remaining={} content_length={}", fname, remaining, content_length); + RUNTIME_CHECK_MSG( + remaining == 0, + "download {} incomplete, remaining={} content_length={}", + fname, + remaining, + content_length); ofile->fsync(); } @@ -1120,6 +1126,7 @@ void FileCache::downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, c Aws::S3::Model::GetObjectRequest req; client->setBucketAndKeyWithRoot(req, s3_key); ProfileEvents::increment(ProfileEvents::S3GetObject); + // Limit live background-download streams with the same token used by direct readers. auto stream_token = read_limiter != nullptr ? read_limiter->acquireStream() : nullptr; auto outcome = client->GetObject(req); if (!outcome.IsSuccess()) diff --git a/dbms/src/Storages/S3/S3Common.cpp b/dbms/src/Storages/S3/S3Common.cpp index 79677bd0607..acbc39573f3 100644 --- a/dbms/src/Storages/S3/S3Common.cpp +++ b/dbms/src/Storages/S3/S3Common.cpp @@ -349,8 +349,11 @@ void ClientFactory::init(const StorageS3Config & config_, bool mock_s3_) { auto [s3_client, vendor] = create(config, log); cloud_vendor = vendor; - shared_tiflash_client - = std::make_shared(config.bucket, config.root, std::move(s3_client), shared_s3_read_limiter); + shared_tiflash_client = std::make_shared( + config.bucket, + config.root, + std::move(s3_client), + shared_s3_read_limiter); } else { @@ -358,12 +361,8 @@ void ClientFactory::init(const StorageS3Config & config_, bool mock_s3_) Aws::Client::ClientConfiguration cfg(true, /*defaultMode=*/"standard", /*shouldDisableIMDS=*/true); cfg.region = Aws::Region::US_EAST_1; // default region Aws::Auth::AWSCredentials cred("mock_access_key", "mock_secret_key"); - shared_tiflash_client = 
std::make_unique( - config.bucket, - config.root, - cred, - cfg, - shared_s3_read_limiter); + shared_tiflash_client + = std::make_unique(config.bucket, config.root, cred, cfg, shared_s3_read_limiter); } client_is_inited = true; // init finish } diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.cpp b/dbms/src/Storages/S3/S3RandomAccessFile.cpp index 35661c0ac96..79b688a91ac 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.cpp +++ b/dbms/src/Storages/S3/S3RandomAccessFile.cpp @@ -129,6 +129,7 @@ ssize_t S3RandomAccessFile::read(char * buf, size_t size) ssize_t S3RandomAccessFile::readImpl(char * buf, size_t size) { if (read_limiter != nullptr) + // Charge the shared node-level budget in small chunks instead of allowing a single large `read()` to burst. return readChunked(buf, size); Stopwatch sw; @@ -198,6 +199,8 @@ ssize_t S3RandomAccessFile::readChunked(char * buf, size_t size) size_t total_gcount = 0; while (total_gcount < size) { + // The limiter charges requested bytes before the actual stream read so direct reads and FileCache downloads + // compete for the same node-level remote-read budget. auto to_read = std::min(size - total_gcount, static_cast(chunk_size)); read_limiter->requestBytes(to_read, S3ReadSource::DirectRead); istr.read(buf + total_gcount, to_read); @@ -289,7 +292,7 @@ off_t S3RandomAccessFile::seekImpl(off_t offset_, int whence) if (offset_ < cur_offset) { ProfileEvents::increment(ProfileEvents::S3IOSeekBackward, 1); - // Backward seek, need to reset the retry count and re-initialize + // The current body stream is forward-only. Re-open from the target offset and release the old stream slot first. resetReadStreamToken(); cur_offset = offset_; cur_retry = 0; @@ -353,6 +356,7 @@ off_t S3RandomAccessFile::seekChunked(off_t offset) const auto bytes_to_ignore = static_cast(offset - cur_offset); while (total_ignored < bytes_to_ignore) { + // `ignore()` still drains the response body from S3, so it must be accounted against the same byte budget. 
auto to_ignore = std::min(bytes_to_ignore - total_ignored, static_cast(chunk_size)); read_limiter->requestBytes(to_ignore, S3ReadSource::DirectRead); istr.ignore(to_ignore); @@ -422,6 +426,8 @@ void S3RandomAccessFile::initialize(std::string_view action) { while (cur_retry < max_retry) { + // Hold the token for the whole body lifetime so the stream cap reflects live `GetObject` responses, + // including callers that read slowly or perform forward seeks. auto next_stream_token = read_limiter != nullptr ? read_limiter->acquireStream() : nullptr; Stopwatch sw_get_object; SCOPE_EXIT({ diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.h b/dbms/src/Storages/S3/S3RandomAccessFile.h index 2b59317a4a6..e1db8aa3098 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.h +++ b/dbms/src/Storages/S3/S3RandomAccessFile.h @@ -33,7 +33,7 @@ namespace DB::S3 { class TiFlashS3Client; class S3ReadLimiter; -} +} // namespace DB::S3 namespace DB::ErrorCodes { diff --git a/dbms/src/Storages/S3/S3ReadLimiter.cpp b/dbms/src/Storages/S3/S3ReadLimiter.cpp index 9a9b412b7e5..0a70737c28c 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.cpp +++ b/dbms/src/Storages/S3/S3ReadLimiter.cpp @@ -18,7 +18,6 @@ #include #include - #include namespace CurrentMetrics @@ -30,6 +29,7 @@ namespace DB::S3 { namespace { +// We only emit wait metrics after the call actually blocked, so the hot path keeps the zero-wait case cheap. 
template void recordWaitIfNeeded(bool waited, const Stopwatch & sw, F && observe) { @@ -62,7 +62,7 @@ S3ReadLimiter::S3ReadLimiter(UInt64 max_read_bytes_per_sec_, UInt64 max_streams_ , stop(false) , log(Logger::get("S3ReadLimiter")) { - GET_METRIC(tiflash_storage_s3_read_limiter_status, type_max_read_bytes_per_sec).Set(max_read_bytes_per_sec_); + GET_METRIC(tiflash_storage_io_limiter_curr, type_s3_read_bytes).Set(max_read_bytes_per_sec_); GET_METRIC(tiflash_storage_s3_read_limiter_status, type_max_get_object_streams).Set(max_streams_); GET_METRIC(tiflash_storage_s3_read_limiter_status, type_active_get_object_streams).Set(0); } @@ -86,7 +86,7 @@ void S3ReadLimiter::updateConfig(UInt64 max_read_bytes_per_sec_, UInt64 max_stre std::lock_guard lock(stream_mutex); max_streams.store(max_streams_, std::memory_order_relaxed); } - GET_METRIC(tiflash_storage_s3_read_limiter_status, type_max_read_bytes_per_sec).Set(max_read_bytes_per_sec_); + GET_METRIC(tiflash_storage_io_limiter_curr, type_s3_read_bytes).Set(max_read_bytes_per_sec_); GET_METRIC(tiflash_storage_s3_read_limiter_status, type_max_get_object_streams).Set(max_streams_); bytes_cv.notify_all(); stream_cv.notify_all(); @@ -101,19 +101,20 @@ std::unique_ptr S3ReadLimiter::acquireStream() Stopwatch sw; bool waited = false; std::unique_lock lock(stream_mutex); + // A token is held for the whole lifetime of one `GetObject` body, not just the initial request. 
while (!stop && max_streams.load(std::memory_order_relaxed) != 0 && active_streams.load(std::memory_order_relaxed) >= max_streams.load(std::memory_order_relaxed)) { if (!waited) { - GET_METRIC(tiflash_storage_s3_read_limiter, type_stream_wait_count).Increment(); + GET_METRIC(tiflash_storage_io_limiter_pending_count, type_s3_read_stream).Increment(); waited = true; } stream_cv.wait(lock); } recordWaitIfNeeded(waited, sw, [](double seconds) { - GET_METRIC(tiflash_storage_s3_read_limiter_wait_seconds, type_stream_wait).Observe(seconds); + GET_METRIC(tiflash_storage_io_limiter_pending_seconds, type_s3_read_stream).Observe(seconds); }); if (stop || max_streams.load(std::memory_order_relaxed) == 0) @@ -133,10 +134,10 @@ void S3ReadLimiter::requestBytes(UInt64 bytes, S3ReadSource source) switch (source) { case S3ReadSource::DirectRead: - GET_METRIC(tiflash_storage_s3_read_limiter, type_direct_read_bytes).Increment(bytes); + GET_METRIC(tiflash_storage_io_limiter, type_s3_direct_read_bytes).Increment(bytes); break; case S3ReadSource::FileCacheDownload: - GET_METRIC(tiflash_storage_s3_read_limiter, type_filecache_download_bytes).Increment(bytes); + GET_METRIC(tiflash_storage_io_limiter, type_s3_filecache_download_bytes).Increment(bytes); break; } @@ -149,12 +150,13 @@ void S3ReadLimiter::requestBytes(UInt64 bytes, S3ReadSource source) std::unique_lock lock(bytes_mutex); SCOPE_EXIT({ recordWaitIfNeeded(waited, sw, [](double seconds) { - GET_METRIC(tiflash_storage_s3_read_limiter_wait_seconds, type_byte_wait).Observe(seconds); + GET_METRIC(tiflash_storage_io_limiter_pending_seconds, type_s3_read_byte).Observe(seconds); }); }); while (!stop) { const auto current_limit = max_read_bytes_per_sec.load(std::memory_order_relaxed); + // Config reload can disable the limiter while callers are waiting. 
if (current_limit == 0) return; @@ -168,14 +170,15 @@ void S3ReadLimiter::requestBytes(UInt64 bytes, S3ReadSource source) if (!waited) { - GET_METRIC(tiflash_storage_s3_read_limiter, type_byte_wait_count).Increment(); + GET_METRIC(tiflash_storage_io_limiter_pending_count, type_s3_read_byte).Increment(); waited = true; } + // Sleep only for the missing budget instead of a fixed interval so large readers converge quickly + // after budget becomes available again. const auto missing = static_cast(bytes) - available_bytes; - const auto wait_us = std::max( - 1, - static_cast(missing * 1000000.0 / static_cast(current_limit))); + const auto wait_us + = std::max(1, static_cast(missing * 1000000.0 / static_cast(current_limit))); bytes_cv.wait_for(lock, std::chrono::microseconds(wait_us)); } } @@ -219,12 +222,12 @@ void S3ReadLimiter::refillBytesLocked(Clock::time_point now) return; } - const auto elapsed_ns - = std::chrono::duration_cast(now - last_refill_time).count(); + const auto elapsed_ns = std::chrono::duration_cast(now - last_refill_time).count(); if (elapsed_ns <= 0) return; const auto burst_bytes = static_cast(burstBytesPerPeriod(current_limit)); + // Clamp to one refill-period burst so a temporarily idle reader cannot accumulate an unbounded burst. available_bytes = std::min( burst_bytes, available_bytes + static_cast(current_limit) * static_cast(elapsed_ns) / 1000000000.0); diff --git a/dbms/src/Storages/S3/S3ReadLimiter.h b/dbms/src/Storages/S3/S3ReadLimiter.h index 077fd4be8ed..cadd34c52f9 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.h +++ b/dbms/src/Storages/S3/S3ReadLimiter.h @@ -34,6 +34,11 @@ enum class S3ReadSource class S3ReadLimiter : public std::enable_shared_from_this { public: + /// RAII handle for one live `GetObject` body stream. + /// + /// The token is acquired before the request body starts being consumed and released when the + /// response stream is destroyed or re-opened. 
This keeps the stream limiter aligned with the + /// actual number of concurrent remote-read streams instead of just the number of requests sent. class StreamToken { public: @@ -62,6 +67,7 @@ class S3ReadLimiter : public std::enable_shared_from_this return *this; } + /// Releases one active stream slot early. Destruction does the same automatically. void reset(); private: @@ -99,8 +105,11 @@ class S3ReadLimiter : public std::enable_shared_from_this private: using Clock = std::chrono::steady_clock; + /// Return one `GetObject` stream slot back to the limiter and wake one waiter. void releaseStream(); + /// Refill the token bucket according to elapsed wall time. Caller must hold `bytes_mutex`. void refillBytesLocked(Clock::time_point now); + /// Limit the instantaneous burst so long reads are naturally split into small limiter-aware chunks. UInt64 burstBytesPerPeriod(UInt64 max_read_bytes_per_sec_) const; const UInt64 refill_period_ms; @@ -113,6 +122,7 @@ class S3ReadLimiter : public std::enable_shared_from_this mutable std::mutex bytes_mutex; std::condition_variable bytes_cv; + // Token-bucket state for S3 byte throttling. 
double available_bytes; Clock::time_point last_refill_time; bool stop; diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index 69fbfefe7c5..cb54af751a3 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -1146,7 +1146,8 @@ TEST_F(FileCacheTest, FileSegmentWaitForNotEmptyFor) file_seg->setComplete(256); ASSERT_EQ(complete_future.get(), FileSegment::Status::Complete); - auto failed_seg = std::make_shared("/tmp/test_failed", FileSegment::Status::Empty, 128, FileType::Meta); + auto failed_seg + = std::make_shared("/tmp/test_failed", FileSegment::Status::Empty, 128, FileType::Meta); auto failed_future = std::async(std::launch::async, [&]() { return failed_seg->waitForNotEmptyFor(200ms); }); std::this_thread::sleep_for(20ms); failed_seg->setStatus(FileSegment::Status::Failed); From 1805682fb6b3e2d645edc50ad8cc42418d3b7696 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 13:08:30 +0800 Subject: [PATCH 04/36] disagg: add remote cache phase-2 metrics and coverage --- dbms/src/Common/TiFlashMetrics.cpp | 73 ++++++++++++ dbms/src/Common/TiFlashMetrics.h | 49 ++++++++ dbms/src/Storages/S3/FileCache.cpp | 111 ++++++++++++++++-- .../src/Storages/S3/tests/gtest_filecache.cpp | 77 ++++++++++++ 4 files changed, 300 insertions(+), 10 deletions(-) diff --git a/dbms/src/Common/TiFlashMetrics.cpp b/dbms/src/Common/TiFlashMetrics.cpp index 4b8a30e3ebf..765540645eb 100644 --- a/dbms/src/Common/TiFlashMetrics.cpp +++ b/dbms/src/Common/TiFlashMetrics.cpp @@ -22,6 +22,14 @@ namespace DB { +namespace +{ +constexpr std::array remote_cache_file_type_labels = {"merged", "coldata", "other"}; +constexpr std::array remote_cache_wait_result_labels = {"hit", "timeout", "failed"}; +constexpr std::array remote_cache_download_stage_labels = {"queue_wait", "download"}; +constexpr auto remote_cache_bg_download_stage_buckets = ExpBuckets{0.0001, 2, 20}; +} // 
namespace + TiFlashMetrics & TiFlashMetrics::instance() { static TiFlashMetrics inst; // Instantiated on first use. @@ -78,6 +86,49 @@ TiFlashMetrics::TiFlashMetrics() .Name("tiflash_storage_s3_store_summary_bytes") .Help("S3 storage summary bytes by store and file type") .Register(*registry); + + registered_remote_cache_wait_on_downloading_result_family + = &prometheus::BuildCounter() + .Name("tiflash_storage_remote_cache_wait_on_downloading_result") + .Help("Bounded wait result of remote cache downloading") + .Register(*registry); + registered_remote_cache_wait_on_downloading_bytes_family + = &prometheus::BuildCounter() + .Name("tiflash_storage_remote_cache_wait_on_downloading_bytes") + .Help("Bytes covered by remote cache bounded wait") + .Register(*registry); + registered_remote_cache_bg_download_stage_seconds_family + = &prometheus::BuildHistogram() + .Name("tiflash_storage_remote_cache_bg_download_stage_seconds") + .Help("Remote cache background download stage duration") + .Register(*registry); + + for (size_t file_type_idx = 0; file_type_idx < remote_cache_file_type_labels.size(); ++file_type_idx) + { + for (size_t result_idx = 0; result_idx < remote_cache_wait_result_labels.size(); ++result_idx) + { + auto labels = prometheus::Labels{ + {"result", std::string(remote_cache_wait_result_labels[result_idx])}, + {"file_type", std::string(remote_cache_file_type_labels[file_type_idx])}, + }; + remote_cache_wait_on_downloading_result_metrics[file_type_idx][result_idx] + = ®istered_remote_cache_wait_on_downloading_result_family->Add(labels); + remote_cache_wait_on_downloading_bytes_metrics[file_type_idx][result_idx] + = ®istered_remote_cache_wait_on_downloading_bytes_family->Add(labels); + } + for (size_t stage_idx = 0; stage_idx < remote_cache_download_stage_labels.size(); ++stage_idx) + { + prometheus::Histogram::BucketBoundaries buckets = ExpBuckets{ + remote_cache_bg_download_stage_buckets.start, + remote_cache_bg_download_stage_buckets.base, + 
remote_cache_bg_download_stage_buckets.size}; + remote_cache_bg_download_stage_seconds_metrics[file_type_idx][stage_idx] + = ®istered_remote_cache_bg_download_stage_seconds_family->Add( + {{"stage", std::string(remote_cache_download_stage_labels[stage_idx])}, + {"file_type", std::string(remote_cache_file_type_labels[file_type_idx])}}, + buckets); + } + } } void TiFlashMetrics::addReplicaSyncRU(UInt32 keyspace_id, UInt64 ru) @@ -287,4 +338,26 @@ void TiFlashMetrics::setS3StoreSummaryBytes(UInt64 store_id, UInt64 data_file_by it->second.data_file_bytes->Set(data_file_bytes); it->second.dt_file_bytes->Set(dt_file_bytes); } + +prometheus::Counter & TiFlashMetrics::getRemoteCacheWaitOnDownloadingResultCounter( + TiFlashMetrics::RemoteCacheFileTypeMetric file_type, + TiFlashMetrics::RemoteCacheWaitResultMetric result) +{ + return *remote_cache_wait_on_downloading_result_metrics[static_cast(file_type)] + [static_cast(result)]; +} + +prometheus::Counter & TiFlashMetrics::getRemoteCacheWaitOnDownloadingBytesCounter( + TiFlashMetrics::RemoteCacheFileTypeMetric file_type, + TiFlashMetrics::RemoteCacheWaitResultMetric result) +{ + return *remote_cache_wait_on_downloading_bytes_metrics[static_cast(file_type)][static_cast(result)]; +} + +prometheus::Histogram & TiFlashMetrics::getRemoteCacheBgDownloadStageSecondsHistogram( + TiFlashMetrics::RemoteCacheFileTypeMetric file_type, + TiFlashMetrics::RemoteCacheDownloadStageMetric stage) +{ + return *remote_cache_bg_download_stage_seconds_metrics[static_cast(file_type)][static_cast(stage)]; +} } // namespace DB diff --git a/dbms/src/Common/TiFlashMetrics.h b/dbms/src/Common/TiFlashMetrics.h index fd9646bb07a..90849e518f7 100644 --- a/dbms/src/Common/TiFlashMetrics.h +++ b/dbms/src/Common/TiFlashMetrics.h @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -1335,6 +1336,29 @@ class TiFlashMetrics public: static TiFlashMetrics & instance(); + enum class RemoteCacheFileTypeMetric : size_t + { + Merged = 0, + 
ColData, + Other, + Count, + }; + + enum class RemoteCacheWaitResultMetric : size_t + { + Hit = 0, + Timeout, + Failed, + Count, + }; + + enum class RemoteCacheDownloadStageMetric : size_t + { + QueueWait = 0, + Download, + Count, + }; + void addReplicaSyncRU(UInt32 keyspace_id, UInt64 ru); UInt64 debugQueryReplicaSyncRU(UInt32 keyspace_id); enum class MemoryAllocType @@ -1356,6 +1380,15 @@ class TiFlashMetrics const DM::ReadRUType type); void setS3StoreSummaryBytes(UInt64 store_id, UInt64 data_file_bytes, UInt64 dt_file_bytes); + prometheus::Counter & getRemoteCacheWaitOnDownloadingResultCounter( + RemoteCacheFileTypeMetric file_type, + RemoteCacheWaitResultMetric result); + prometheus::Counter & getRemoteCacheWaitOnDownloadingBytesCounter( + RemoteCacheFileTypeMetric file_type, + RemoteCacheWaitResultMetric result); + prometheus::Histogram & getRemoteCacheBgDownloadStageSecondsHistogram( + RemoteCacheFileTypeMetric file_type, + RemoteCacheDownloadStageMetric stage); private: TiFlashMetrics(); @@ -1407,6 +1440,22 @@ class TiFlashMetrics std::shared_mutex s3_store_summary_bytes_mtx; std::unordered_map registered_s3_store_summary_bytes_metrics; + prometheus::Family * registered_remote_cache_wait_on_downloading_result_family; + std::array< + std::array(RemoteCacheWaitResultMetric::Count)>, + static_cast(RemoteCacheFileTypeMetric::Count)> + remote_cache_wait_on_downloading_result_metrics{}; + prometheus::Family * registered_remote_cache_wait_on_downloading_bytes_family; + std::array< + std::array(RemoteCacheWaitResultMetric::Count)>, + static_cast(RemoteCacheFileTypeMetric::Count)> + remote_cache_wait_on_downloading_bytes_metrics{}; + prometheus::Family * registered_remote_cache_bg_download_stage_seconds_family; + std::array< + std::array(RemoteCacheDownloadStageMetric::Count)>, + static_cast(RemoteCacheFileTypeMetric::Count)> + remote_cache_bg_download_stage_seconds_metrics{}; + public: #define MAKE_METRIC_MEMBER_M(family_name, help, type, ...) 
\ MetricFamily family_name \ diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 746c27e7da1..37cdbd836eb 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -96,12 +96,74 @@ enum class WaitResult Failed, }; +enum class BgDownloadStage +{ + QueueWait, + Download, +}; + +TiFlashMetrics::RemoteCacheFileTypeMetric toMetricFileType(FileType file_type) +{ + switch (file_type) + { + case FileType::Merged: + return TiFlashMetrics::RemoteCacheFileTypeMetric::Merged; + case FileType::DeleteMarkColData: + case FileType::VersionColData: + case FileType::HandleColData: + case FileType::ColData: + return TiFlashMetrics::RemoteCacheFileTypeMetric::ColData; + default: + return TiFlashMetrics::RemoteCacheFileTypeMetric::Other; + } +} + void observeWaitOnDownloadingMetrics(FileType file_type, WaitResult result, UInt64 bytes, double wait_seconds) { - UNUSED(file_type); - UNUSED(bytes); - UNUSED(wait_seconds); GET_METRIC(tiflash_storage_remote_cache, type_wait_on_downloading).Increment(); + auto & metrics = TiFlashMetrics::instance(); + auto metric_file_type = toMetricFileType(file_type); + switch (result) + { + case WaitResult::Hit: + metrics + .getRemoteCacheWaitOnDownloadingResultCounter( + metric_file_type, + TiFlashMetrics::RemoteCacheWaitResultMetric::Hit) + .Increment(); + metrics + .getRemoteCacheWaitOnDownloadingBytesCounter( + metric_file_type, + TiFlashMetrics::RemoteCacheWaitResultMetric::Hit) + .Increment(bytes); + break; + case WaitResult::Timeout: + metrics + .getRemoteCacheWaitOnDownloadingResultCounter( + metric_file_type, + TiFlashMetrics::RemoteCacheWaitResultMetric::Timeout) + .Increment(); + metrics + .getRemoteCacheWaitOnDownloadingBytesCounter( + metric_file_type, + TiFlashMetrics::RemoteCacheWaitResultMetric::Timeout) + .Increment(bytes); + break; + case WaitResult::Failed: + metrics + .getRemoteCacheWaitOnDownloadingResultCounter( + metric_file_type, + 
TiFlashMetrics::RemoteCacheWaitResultMetric::Failed) + .Increment(); + metrics + .getRemoteCacheWaitOnDownloadingBytesCounter( + metric_file_type, + TiFlashMetrics::RemoteCacheWaitResultMetric::Failed) + .Increment(bytes); + break; + } + + UNUSED(wait_seconds); switch (result) { case WaitResult::Hit: @@ -116,6 +178,29 @@ void observeWaitOnDownloadingMetrics(FileType file_type, WaitResult result, UInt } } +void observeBgDownloadStageMetrics(FileType file_type, BgDownloadStage stage, double seconds) +{ + auto & metrics = TiFlashMetrics::instance(); + auto metric_file_type = toMetricFileType(file_type); + switch (stage) + { + case BgDownloadStage::QueueWait: + metrics + .getRemoteCacheBgDownloadStageSecondsHistogram( + metric_file_type, + TiFlashMetrics::RemoteCacheDownloadStageMetric::QueueWait) + .Observe(seconds); + break; + case BgDownloadStage::Download: + metrics + .getRemoteCacheBgDownloadStageSecondsHistogram( + metric_file_type, + TiFlashMetrics::RemoteCacheDownloadStageMetric::Download) + .Observe(seconds); + break; + } +} + } // namespace std::unique_ptr FileCache::global_file_cache_instance; @@ -746,10 +831,10 @@ std::vector FileCache::getEvictFileTypes(FileType evict_for, bool evic std::vector evict_types; evict_types.push_back(evict_for); // First, try evict with the same file type. // Second, try evict from the lower priority file type. - for (auto file_type : all_file_types | std::views::reverse) + for (auto itr = std::rbegin(all_file_types); itr != std::rend(all_file_types); ++itr) { - if (file_type != evict_for) - evict_types.push_back(file_type); + if (*itr != evict_for) + evict_types.push_back(*itr); } return evict_types; } @@ -757,10 +842,10 @@ std::vector FileCache::getEvictFileTypes(FileType evict_for, bool evic { std::vector evict_types; // Evict from the lower priority file type first. 
- for (auto file_type : all_file_types | std::views::reverse) + for (auto itr = std::rbegin(all_file_types); itr != std::rend(all_file_types); ++itr) { - evict_types.push_back(file_type); - if (file_type == evict_for) + evict_types.push_back(*itr); + if (*itr == evict_for) { // Do not evict higher priority file type break; @@ -1188,7 +1273,12 @@ void FileCache::bgDownloadExecutor( const WriteLimiterPtr & write_limiter, std::chrono::steady_clock::time_point enqueue_time) { - UNUSED(enqueue_time); + observeBgDownloadStageMetrics( + file_seg->getFileType(), + BgDownloadStage::QueueWait, + std::chrono::duration_cast>(std::chrono::steady_clock::now() - enqueue_time) + .count()); + Stopwatch download_watch; try { GET_METRIC(tiflash_storage_remote_cache, type_dtfile_download).Increment(); @@ -1199,6 +1289,7 @@ void FileCache::bgDownloadExecutor( // ignore the exception here, and log as warning. tryLogCurrentWarningException(log, fmt::format("Download s3_key={} failed", s3_key)); } + observeBgDownloadStageMetrics(file_seg->getFileType(), BgDownloadStage::Download, download_watch.elapsedSeconds()); if (!file_seg->isReadyToRead()) { file_seg->setStatus(FileSegment::Status::Failed); diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index cb54af751a3..b5a746e293d 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -1230,6 +1230,83 @@ TEST_F(FileCacheTest, BgDownloadRespectsS3StreamLimiter) ASSERT_EQ(limiter->activeStreams(), 0); } +TEST_F(FileCacheTest, GetWaitOnDownloadingSupportsColDataAndOther) +{ + auto cache_dir = fmt::format("{}/wait_on_downloading_non_merged", tmp_dir); + StorageRemoteCacheConfig cache_config{.dir = cache_dir, .capacity = cache_capacity, .dtfile_level = 100}; + + UInt16 vcores = 2; + IORateLimiter rate_limiter; + FileCache file_cache(capacity_metrics, cache_config, vcores, rate_limiter); + + Settings settings; + 
settings.dt_filecache_downloading_count_scale = 2.0; + settings.dt_filecache_max_downloading_count_scale = 2.0; + settings.dt_filecache_wait_on_downloading_ms = 200; + file_cache.updateConfig(settings); + + auto objects = genObjects(/*store_count*/ 1, /*table_count*/ 1, /*file_count*/ 1, {"1.dat", "meta"}); + auto sp_download = SyncPointCtl::enableInScope("before_FileCache::downloadImpl_download_to_local"); + + auto run_wait_hit_case = [&](const ObjectInfo & obj, FileType expected_file_type) { + auto key = S3FilenameView::fromKey(obj.key); + ASSERT_EQ(file_cache.get(key, obj.size), nullptr); + sp_download.waitAndPause(); + + auto wait_hit = std::async(std::launch::async, [&]() { return file_cache.get(key, obj.size); }); + std::this_thread::sleep_for(20ms); + sp_download.next(); + + auto file_seg = wait_hit.get(); + ASSERT_NE(file_seg, nullptr); + ASSERT_TRUE(file_seg->isReadyToRead()); + ASSERT_EQ(file_seg->getSize(), obj.size); + ASSERT_EQ(file_seg->getFileType(), expected_file_type); + }; + + run_wait_hit_case(objects[0], FileType::ColData); + run_wait_hit_case(objects[1], FileType::Meta); + + sp_download.disable(); + waitForBgDownload(file_cache); +} + +TEST_F(FileCacheTest, BgDownloadSupportsColDataAndOther) +{ + auto cache_dir = fmt::format("{}/bg_download_non_merged", tmp_dir); + StorageRemoteCacheConfig cache_config{.dir = cache_dir, .capacity = cache_capacity, .dtfile_level = 100}; + + UInt16 vcores = 2; + IORateLimiter rate_limiter; + FileCache file_cache(capacity_metrics, cache_config, vcores, rate_limiter); + Settings settings; + settings.dt_filecache_downloading_count_scale = 2.0; + settings.dt_filecache_max_downloading_count_scale = 2.0; + file_cache.updateConfig(settings); + + auto objects = genObjects(/*store_count*/ 1, /*table_count*/ 1, /*file_count*/ 1, {"2.dat", "meta"}); + + for (const auto & obj : objects) + { + auto key = S3FilenameView::fromKey(obj.key); + ASSERT_EQ(file_cache.get(key, obj.size), nullptr); + } + + 
waitForBgDownload(file_cache); + + std::array expected_file_types = {FileType::ColData, FileType::Meta}; + size_t index = 0; + for (const auto & obj : objects) + { + auto file_seg = file_cache.get(S3FilenameView::fromKey(obj.key), obj.size); + ASSERT_NE(file_seg, nullptr); + ASSERT_TRUE(file_seg->isReadyToRead()); + ASSERT_EQ(file_seg->getSize(), obj.size); + ASSERT_EQ(file_seg->getFileType(), expected_file_types[index]); + ++index; + } +} + TEST_F(FileCacheTest, GetBeingBlock) { auto cache_dir = fmt::format("{}/update_config", tmp_dir); From 099b2d9258f655f2113b3a3c13d91bb82a30e6c7 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 13:13:09 +0800 Subject: [PATCH 05/36] disagg: remove duplicate S3 active stream metric --- dbms/src/Common/CurrentMetrics.cpp | 1 - dbms/src/Storages/S3/S3ReadLimiter.cpp | 7 ------- 2 files changed, 8 deletions(-) diff --git a/dbms/src/Common/CurrentMetrics.cpp b/dbms/src/Common/CurrentMetrics.cpp index 97fd4987f9a..5cb62909ae1 100644 --- a/dbms/src/Common/CurrentMetrics.cpp +++ b/dbms/src/Common/CurrentMetrics.cpp @@ -86,7 +86,6 @@ M(RegionPersisterRunMode) \ M(S3Requests) \ M(S3RandomAccessFile) \ - M(S3ActiveGetObjectStreams) \ M(GlobalStorageRunMode) \ M(GlobalThread) \ M(GlobalThreadActive) \ diff --git a/dbms/src/Storages/S3/S3ReadLimiter.cpp b/dbms/src/Storages/S3/S3ReadLimiter.cpp index 0a70737c28c..b10c6935b58 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.cpp +++ b/dbms/src/Storages/S3/S3ReadLimiter.cpp @@ -20,11 +20,6 @@ #include #include -namespace CurrentMetrics -{ -extern const Metric S3ActiveGetObjectStreams; -} // namespace CurrentMetrics - namespace DB::S3 { namespace @@ -121,7 +116,6 @@ std::unique_ptr S3ReadLimiter::acquireStream() return nullptr; auto cur = active_streams.fetch_add(1, std::memory_order_relaxed) + 1; - CurrentMetrics::add(CurrentMetrics::S3ActiveGetObjectStreams); GET_METRIC(tiflash_storage_s3_read_limiter_status, type_active_get_object_streams).Set(cur); return 
std::make_unique(this); } @@ -207,7 +201,6 @@ void S3ReadLimiter::setStop() void S3ReadLimiter::releaseStream() { auto cur = active_streams.fetch_sub(1, std::memory_order_relaxed) - 1; - CurrentMetrics::sub(CurrentMetrics::S3ActiveGetObjectStreams); GET_METRIC(tiflash_storage_s3_read_limiter_status, type_active_get_object_streams).Set(cur); stream_cv.notify_one(); } From df725754a3d773f01d668be4bf05610bdc962b92 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 13:50:32 +0800 Subject: [PATCH 06/36] disagg: share file cache wait implementation --- dbms/src/Storages/S3/FileCache.cpp | 52 +++++++++++++++++++----------- dbms/src/Storages/S3/FileCache.h | 3 ++ 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 37cdbd836eb..b7b36e551ba 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -206,35 +206,58 @@ void observeBgDownloadStageMetrics(FileType file_type, BgDownloadStage stage, do std::unique_ptr FileCache::global_file_cache_instance; FileSegment::Status FileSegment::waitForNotEmpty() +{ + // Foreground callers expect the file to become readable eventually. This path keeps logging + // slow waits and fails hard after the built-in timeout instead of silently returning `Empty`. + return waitForNotEmptyImpl(std::nullopt, /*log_progress*/ true, /*throw_on_timeout*/ true); +} + +FileSegment::Status FileSegment::waitForNotEmptyFor(std::chrono::milliseconds timeout) +{ + // Bounded-wait callers treat timeout as a normal outcome and will fall back to another path, + // so this variant waits only once for the specified budget and returns the current status. 
+ return waitForNotEmptyImpl(timeout, /*log_progress*/ false, /*throw_on_timeout*/ false); +} + +FileSegment::Status FileSegment::waitForNotEmptyImpl( + std::optional timeout, + bool log_progress, + bool throw_on_timeout) { std::unique_lock lock(mtx); if (status != Status::Empty) return status; - PerfContext::file_cache.fg_wait_download_from_s3++; + if (log_progress) + PerfContext::file_cache.fg_wait_download_from_s3++; Stopwatch watch; while (true) { + auto wait_interval = timeout.value_or(std::chrono::seconds(default_wait_log_interval_seconds)); SYNC_FOR("before_FileSegment::waitForNotEmpty_wait"); // just before actual waiting... - auto is_done = cv_ready.wait_for(lock, std::chrono::seconds(default_wait_log_interval_seconds), [&] { - return status != Status::Empty; - }); + auto is_done = cv_ready.wait_for(lock, wait_interval, [&] { return status != Status::Empty; }); if (is_done) break; + if (timeout.has_value()) + break; + double elapsed_secs = watch.elapsedSeconds(); - LOG_WARNING( - Logger::get(), - "FileCache is still waiting FileSegment ready, file={} elapsed={}s", - local_fname, - elapsed_secs); + if (log_progress) + { + LOG_WARNING( + Logger::get(), + "FileCache is still waiting FileSegment ready, file={} elapsed={}s", + local_fname, + elapsed_secs); + } // Snapshot time is 300s - if (elapsed_secs > wait_ready_timeout_seconds) + if (throw_on_timeout && elapsed_secs > wait_ready_timeout_seconds) { throw Exception( ErrorCodes::S3_ERROR, @@ -247,15 +270,6 @@ FileSegment::Status FileSegment::waitForNotEmpty() return status; } -FileSegment::Status FileSegment::waitForNotEmptyFor(std::chrono::milliseconds timeout) -{ - std::unique_lock lock(mtx); - if (status != Status::Empty) - return status; - cv_ready.wait_for(lock, timeout, [&] { return status != Status::Empty; }); - return status; -} - void CacheSizeHistogram::addFileSegment(const FileSegmentPtr & file_seg) { if (!file_seg) diff --git a/dbms/src/Storages/S3/FileCache.h b/dbms/src/Storages/S3/FileCache.h 
index 2fa089408bf..31238a3f159 100644 --- a/dbms/src/Storages/S3/FileCache.h +++ b/dbms/src/Storages/S3/FileCache.h @@ -36,6 +36,7 @@ #include #include #include +#include #include namespace DB @@ -149,6 +150,8 @@ class FileSegment } private: + Status waitForNotEmptyImpl(std::optional timeout, bool log_progress, bool throw_on_timeout); + mutable std::mutex mtx; const String local_fname; Status status; From d074a7b7f60ddb95127d2f0418a59b2a581a1906 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 14:10:52 +0800 Subject: [PATCH 07/36] disagg: clarify file cache get paths --- dbms/src/Storages/S3/FileCache.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index b7b36e551ba..8b2f567fd7c 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -501,25 +501,27 @@ FileSegmentPtr FileCache::get(const S3::S3FilenameView & s3_fname, const std::op f->setLastAccessTime(std::chrono::system_clock::now()); if (f->isReadyToRead()) { + // Hot-cache fast path: the file is already materialized locally, so return the existing segment + // immediately without touching any download scheduling or bounded-wait logic. GET_METRIC(tiflash_storage_remote_cache, type_dtfile_hit).Increment(); return f; } - else + + // Another thread is already downloading the same object. Optionally wait for a bounded time and + // reuse that result instead of opening one more `GetObject` stream for the same key. + wait_ms = wait_on_downloading_ms.load(std::memory_order_relaxed); + if (wait_ms == 0) { - // Another thread is already downloading the same object. Optionally wait for a bounded time and - // reuse that result instead of opening one more `GetObject` stream for the same key. 
- wait_ms = wait_on_downloading_ms.load(std::memory_order_relaxed); - if (wait_ms == 0) - { - GET_METRIC(tiflash_storage_remote_cache, type_dtfile_miss).Increment(); - return nullptr; - } - file_seg = f; + GET_METRIC(tiflash_storage_remote_cache, type_dtfile_miss).Increment(); + return nullptr; } + file_seg = f; } else { GET_METRIC(tiflash_storage_remote_cache, type_dtfile_miss).Increment(); + // Admission control before any reservation work: skip file types that should never enter FileCache, + // and stop creating new `Empty` placeholders once background downloading is already saturated. switch (canCache(file_type)) { case ShouldCacheRes::RejectTypeNotMatch: @@ -562,6 +564,9 @@ FileSegmentPtr FileCache::get(const S3::S3FilenameView & s3_fname, const std::op if (wait_ms != 0) { + // Follower path: another thread already inserted the `Empty` segment and is downloading this key. + // Wait only for the configured bounded budget, then either reuse the completed file or return miss + // so the caller can fall back without opening a duplicate download stream for the same object. Stopwatch wait_watch; auto status = file_seg->waitForNotEmptyFor(std::chrono::milliseconds(wait_ms)); const auto waited_bytes = filesize.value_or(file_seg->getSize()); @@ -577,6 +582,8 @@ FileSegmentPtr FileCache::get(const S3::S3FilenameView & s3_fname, const std::op status == FileSegment::Status::Failed ? WaitResult::Failed : WaitResult::Timeout, waited_bytes, wait_watch.elapsedSeconds()); + // Timeout is intentionally surfaced as a cache miss here. The caller can fall back to another read path, + // while the original downloader keeps making progress in background instead of being duplicated by followers. 
GET_METRIC(tiflash_storage_remote_cache, type_dtfile_miss).Increment(); return nullptr; } From 03f2d091dbf15cfadd6347d0841e912475bd5448 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 14:49:44 +0800 Subject: [PATCH 08/36] disagg: handle failed bounded wait downloads --- dbms/src/Common/FailPoint.cpp | 1 + dbms/src/Storages/S3/FileCache.cpp | 8 ++- .../src/Storages/S3/tests/gtest_filecache.cpp | 62 +++++++++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/dbms/src/Common/FailPoint.cpp b/dbms/src/Common/FailPoint.cpp index 6795e6bd67e..4146829b0d7 100644 --- a/dbms/src/Common/FailPoint.cpp +++ b/dbms/src/Common/FailPoint.cpp @@ -115,6 +115,7 @@ namespace DB M(exception_when_fetch_disagg_pages) \ M(cop_send_failure) \ M(file_cache_fg_download_fail) \ + M(file_cache_bg_download_fail) \ M(force_set_parallel_prehandle_threshold) \ M(force_raise_prehandle_exception) \ M(force_agg_on_partial_block) \ diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 8b2f567fd7c..3331c2372ff 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -77,6 +77,7 @@ extern const int FILE_DOESNT_EXIST; namespace DB::FailPoints { extern const char file_cache_fg_download_fail[]; +extern const char file_cache_bg_download_fail[]; } // namespace DB::FailPoints namespace DB @@ -1262,6 +1263,7 @@ void FileCache::downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, c prepareParentDir(local_fname); auto temp_fname = toTemporaryFilename(local_fname); SYNC_FOR("before_FileCache::downloadImpl_download_to_local"); + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::file_cache_bg_download_fail); downloadToLocal(result.GetBody(), temp_fname, content_length, write_limiter, read_limiter); std::filesystem::rename(temp_fname, local_fname); @@ -1317,7 +1319,9 @@ void FileCache::bgDownloadExecutor( GET_METRIC(tiflash_storage_remote_cache, type_dtfile_download_failed).Increment(); 
bg_download_fail_count.fetch_add(1, std::memory_order_relaxed); file_seg.reset(); - remove(s3_key); + // Followers may still hold the failed segment while waking up from bounded wait. Force removal so + // the failed placeholder does not stay published in the cache table and block later retries. + remove(s3_key, /*force*/ true); } else { @@ -1372,7 +1376,7 @@ void FileCache::fgDownload(const String & s3_key, FileSegmentPtr & file_seg) file_seg->setStatus(FileSegment::Status::Failed); GET_METRIC(tiflash_storage_remote_cache, type_dtfile_download_failed).Increment(); file_seg.reset(); - remove(s3_key); + remove(s3_key, /*force*/ true); } LOG_DEBUG(log, "foreground downloading => s3_key {} finished", s3_key); diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index b5a746e293d..80235d159c5 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -53,6 +54,11 @@ namespace DB::ErrorCodes extern const int FILE_DOESNT_EXIST; } +namespace DB::FailPoints +{ +extern const char file_cache_bg_download_fail[]; +} + namespace DB::tests::S3 { class FileCacheTest : public ::testing::Test @@ -1173,9 +1179,11 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingHitAndTimeout) auto sp_download = SyncPointCtl::enableInScope("before_FileCache::downloadImpl_download_to_local"); auto first_key = S3FilenameView::fromKey(objects[0].key); + // First request publishes the `Empty` placeholder and starts the background download. ASSERT_EQ(file_cache.get(first_key, objects[0].size), nullptr); sp_download.waitAndPause(); + // With a generous bounded-wait budget, the follower should reuse the downloader result instead of returning miss. 
auto wait_hit = std::async(std::launch::async, [&]() { return file_cache.get(first_key, objects[0].size); }); std::this_thread::sleep_for(20ms); sp_download.next(); @@ -1186,6 +1194,7 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingHitAndTimeout) settings.dt_filecache_wait_on_downloading_ms = 30; file_cache.updateConfig(settings); auto second_key = S3FilenameView::fromKey(objects[1].key); + // Re-run the same pattern with a much smaller budget so the follower times out and returns miss. ASSERT_EQ(file_cache.get(second_key, objects[1].size), nullptr); sp_download.waitAndPause(); auto wait_timeout = std::async(std::launch::async, [&]() { return file_cache.get(second_key, objects[1].size); }); @@ -1196,6 +1205,57 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingHitAndTimeout) waitForBgDownload(file_cache); } +TEST_F(FileCacheTest, GetWaitOnDownloadingReturnsMissWhenDownloaderFails) +{ + auto cache_dir = fmt::format("{}/wait_on_downloading_failed", tmp_dir); + StorageRemoteCacheConfig cache_config{.dir = cache_dir, .capacity = cache_capacity, .dtfile_level = 100}; + + UInt16 vcores = 2; + IORateLimiter rate_limiter; + FileCache file_cache(capacity_metrics, cache_config, vcores, rate_limiter); + + Settings settings; + settings.dt_filecache_downloading_count_scale = 2.0; + settings.dt_filecache_max_downloading_count_scale = 2.0; + settings.dt_filecache_wait_on_downloading_ms = 200; + file_cache.updateConfig(settings); + + auto objects = genObjects(/*store_count*/ 1, /*table_count*/ 1, /*file_count*/ 1, {"1.merged"}); + auto sp_download = SyncPointCtl::enableInScope("before_FileCache::downloadImpl_download_to_local"); + + auto key = S3FilenameView::fromKey(objects[0].key); + // First caller creates the `Empty` placeholder and starts the background download. + ASSERT_EQ(file_cache.get(key, objects[0].size), nullptr); + sp_download.waitAndPause(); + + // The follower reaches `get()` while the same key is still being downloaded. 
Inject a failure right before + // the downloader starts copying the body so the follower wakes up with `Status::Failed` and returns miss. + FailPointHelper::enableFailPoint(FailPoints::file_cache_bg_download_fail); + auto wait_failed = std::async(std::launch::async, [&]() { return file_cache.get(key, objects[0].size); }); + std::this_thread::sleep_for(20ms); + sp_download.next(); + ASSERT_EQ(wait_failed.get(), nullptr); + FailPointHelper::disableFailPoint(FailPoints::file_cache_bg_download_fail); + sp_download.disable(); + + waitForBgDownload(file_cache); + ASSERT_EQ(file_cache.bg_download_fail_count.load(std::memory_order_relaxed), 1); + + // The failed placeholder must be removed from the cache table. Otherwise later requests would keep observing + // the stale failed entry instead of creating a fresh download task. + { + std::lock_guard lock(file_cache.mtx); + auto & table = file_cache.tables[static_cast(FileType::Merged)]; + ASSERT_EQ(table.get(objects[0].key), nullptr); + } + + // A later foreground retry should succeed, proving the failed follower path does not leave the cache stuck. + auto file_seg = file_cache.getOrWait(key, objects[0].size); + ASSERT_NE(file_seg, nullptr); + ASSERT_TRUE(file_seg->isReadyToRead()); + ASSERT_EQ(file_seg->getSize(), objects[0].size); +} + TEST_F(FileCacheTest, BgDownloadRespectsS3StreamLimiter) { auto cache_dir = fmt::format("{}/bg_download_limiter", tmp_dir); @@ -1250,6 +1310,8 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingSupportsColDataAndOther) auto run_wait_hit_case = [&](const ObjectInfo & obj, FileType expected_file_type) { auto key = S3FilenameView::fromKey(obj.key); + // The first request creates the placeholder, and the second request should hit the same bounded-wait path + // regardless of whether the file is classified as coldata or other. 
ASSERT_EQ(file_cache.get(key, obj.size), nullptr); sp_download.waitAndPause(); From 337b5d0df471efb75387a2da4379d04360894d47 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 15:18:14 +0800 Subject: [PATCH 09/36] disagg: reuse copyData in limited cache downloads --- dbms/src/Storages/S3/FileCache.cpp | 91 ++++++++++++++++++------------ 1 file changed, 56 insertions(+), 35 deletions(-) diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 3331c2372ff..0d797f58b25 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -88,7 +88,47 @@ namespace { constexpr UInt64 default_wait_log_interval_seconds = 30; constexpr UInt64 wait_ready_timeout_seconds = 300; -constexpr size_t s3_download_limiter_buffer_size = 128 * 1024; + +// A tiny FileCache-only ReadBuffer variant that charges the shared S3 limiter before each refill. +// This lets downloadToLocal keep using the existing copyData/write-buffer path instead of maintaining +// a separate hand-written read/write loop for limiter-enabled downloads. 
+class ReadBufferFromIStreamWithLimiter : public BufferWithOwnMemory +{ +public: + ReadBufferFromIStreamWithLimiter( + std::istream & istr_, + size_t size, + const std::shared_ptr & limiter_, + S3::S3ReadSource source_) + : BufferWithOwnMemory(size) + , istr(istr_) + , limiter(limiter_) + , source(source_) + {} + +private: + bool nextImpl() override + { + if (limiter != nullptr) + limiter->requestBytes(internal_buffer.size(), source); + + istr.read(internal_buffer.begin(), internal_buffer.size()); + auto gcount = istr.gcount(); + if (!gcount) + { + if (istr.eof()) + return false; + throw Exception("Cannot read from istream", ErrorCodes::CANNOT_READ_FROM_ISTREAM); + } + + working_buffer.resize(gcount); + return true; + } + + std::istream & istr; + std::shared_ptr limiter; + S3::S3ReadSource source; +}; enum class WaitResult { @@ -1177,7 +1217,7 @@ void downloadToLocal( const String & fname, Int64 content_length, const WriteLimiterPtr & write_limiter, - const std::shared_ptr & read_limiter) + const std::shared_ptr & s3_read_limiter) { // create an empty file with write_limiter // each time `ofile.write` is called, the write speed will be controlled by the write_limiter. 
@@ -1187,54 +1227,35 @@ void downloadToLocal( return; GET_METRIC(tiflash_storage_remote_cache_bytes, type_dtfile_download_bytes).Increment(content_length); - if (read_limiter == nullptr) + constexpr Int64 max_buffer_size = 128 * 1024; // 128 KiB + auto buffer_size = std::min(content_length, max_buffer_size); + if (s3_read_limiter == nullptr) { - static const Int64 MAX_BUFFER_SIZE = 128 * 1024; // 128k - ReadBufferFromIStream rbuf(istr, std::min(content_length, MAX_BUFFER_SIZE)); - WriteBufferFromWritableFile wbuf(ofile, std::min(content_length, MAX_BUFFER_SIZE)); + ReadBufferFromIStream rbuf(istr, buffer_size); + WriteBufferFromWritableFile wbuf(ofile, buffer_size); copyData(rbuf, wbuf, content_length); wbuf.sync(); return; } - std::array buffer{}; - Int64 remaining = content_length; - while (remaining > 0) - { - // Avoid `ReadBufferFromIStream` here so FileCache downloads can charge the shared S3 byte limiter per chunk - // without introducing extra heap allocation on the hot path. - auto to_read = std::min(remaining, static_cast(buffer.size())); - read_limiter->requestBytes(to_read, S3::S3ReadSource::FileCacheDownload); - istr.read(buffer.data(), to_read); - auto gcount = istr.gcount(); - RUNTIME_CHECK_MSG(gcount >= 0, "negative gcount for remote download"); - if (gcount == 0) - break; - auto written = ofile->write(buffer.data(), gcount); - RUNTIME_CHECK(written == gcount, fname, written, gcount); - remaining -= gcount; - if (gcount < to_read) - break; - } - RUNTIME_CHECK_MSG( - remaining == 0, - "download {} incomplete, remaining={} content_length={}", - fname, - remaining, - content_length); - ofile->fsync(); + // The limiter-aware buffer preserves the old copyData/write-buffer path while charging the shared + // S3 budget before each refill from the remote body stream. 
+ ReadBufferFromIStreamWithLimiter rbuf(istr, buffer_size, s3_read_limiter, S3::S3ReadSource::FileCacheDownload); + WriteBufferFromWritableFile wbuf(ofile, buffer_size); + copyData(rbuf, wbuf, content_length); + wbuf.sync(); } void FileCache::downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, const WriteLimiterPtr & write_limiter) { Stopwatch sw; auto client = S3::ClientFactory::instance().sharedTiFlashClient(); - auto read_limiter = client->getS3ReadLimiter(); + auto s3_read_limiter = client->getS3ReadLimiter(); Aws::S3::Model::GetObjectRequest req; client->setBucketAndKeyWithRoot(req, s3_key); ProfileEvents::increment(ProfileEvents::S3GetObject); // Limit live background-download streams with the same token used by direct readers. - auto stream_token = read_limiter != nullptr ? read_limiter->acquireStream() : nullptr; + auto stream_token = s3_read_limiter != nullptr ? s3_read_limiter->acquireStream() : nullptr; auto outcome = client->GetObject(req); if (!outcome.IsSuccess()) { @@ -1264,7 +1285,7 @@ void FileCache::downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, c auto temp_fname = toTemporaryFilename(local_fname); SYNC_FOR("before_FileCache::downloadImpl_download_to_local"); FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::file_cache_bg_download_fail); - downloadToLocal(result.GetBody(), temp_fname, content_length, write_limiter, read_limiter); + downloadToLocal(result.GetBody(), temp_fname, content_length, write_limiter, s3_read_limiter); std::filesystem::rename(temp_fname, local_fname); #ifndef NDEBUG From 72fd48a72b853b3b6822084ce2152c8328bc674e Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 15:39:41 +0800 Subject: [PATCH 10/36] disagg: clarify conservative S3 read charging --- dbms/src/Storages/S3/FileCache.cpp | 11 ++++++++--- dbms/src/Storages/S3/S3RandomAccessFile.cpp | 4 +++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 
0d797f58b25..b9e0a9abdb8 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -86,9 +86,6 @@ using FileType = FileSegment::FileType; namespace { -constexpr UInt64 default_wait_log_interval_seconds = 30; -constexpr UInt64 wait_ready_timeout_seconds = 300; - // A tiny FileCache-only ReadBuffer variant that charges the shared S3 limiter before each refill. // This lets downloadToLocal keep using the existing copyData/write-buffer path instead of maintaining // a separate hand-written read/write loop for limiter-enabled downloads. @@ -110,7 +107,12 @@ class ReadBufferFromIStreamWithLimiter : public BufferWithOwnMemory bool nextImpl() override { if (limiter != nullptr) + { + // Charge the requested refill size before the actual `istream.read()`. This is intentionally + // conservative: short reads still spend the full reserved budget for this refill. If we need tighter + // accounting later, we can extend this path to compensate with the actual bytes read back from S3. limiter->requestBytes(internal_buffer.size(), source); + } istr.read(internal_buffer.begin(), internal_buffer.size()); auto gcount = istr.gcount(); @@ -265,6 +267,9 @@ FileSegment::Status FileSegment::waitForNotEmptyImpl( bool log_progress, bool throw_on_timeout) { + constexpr UInt64 default_wait_log_interval_seconds = 30; + constexpr UInt64 wait_ready_timeout_seconds = 300; + std::unique_lock lock(mtx); if (status != Status::Empty) diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.cpp b/dbms/src/Storages/S3/S3RandomAccessFile.cpp index 79b688a91ac..3b7cb6a30f3 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.cpp +++ b/dbms/src/Storages/S3/S3RandomAccessFile.cpp @@ -200,7 +200,9 @@ ssize_t S3RandomAccessFile::readChunked(char * buf, size_t size) while (total_gcount < size) { // The limiter charges requested bytes before the actual stream read so direct reads and FileCache downloads - // compete for the same node-level remote-read budget. 
+ // compete for the same node-level remote-read budget. This is intentionally conservative: a short read still + // spends the full requested budget for this chunk. If we need tighter accounting later, we can add a + // compensation path based on the actual bytes read back from S3. auto to_read = std::min(size - total_gcount, static_cast(chunk_size)); read_limiter->requestBytes(to_read, S3ReadSource::DirectRead); istr.read(buf + total_gcount, to_read); From d1ac68b6cc0ea6c1fdad063b41d5f0ab9ce41a78 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 15:51:35 +0800 Subject: [PATCH 11/36] disagg: share S3 random access read and seek tails --- dbms/src/Storages/S3/S3RandomAccessFile.cpp | 143 ++++++-------------- dbms/src/Storages/S3/S3RandomAccessFile.h | 5 + 2 files changed, 45 insertions(+), 103 deletions(-) diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.cpp b/dbms/src/Storages/S3/S3RandomAccessFile.cpp index 3b7cb6a30f3..49a50514de4 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.cpp +++ b/dbms/src/Storages/S3/S3RandomAccessFile.cpp @@ -136,57 +136,7 @@ ssize_t S3RandomAccessFile::readImpl(char * buf, size_t size) ProfileEvents::increment(ProfileEvents::S3IORead, 1); auto & istr = read_result.GetBody(); istr.read(buf, size); - size_t gcount = istr.gcount(); - - fiu_do_on(FailPoints::force_s3_random_access_file_read_fail, { - LOG_WARNING(log, "failpoint force_s3_random_access_file_read_fail is triggered, return S3StreamError"); - return S3StreamError; - }); - - // Theoretically, `istr.eof()` is equivalent to `cur_offset + gcount != static_cast(content_length)`. - // It's just a double check for more safety. 
- if (gcount < size && (!istr.eof() || cur_offset + gcount != static_cast(content_length))) - { - ProfileEvents::increment(ProfileEvents::S3IOReadError); - auto state = istr.rdstate(); - auto elapsed_secs = sw.elapsedSeconds(); - GET_METRIC(tiflash_storage_s3_request_seconds, type_read_stream_err).Observe(elapsed_secs); - LOG_WARNING( - log, - "Cannot read from istream, size={} gcount={} state=0x{:02X} cur_offset={} content_length={} " - "errno={} errmsg={} cost={:.6f}s", - size, - gcount, - state, - cur_offset, - content_length, - errno, - strerror(errno), - elapsed_secs); - return (state & std::ios_base::failbit || state & std::ios_base::badbit) ? S3StreamError : S3UnknownError; - } - - auto elapsed_secs = sw.elapsedSeconds(); - if (scan_context) - { - scan_context->disagg_s3file_read_time_ms += elapsed_secs * 1000; - scan_context->disagg_s3file_read_count += 1; - scan_context->disagg_s3file_read_bytes += gcount; - } - GET_METRIC(tiflash_storage_s3_request_seconds, type_read_stream).Observe(elapsed_secs); - if (elapsed_secs > 0.01) // 10ms - { - LOG_DEBUG( - log, - "gcount={} cur_offset={} content_length={} cost={:.3f}s", - gcount, - cur_offset, - content_length, - elapsed_secs); - } - cur_offset += gcount; - ProfileEvents::increment(ProfileEvents::S3ReadBytes, gcount); - return gcount; + return finalizeRead(size, istr.gcount(), sw, istr); } ssize_t S3RandomAccessFile::readChunked(char * buf, size_t size) @@ -212,12 +162,21 @@ ssize_t S3RandomAccessFile::readChunked(char * buf, size_t size) break; } + return finalizeRead(size, total_gcount, sw, istr); +} + +ssize_t S3RandomAccessFile::finalizeRead(size_t requested_size, size_t actual_size, const Stopwatch & sw, std::istream & istr) +{ + // Keep the post-read handling shared so limiter and non-limiter paths emit identical retries, logging and + // observability signals. 
fiu_do_on(FailPoints::force_s3_random_access_file_read_fail, { LOG_WARNING(log, "failpoint force_s3_random_access_file_read_fail is triggered, return S3StreamError"); return S3StreamError; }); - if (total_gcount < size && (!istr.eof() || cur_offset + total_gcount != static_cast(content_length))) + // Theoretically, `istr.eof()` is equivalent to `cur_offset + actual_size != static_cast(content_length)`. + // It's just a double check for more safety. + if (actual_size < requested_size && (!istr.eof() || cur_offset + actual_size != static_cast(content_length))) { ProfileEvents::increment(ProfileEvents::S3IOReadError); auto state = istr.rdstate(); @@ -227,8 +186,8 @@ ssize_t S3RandomAccessFile::readChunked(char * buf, size_t size) log, "Cannot read from istream, size={} gcount={} state=0x{:02X} cur_offset={} content_length={} " "errno={} errmsg={} cost={:.6f}s", - size, - total_gcount, + requested_size, + actual_size, state, cur_offset, content_length, @@ -243,7 +202,7 @@ ssize_t S3RandomAccessFile::readChunked(char * buf, size_t size) { scan_context->disagg_s3file_read_time_ms += elapsed_secs * 1000; scan_context->disagg_s3file_read_count += 1; - scan_context->disagg_s3file_read_bytes += total_gcount; + scan_context->disagg_s3file_read_bytes += actual_size; } GET_METRIC(tiflash_storage_s3_request_seconds, type_read_stream).Observe(elapsed_secs); if (elapsed_secs > 0.01) @@ -251,14 +210,14 @@ ssize_t S3RandomAccessFile::readChunked(char * buf, size_t size) LOG_DEBUG( log, "gcount={} cur_offset={} content_length={} cost={:.3f}s", - total_gcount, + actual_size, cur_offset, content_length, elapsed_secs); } - cur_offset += total_gcount; - ProfileEvents::increment(ProfileEvents::S3ReadBytes, total_gcount); - return total_gcount; + cur_offset += actual_size; + ProfileEvents::increment(ProfileEvents::S3ReadBytes, actual_size); + return actual_size; } off_t S3RandomAccessFile::seek(off_t offset_, int whence) @@ -309,43 +268,9 @@ off_t S3RandomAccessFile::seekImpl(off_t 
offset_, int whence) Stopwatch sw; ProfileEvents::increment(ProfileEvents::S3IOSeek, 1); auto & istr = read_result.GetBody(); - if (!istr.ignore(offset_ - cur_offset)) - { - ProfileEvents::increment(ProfileEvents::S3IOSeekError); - auto state = istr.rdstate(); - auto elapsed_secs = sw.elapsedSeconds(); - GET_METRIC(tiflash_storage_s3_request_seconds, type_read_stream_err).Observe(elapsed_secs); - LOG_WARNING( - log, - "Cannot ignore from istream, state=0x{:02X}, errno={} errmsg={} cost={:.6f}s", - state, - errno, - strerror(errno), - elapsed_secs); - return (state & std::ios_base::failbit || state & std::ios_base::badbit) ? S3StreamError : S3UnknownError; - } - - auto elapsed_secs = sw.elapsedSeconds(); - if (scan_context) - { - scan_context->disagg_s3file_seek_time_ms += elapsed_secs * 1000; - scan_context->disagg_s3file_seek_count += 1; - scan_context->disagg_s3file_seek_bytes += offset_ - cur_offset; - } - GET_METRIC(tiflash_storage_s3_request_seconds, type_read_stream).Observe(elapsed_secs); - if (elapsed_secs > 0.01) // 10ms - { - LOG_DEBUG( - log, - "ignore_count={} cur_offset={} content_length={} cost={:.3f}s", - offset_ - cur_offset, - cur_offset, - content_length, - elapsed_secs); - } - ProfileEvents::increment(ProfileEvents::S3ReadBytes, offset_ - cur_offset); - cur_offset = offset_; - return cur_offset; + auto bytes_to_ignore = static_cast(offset_ - cur_offset); + istr.ignore(bytes_to_ignore); + return finalizeSeek(offset_, bytes_to_ignore, istr.gcount(), sw, istr); } off_t S3RandomAccessFile::seekChunked(off_t offset) @@ -368,7 +293,19 @@ off_t S3RandomAccessFile::seekChunked(off_t offset) break; } - if (total_ignored < bytes_to_ignore) + return finalizeSeek(offset, bytes_to_ignore, total_ignored, sw, istr); +} + +off_t S3RandomAccessFile::finalizeSeek( + off_t target_offset, + size_t requested_size, + size_t actual_size, + const Stopwatch & sw, + std::istream & istr) +{ + // Keep post-seek handling shared so limiter and non-limiter paths emit identical 
retries, logging and + // observability signals. + if (actual_size < requested_size) { ProfileEvents::increment(ProfileEvents::S3IOSeekError); auto state = istr.rdstate(); @@ -378,8 +315,8 @@ off_t S3RandomAccessFile::seekChunked(off_t offset) log, "Cannot ignore from istream, state=0x{:02X}, ignored={} expected={} errno={} errmsg={} cost={:.6f}s", state, - total_ignored, - bytes_to_ignore, + actual_size, + requested_size, errno, strerror(errno), elapsed_secs); @@ -391,7 +328,7 @@ off_t S3RandomAccessFile::seekChunked(off_t offset) { scan_context->disagg_s3file_seek_time_ms += elapsed_secs * 1000; scan_context->disagg_s3file_seek_count += 1; - scan_context->disagg_s3file_seek_bytes += bytes_to_ignore; + scan_context->disagg_s3file_seek_bytes += actual_size; } GET_METRIC(tiflash_storage_s3_request_seconds, type_read_stream).Observe(elapsed_secs); if (elapsed_secs > 0.01) @@ -399,13 +336,13 @@ off_t S3RandomAccessFile::seekChunked(off_t offset) LOG_DEBUG( log, "ignore_count={} cur_offset={} content_length={} cost={:.3f}s", - bytes_to_ignore, + actual_size, cur_offset, content_length, elapsed_secs); } - ProfileEvents::increment(ProfileEvents::S3ReadBytes, bytes_to_ignore); - cur_offset = offset; + ProfileEvents::increment(ProfileEvents::S3ReadBytes, actual_size); + cur_offset = target_offset; return cur_offset; } String S3RandomAccessFile::readRangeOfObject() diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.h b/dbms/src/Storages/S3/S3RandomAccessFile.h index e1db8aa3098..68ef80bac95 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.h +++ b/dbms/src/Storages/S3/S3RandomAccessFile.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include +#include + /// Remove the population of thread_local from Poco #ifdef thread_local #undef thread_local @@ -96,6 +99,8 @@ class S3RandomAccessFile final : public RandomAccessFile ssize_t readImpl(char * buf, size_t size); String readRangeOfObject(); ssize_t readChunked(char * buf, 
size_t size); + ssize_t finalizeRead(size_t requested_size, size_t actual_size, const Stopwatch & sw, std::istream & istr); + off_t finalizeSeek(off_t target_offset, size_t requested_size, size_t actual_size, const Stopwatch & sw, std::istream & istr); off_t seekChunked(off_t offset); void resetReadStreamToken(); From 26a87371406378f06ef04a6190df42a9e6635946 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 16:33:15 +0800 Subject: [PATCH 12/36] disagg: extend S3 read limiter unit coverage --- .../IO/BaseFile/tests/gtest_rate_limiter.cpp | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp b/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp index 376c39f5ac7..63761eb33fd 100644 --- a/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp +++ b/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp @@ -383,6 +383,7 @@ TEST(S3ReadLimiterTest, StreamTokenBlocksUntilRelease) ASSERT_NE(token1, nullptr); ASSERT_EQ(limiter->activeStreams(), 1); + // The second reader should block until the first stream token is released. auto future = std::async(std::launch::async, [&]() { return limiter->acquireStream(); }); ASSERT_EQ(future.wait_for(50ms), std::future_status::timeout); @@ -397,12 +398,78 @@ TEST(S3ReadLimiterTest, StreamTokenBlocksUntilRelease) TEST(S3ReadLimiterTest, ByteRequestsWaitForRefill) { S3::S3ReadLimiter limiter(1000, 0, 100); + // Consume the initial 100-byte burst, then verify the next request waits for at least one refill period. 
limiter.requestBytes(100, S3::S3ReadSource::DirectRead); AtomicStopwatch watch; limiter.requestBytes(100, S3::S3ReadSource::DirectRead); ASSERT_GE(watch.elapsedMilliseconds(), 80); } +TEST(S3ReadLimiterTest, UpdateConfigDisablesWaitingStream) +{ + auto limiter = std::make_shared(0, 1); + auto token1 = limiter->acquireStream(); + ASSERT_NE(token1, nullptr); + + // Once config reload disables stream limiting, a waiter should wake up and observe `nullptr` instead of hanging. + auto future = std::async(std::launch::async, [&]() { return limiter->acquireStream(); }); + ASSERT_EQ(future.wait_for(50ms), std::future_status::timeout); + + limiter->updateConfig(/*max_read_bytes_per_sec*/ 0, /*max_streams*/ 0); + auto token2 = future.get(); + ASSERT_EQ(token2, nullptr); + + token1.reset(); + ASSERT_EQ(limiter->activeStreams(), 0); +} + +TEST(S3ReadLimiterTest, UpdateConfigDisablesWaitingBytes) +{ + S3::S3ReadLimiter limiter(1000, 0, 100); + // Exhaust the initial burst, then make sure disabling the byte limit wakes a waiting requester promptly. + limiter.requestBytes(100, S3::S3ReadSource::DirectRead); + + auto future = std::async(std::launch::async, [&]() { + AtomicStopwatch watch; + limiter.requestBytes(100, S3::S3ReadSource::DirectRead); + return watch.elapsedMilliseconds(); + }); + ASSERT_EQ(future.wait_for(50ms), std::future_status::timeout); + + limiter.updateConfig(/*max_read_bytes_per_sec*/ 0, /*max_streams*/ 0); + ASSERT_LT(future.get(), 100); +} + +TEST(S3ReadLimiterTest, SuggestedChunkSizeTracksBurstLimit) +{ + // The suggested chunk size should never exceed one refill-period burst when byte limiting is enabled. 
+ S3::S3ReadLimiter limiter(/*max_read_bytes_per_sec*/ 1000, /*max_streams*/ 0, /*refill_period_ms*/ 100); + ASSERT_EQ(limiter.getSuggestedChunkSize(128 * 1024), 100); + + limiter.updateConfig(/*max_read_bytes_per_sec*/ 5000, /*max_streams*/ 0); + ASSERT_EQ(limiter.getSuggestedChunkSize(128 * 1024), 500); + + limiter.updateConfig(/*max_read_bytes_per_sec*/ 0, /*max_streams*/ 0); + ASSERT_EQ(limiter.getSuggestedChunkSize(4096), 4096); +} + +TEST(S3ReadLimiterTest, StreamTokenMoveDoesNotDoubleRelease) +{ + auto limiter = std::make_shared(0, 1); + auto token = limiter->acquireStream(); + ASSERT_NE(token, nullptr); + ASSERT_EQ(limiter->activeStreams(), 1); + + // Moving the token transfers ownership without releasing the stream slot. + auto moved = std::move(token); + ASSERT_EQ(token, nullptr); + ASSERT_EQ(limiter->activeStreams(), 1); + + // Releasing the moved token should drop the slot exactly once. + moved.reset(); + ASSERT_EQ(limiter->activeStreams(), 0); +} + #ifdef __linux__ TEST(IORateLimiterTest, IOStat) { From 963905a0116354cd80552581970873985e935d5a Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 16:55:16 +0800 Subject: [PATCH 13/36] Refine metrics Signed-off-by: JaySon-Huang --- dbms/src/Common/TiFlashMetrics.h | 36 ++++++++++++++++---------------- dbms/src/Interpreters/Settings.h | 8 +++---- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/dbms/src/Common/TiFlashMetrics.h b/dbms/src/Common/TiFlashMetrics.h index 90849e518f7..33a67dbe24d 100644 --- a/dbms/src/Common/TiFlashMetrics.h +++ b/dbms/src/Common/TiFlashMetrics.h @@ -395,6 +395,24 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_fg_write_bytes, {"type", "fg_write_bytes"}), \ F(type_bg_write_bytes, {"type", "bg_write_bytes"}), \ F(type_s3_read_bytes, {"type", "s3_read_bytes"})) \ + M(tiflash_storage_io_limiter_pending_seconds, \ + "I/O limiter pending duration in seconds", \ + Histogram, \ + F(type_fg_read, {{"type", 
"fg_read"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_bg_read, {{"type", "bg_read"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_fg_write, {{"type", "fg_write"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_bg_write, {{"type", "bg_write"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_s3_read_stream, {{"type", "s3_read_stream"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_s3_read_byte, {{"type", "s3_read_byte"}}, ExpBuckets{0.001, 2, 20})) \ + M(tiflash_storage_io_limiter_pending_count, \ + "I/O limiter pending count", \ + Counter, \ + F(type_fg_read, {"type", "fg_read"}), \ + F(type_bg_read, {"type", "bg_read"}), \ + F(type_fg_write, {"type", "fg_write"}), \ + F(type_bg_write, {"type", "bg_write"}), \ + F(type_s3_read_byte, {"type", "s3_read_byte"}), \ + F(type_s3_read_stream, {"type", "s3_read_stream"})) \ M(tiflash_storage_rough_set_filter_rate, \ "Bucketed histogram of rough set filter rate", \ Histogram, \ @@ -914,24 +932,6 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva "Remote cache status", \ Gauge, \ F(type_bg_downloading_count, {{"type", "bg_downloading_count"}})) \ - M(tiflash_storage_io_limiter_pending_seconds, \ - "I/O limiter pending duration in seconds", \ - Histogram, \ - F(type_fg_read, {{"type", "fg_read"}}, ExpBuckets{0.001, 2, 20}), \ - F(type_bg_read, {{"type", "bg_read"}}, ExpBuckets{0.001, 2, 20}), \ - F(type_fg_write, {{"type", "fg_write"}}, ExpBuckets{0.001, 2, 20}), \ - F(type_bg_write, {{"type", "bg_write"}}, ExpBuckets{0.001, 2, 20}), \ - F(type_s3_read_stream, {{"type", "s3_read_stream"}}, ExpBuckets{0.001, 2, 20}), \ - F(type_s3_read_byte, {{"type", "s3_read_byte"}}, ExpBuckets{0.001, 2, 20})) \ - M(tiflash_storage_io_limiter_pending_count, \ - "I/O limiter pending count", \ - Counter, \ - F(type_fg_read, {"type", "fg_read"}), \ - F(type_bg_read, {"type", "bg_read"}), \ - F(type_fg_write, {"type", "fg_write"}), \ - F(type_bg_write, {"type", "bg_write"}), \ - F(type_s3_read_byte, {"type", "s3_read_byte"}), \ - 
F(type_s3_read_stream, {"type", "s3_read_stream"})) \ M(tiflash_system_seconds, \ "system calls duration in seconds", \ Histogram, \ diff --git a/dbms/src/Interpreters/Settings.h b/dbms/src/Interpreters/Settings.h index fb9472131b2..eb0243f3141 100644 --- a/dbms/src/Interpreters/Settings.h +++ b/dbms/src/Interpreters/Settings.h @@ -240,17 +240,17 @@ struct Settings M(SettingBool, remote_checkpoint_only_upload_manifest, true, "Only upload manifest data when uploading checkpoint") \ M(SettingInt64, remote_gc_method, 1, "The method of running GC task on the remote store. 1 - lifecycle, 2 - scan.") \ M(SettingInt64, remote_gc_interval_seconds, 3600, "The interval of running GC task on the remote store. Unit is second.") \ - M(SettingInt64, remote_summary_interval_seconds, 0, "The interval of collecting remote S3 storage summary. Unit is second. <=0 disables periodic summary task.") \ + M(SettingInt64, remote_summary_interval_seconds, 0, "The interval of collecting remote S3 storage summary. Unit is second. 
<=0 disables periodic summary task.") \ M(SettingInt64, remote_gc_verify_consistency, 0, "[testing] Verify the consistenct of valid locks when doing GC") \ M(SettingInt64, remote_gc_min_age_seconds, 3600, "The file will NOT be compacted when the time difference between the last modification is less than this threshold") \ M(SettingDouble, remote_gc_ratio, 0.5, "The files with valid rate less than this threshold will be compacted") \ M(SettingInt64, remote_gc_small_size, 128 * 1024, "The files with total size less than this threshold will be compacted") \ /* Disagg arch reading settings */ \ M(SettingUInt64, dt_write_page_cache_limit_size, 2 * 1024 * 1024, "Limit size per write batch when compute node writing to PageStorage cache") \ - M(SettingDouble, dt_filecache_downloading_count_scale, 2.0, "Max concurrency of download task count of FileCache = number of logical cpu cores * dt_filecache_downloading_count_scale.") \ + M(SettingDouble, dt_filecache_downloading_count_scale, 2.0, "Max concurrency of download task count of FileCache = number of logical cpu cores * dt_filecache_downloading_count_scale.") \ M(SettingDouble, dt_filecache_max_downloading_count_scale, 10.0, "Max queue size of download task count of FileCache = number of logical cpu cores * dt_filecache_max_downloading_count_scale.") \ M(SettingUInt64, dt_filecache_min_age_seconds, 1800, "Files of the same priority can only be evicted from files that were not accessed within `dt_filecache_min_age_seconds` seconds.") \ - M(SettingUInt64, dt_filecache_wait_on_downloading_ms, 0, "When a remote cache lookup sees the same key is already being downloaded, wait up to this many milliseconds for that download to finish. 0 disables the bounded wait.") \ + M(SettingUInt64, dt_filecache_wait_on_downloading_ms, 0, "When a remote cache lookup sees the same key is already being downloaded, wait up to this many milliseconds for that download to finish. 
0 disables the bounded wait.") \ M(SettingBool, dt_enable_fetch_memtableset, true, "Whether fetching delta cache in FetchDisaggPages") \ M(SettingUInt64, dt_fetch_pages_packet_limit_size, 512 * 1024, "Response packet bytes limit of FetchDisaggPages, 0 means one page per packet") \ M(SettingDouble, dt_fetch_page_concurrency_scale, 4.0, "Concurrency of fetching pages of one query equals to num_streams * dt_fetch_page_concurrency_scale.") \ @@ -340,7 +340,7 @@ struct Settings M(SettingUInt64, cop_timeout_for_remote_read, 60, "cop timeout seconds for remote read") \ M(SettingUInt64, auto_spill_check_min_interval_ms, 10, "The minimum interval in millisecond between two successive auto spill check, default value is 100, 0 means no limit") \ M(SettingUInt64, join_probe_cache_columns_threshold, 1000, "The threshold that a join key will cache its output columns during probe stage, 0 means never cache") \ - M(SettingBool, enable_hash_join_v2, false, "Enable hash join v2") \ + M(SettingBool, enable_hash_join_v2, false, "Enable hash join v2") \ M(SettingUInt64, join_v2_max_block_size, 8192, "hash join v2 max block size") \ M(SettingUInt64, join_v2_probe_enable_prefetch_threshold, 1024 * 1024, "hash join v2 minimum row number of join build table to use prefetch during join probe phase") \ M(SettingUInt64, join_v2_probe_prefetch_step, 16, "hash join v2 probe prefetch length") \ From c2ffeb56fa8b952b502ae1d71d7f8a2aa6f89830 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 20:26:02 +0800 Subject: [PATCH 14/36] disagg: enrich file cache observability --- dbms/src/Common/TiFlashMetrics.cpp | 52 ++++++++++++++++++++++++++++ dbms/src/Common/TiFlashMetrics.h | 25 ++++++++++++- dbms/src/Storages/S3/FileCache.cpp | 41 +++++++++++++++++++--- dbms/src/Storages/S3/S3ReadLimiter.h | 13 +++++++ 4 files changed, 125 insertions(+), 6 deletions(-) diff --git a/dbms/src/Common/TiFlashMetrics.cpp b/dbms/src/Common/TiFlashMetrics.cpp index 765540645eb..cb3f34b262e 100644 --- 
a/dbms/src/Common/TiFlashMetrics.cpp +++ b/dbms/src/Common/TiFlashMetrics.cpp @@ -26,7 +26,9 @@ namespace { constexpr std::array remote_cache_file_type_labels = {"merged", "coldata", "other"}; constexpr std::array remote_cache_wait_result_labels = {"hit", "timeout", "failed"}; +constexpr std::array remote_cache_reject_reason_labels = {"too_many_download"}; constexpr std::array remote_cache_download_stage_labels = {"queue_wait", "download"}; +constexpr auto remote_cache_wait_on_downloading_buckets = ExpBuckets{0.0001, 2, 20}; constexpr auto remote_cache_bg_download_stage_buckets = ExpBuckets{0.0001, 2, 20}; } // namespace @@ -97,11 +99,33 @@ TiFlashMetrics::TiFlashMetrics() .Name("tiflash_storage_remote_cache_wait_on_downloading_bytes") .Help("Bytes covered by remote cache bounded wait") .Register(*registry); + // Timeline for one cache miss with possible follower requests: + // + // req A: miss -> create Empty -> enqueue bg task ---- queue_wait ---- download ---- Complete/Failed + // req B: sees Empty -> -------- wait_on_downloading_seconds --------> hit/timeout/failed + // req C: sees Empty -> --- wait_on_downloading_seconds ---> hit/timeout/failed + // + // `tiflash_storage_remote_cache_bg_download_stage_seconds` + // - downloader-task view + // - measures how long the background download itself spent in `queue_wait` and `download` registered_remote_cache_bg_download_stage_seconds_family = &prometheus::BuildHistogram() .Name("tiflash_storage_remote_cache_bg_download_stage_seconds") .Help("Remote cache background download stage duration") .Register(*registry); + // `tiflash_storage_remote_cache_wait_on_downloading_seconds` + // - follower-request view + // - measures how long a request waited on an existing `Empty` segment before ending as hit/timeout/failed + registered_remote_cache_wait_on_downloading_seconds_family + = &prometheus::BuildHistogram() + .Name("tiflash_storage_remote_cache_wait_on_downloading_seconds") + .Help("Bounded wait duration of remote cache 
downloading") + .Register(*registry); + registered_remote_cache_reject_family + = &prometheus::BuildCounter() + .Name("tiflash_storage_remote_cache_reject") + .Help("Remote cache admission rejection by reason and file type") + .Register(*registry); for (size_t file_type_idx = 0; file_type_idx < remote_cache_file_type_labels.size(); ++file_type_idx) { @@ -115,6 +139,19 @@ TiFlashMetrics::TiFlashMetrics() = ®istered_remote_cache_wait_on_downloading_result_family->Add(labels); remote_cache_wait_on_downloading_bytes_metrics[file_type_idx][result_idx] = ®istered_remote_cache_wait_on_downloading_bytes_family->Add(labels); + prometheus::Histogram::BucketBoundaries wait_buckets = ExpBuckets{ + remote_cache_wait_on_downloading_buckets.start, + remote_cache_wait_on_downloading_buckets.base, + remote_cache_wait_on_downloading_buckets.size}; + remote_cache_wait_on_downloading_seconds_metrics[file_type_idx][result_idx] + = ®istered_remote_cache_wait_on_downloading_seconds_family->Add(labels, wait_buckets); + } + for (size_t reason_idx = 0; reason_idx < remote_cache_reject_reason_labels.size(); ++reason_idx) + { + remote_cache_reject_metrics[file_type_idx][reason_idx] + = ®istered_remote_cache_reject_family->Add( + {{"reason", std::string(remote_cache_reject_reason_labels[reason_idx])}, + {"file_type", std::string(remote_cache_file_type_labels[file_type_idx])}}); } for (size_t stage_idx = 0; stage_idx < remote_cache_download_stage_labels.size(); ++stage_idx) { @@ -354,10 +391,25 @@ prometheus::Counter & TiFlashMetrics::getRemoteCacheWaitOnDownloadingBytesCounte return *remote_cache_wait_on_downloading_bytes_metrics[static_cast(file_type)][static_cast(result)]; } +prometheus::Histogram & TiFlashMetrics::getRemoteCacheWaitOnDownloadingSecondsHistogram( + TiFlashMetrics::RemoteCacheFileTypeMetric file_type, + TiFlashMetrics::RemoteCacheWaitResultMetric result) +{ + return *remote_cache_wait_on_downloading_seconds_metrics[static_cast(file_type)] + [static_cast(result)]; +} + 
prometheus::Histogram & TiFlashMetrics::getRemoteCacheBgDownloadStageSecondsHistogram( TiFlashMetrics::RemoteCacheFileTypeMetric file_type, TiFlashMetrics::RemoteCacheDownloadStageMetric stage) { return *remote_cache_bg_download_stage_seconds_metrics[static_cast(file_type)][static_cast(stage)]; } + +prometheus::Counter & TiFlashMetrics::getRemoteCacheRejectCounter( + TiFlashMetrics::RemoteCacheFileTypeMetric file_type, + TiFlashMetrics::RemoteCacheRejectReasonMetric reason) +{ + return *remote_cache_reject_metrics[static_cast(file_type)][static_cast(reason)]; +} } // namespace DB diff --git a/dbms/src/Common/TiFlashMetrics.h b/dbms/src/Common/TiFlashMetrics.h index 33a67dbe24d..e3713f6b0fd 100644 --- a/dbms/src/Common/TiFlashMetrics.h +++ b/dbms/src/Common/TiFlashMetrics.h @@ -931,7 +931,8 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva M(tiflash_storage_remote_cache_status, \ "Remote cache status", \ Gauge, \ - F(type_bg_downloading_count, {{"type", "bg_downloading_count"}})) \ + F(type_bg_downloading_count, {{"type", "bg_downloading_count"}}), \ + F(type_bg_download_queue_count, {{"type", "bg_download_queue_count"}})) \ M(tiflash_system_seconds, \ "system calls duration in seconds", \ Histogram, \ @@ -1352,6 +1353,12 @@ class TiFlashMetrics Count, }; + enum class RemoteCacheRejectReasonMetric : size_t + { + TooManyDownload = 0, + Count, + }; + enum class RemoteCacheDownloadStageMetric : size_t { QueueWait = 0, @@ -1386,9 +1393,15 @@ class TiFlashMetrics prometheus::Counter & getRemoteCacheWaitOnDownloadingBytesCounter( RemoteCacheFileTypeMetric file_type, RemoteCacheWaitResultMetric result); + prometheus::Histogram & getRemoteCacheWaitOnDownloadingSecondsHistogram( + RemoteCacheFileTypeMetric file_type, + RemoteCacheWaitResultMetric result); prometheus::Histogram & getRemoteCacheBgDownloadStageSecondsHistogram( RemoteCacheFileTypeMetric file_type, RemoteCacheDownloadStageMetric stage); + prometheus::Counter & 
getRemoteCacheRejectCounter( + RemoteCacheFileTypeMetric file_type, + RemoteCacheRejectReasonMetric reason); private: TiFlashMetrics(); @@ -1450,11 +1463,21 @@ class TiFlashMetrics std::array(RemoteCacheWaitResultMetric::Count)>, static_cast(RemoteCacheFileTypeMetric::Count)> remote_cache_wait_on_downloading_bytes_metrics{}; + prometheus::Family * registered_remote_cache_wait_on_downloading_seconds_family; + std::array< + std::array(RemoteCacheWaitResultMetric::Count)>, + static_cast(RemoteCacheFileTypeMetric::Count)> + remote_cache_wait_on_downloading_seconds_metrics{}; prometheus::Family * registered_remote_cache_bg_download_stage_seconds_family; std::array< std::array(RemoteCacheDownloadStageMetric::Count)>, static_cast(RemoteCacheFileTypeMetric::Count)> remote_cache_bg_download_stage_seconds_metrics{}; + prometheus::Family * registered_remote_cache_reject_family; + std::array< + std::array(RemoteCacheRejectReasonMetric::Count)>, + static_cast(RemoteCacheFileTypeMetric::Count)> + remote_cache_reject_metrics{}; public: #define MAKE_METRIC_MEMBER_M(family_name, help, type, ...) 
\ diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index b9e0a9abdb8..4b0dc9cb675 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -174,6 +174,11 @@ void observeWaitOnDownloadingMetrics(FileType file_type, WaitResult result, UInt metric_file_type, TiFlashMetrics::RemoteCacheWaitResultMetric::Hit) .Increment(); + metrics + .getRemoteCacheWaitOnDownloadingSecondsHistogram( + metric_file_type, + TiFlashMetrics::RemoteCacheWaitResultMetric::Hit) + .Observe(wait_seconds); metrics .getRemoteCacheWaitOnDownloadingBytesCounter( metric_file_type, @@ -186,6 +191,11 @@ void observeWaitOnDownloadingMetrics(FileType file_type, WaitResult result, UInt metric_file_type, TiFlashMetrics::RemoteCacheWaitResultMetric::Timeout) .Increment(); + metrics + .getRemoteCacheWaitOnDownloadingSecondsHistogram( + metric_file_type, + TiFlashMetrics::RemoteCacheWaitResultMetric::Timeout) + .Observe(wait_seconds); metrics .getRemoteCacheWaitOnDownloadingBytesCounter( metric_file_type, @@ -198,6 +208,11 @@ void observeWaitOnDownloadingMetrics(FileType file_type, WaitResult result, UInt metric_file_type, TiFlashMetrics::RemoteCacheWaitResultMetric::Failed) .Increment(); + metrics + .getRemoteCacheWaitOnDownloadingSecondsHistogram( + metric_file_type, + TiFlashMetrics::RemoteCacheWaitResultMetric::Failed) + .Observe(wait_seconds); metrics .getRemoteCacheWaitOnDownloadingBytesCounter( metric_file_type, @@ -206,7 +221,6 @@ void observeWaitOnDownloadingMetrics(FileType file_type, WaitResult result, UInt break; } - UNUSED(wait_seconds); switch (result) { case WaitResult::Hit: @@ -221,6 +235,23 @@ void observeWaitOnDownloadingMetrics(FileType file_type, WaitResult result, UInt } } +void observeRemoteCacheRejectMetrics(FileType file_type) +{ + TiFlashMetrics::instance() + .getRemoteCacheRejectCounter( + toMetricFileType(file_type), + TiFlashMetrics::RemoteCacheRejectReasonMetric::TooManyDownload) + .Increment(); +} + +void 
updateBgDownloadStatusMetrics(Int64 bg_downloading_count) +{ + GET_METRIC(tiflash_storage_remote_cache_status, type_bg_downloading_count).Set(bg_downloading_count); + const auto running_limit = static_cast(S3FileCachePool::get().getMaxThreads()); + GET_METRIC(tiflash_storage_remote_cache_status, type_bg_download_queue_count) + .Set(std::max(0, bg_downloading_count - running_limit)); +} + void observeBgDownloadStageMetrics(FileType file_type, BgDownloadStage stage, double seconds) { auto & metrics = TiFlashMetrics::instance(); @@ -438,6 +469,7 @@ FileCache::FileCache( , log(Logger::get("FileCache")) { CurrentMetrics::set(CurrentMetrics::DTFileCacheCapacity, cache_capacity); + updateBgDownloadStatusMetrics(0); prepareDir(cache_dir); restore(); } @@ -575,6 +607,7 @@ FileSegmentPtr FileCache::get(const S3::S3FilenameView & s3_fname, const std::op return nullptr; case ShouldCacheRes::RejectTooManyDownloading: GET_METRIC(tiflash_storage_remote_cache, type_dtfile_too_many_download).Increment(); + observeRemoteCacheRejectMetrics(file_type); return nullptr; case ShouldCacheRes::Cache: break; @@ -1354,8 +1387,7 @@ void FileCache::bgDownloadExecutor( bg_download_succ_count.fetch_add(1, std::memory_order_relaxed); } bg_downloading_count.fetch_sub(1, std::memory_order_relaxed); - GET_METRIC(tiflash_storage_remote_cache_status, type_bg_downloading_count) - .Set(bg_downloading_count.load(std::memory_order_relaxed)); + updateBgDownloadStatusMetrics(bg_downloading_count.load(std::memory_order_relaxed)); LOG_DEBUG( log, "downloading count {} => s3_key {} finished", @@ -1366,8 +1398,7 @@ void FileCache::bgDownloadExecutor( void FileCache::bgDownload(const String & s3_key, FileSegmentPtr & file_seg) { bg_downloading_count.fetch_add(1, std::memory_order_relaxed); - GET_METRIC(tiflash_storage_remote_cache_status, type_bg_downloading_count) - .Set(bg_downloading_count.load(std::memory_order_relaxed)); + updateBgDownloadStatusMetrics(bg_downloading_count.load(std::memory_order_relaxed)); 
LOG_DEBUG( log, "downloading count {} => s3_key {} start", diff --git a/dbms/src/Storages/S3/S3ReadLimiter.h b/dbms/src/Storages/S3/S3ReadLimiter.h index cadd34c52f9..3597bd5af6c 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.h +++ b/dbms/src/Storages/S3/S3ReadLimiter.h @@ -39,6 +39,11 @@ class S3ReadLimiter : public std::enable_shared_from_this /// The token is acquired before the request body starts being consumed and released when the /// response stream is destroyed or re-opened. This keeps the stream limiter aligned with the /// actual number of concurrent remote-read streams instead of just the number of requests sent. + /// + /// Important: the token must not be interpreted as a safe upper bound for the number of + /// `S3RandomAccessFile` objects. One reader can hold a response body open while being idle in a + /// pipeline stage, so limiting tokens too aggressively can stall unrelated readers even when + /// there is little ongoing S3 network I/O. class StreamToken { public: @@ -79,6 +84,11 @@ class S3ReadLimiter : public std::enable_shared_from_this /// It limits two dimensions together: /// - concurrently active `GetObject` body streams /// - total remote-read bytes consumed by direct reads and FileCache downloads + /// + /// The stream dimension is best-effort protection against too many live response bodies, not a + /// replacement for byte throttling and not a safe cap on reader object count. In TiFlash a + /// `S3RandomAccessFile` may keep its body stream open across scheduling gaps, so a low stream + /// limit can block forward progress even when the node is no longer transferring many bytes. explicit S3ReadLimiter(UInt64 max_read_bytes_per_sec_ = 0, UInt64 max_streams_ = 0, UInt64 refill_period_ms_ = 100); ~S3ReadLimiter(); @@ -88,6 +98,9 @@ class S3ReadLimiter : public std::enable_shared_from_this /// Acquire a token that must live as long as the `GetObject` body stream remains active. /// Returns `nullptr` when the stream limit is disabled. 
+ /// + /// Callers should use this to bound live response bodies, but should not assume it models only + /// the time spent actively reading bytes from S3. [[nodiscard]] std::unique_ptr acquireStream(); /// Charge remote-read bytes. The call blocks when the current node-level budget is exhausted. From efa152bed73e679a5e14db90a16f174386254ffd Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 20:50:21 +0800 Subject: [PATCH 15/36] disagg: remove S3 stream cap support --- dbms/src/Common/TiFlashMetrics.cpp | 18 ++-- dbms/src/Common/TiFlashMetrics.h | 13 +-- dbms/src/IO/BaseFile/IORateLimitConfig.cpp | 7 +- dbms/src/IO/BaseFile/IORateLimitConfig.h | 3 - dbms/src/IO/BaseFile/RateLimiter.cpp | 7 +- .../IO/BaseFile/tests/gtest_rate_limiter.cpp | 66 ++------------- .../src/Server/tests/gtest_storage_config.cpp | 11 --- dbms/src/Storages/S3/FileCache.cpp | 2 - dbms/src/Storages/S3/FileCache.h | 5 +- dbms/src/Storages/S3/S3RandomAccessFile.cpp | 24 ++---- dbms/src/Storages/S3/S3RandomAccessFile.h | 10 ++- dbms/src/Storages/S3/S3ReadLimiter.cpp | 82 ++----------------- dbms/src/Storages/S3/S3ReadLimiter.h | 73 +++-------------- .../src/Storages/S3/tests/gtest_filecache.cpp | 5 +- dbms/src/Storages/S3/tests/gtest_s3client.cpp | 2 - dbms/src/Storages/S3/tests/gtest_s3file.cpp | 29 ------- 16 files changed, 59 insertions(+), 298 deletions(-) diff --git a/dbms/src/Common/TiFlashMetrics.cpp b/dbms/src/Common/TiFlashMetrics.cpp index cb3f34b262e..cda98983c5e 100644 --- a/dbms/src/Common/TiFlashMetrics.cpp +++ b/dbms/src/Common/TiFlashMetrics.cpp @@ -121,11 +121,10 @@ TiFlashMetrics::TiFlashMetrics() .Name("tiflash_storage_remote_cache_wait_on_downloading_seconds") .Help("Bounded wait duration of remote cache downloading") .Register(*registry); - registered_remote_cache_reject_family - = &prometheus::BuildCounter() - .Name("tiflash_storage_remote_cache_reject") - .Help("Remote cache admission rejection by reason and file type") - .Register(*registry); + 
registered_remote_cache_reject_family = &prometheus::BuildCounter() + .Name("tiflash_storage_remote_cache_reject") + .Help("Remote cache admission rejection by reason and file type") + .Register(*registry); for (size_t file_type_idx = 0; file_type_idx < remote_cache_file_type_labels.size(); ++file_type_idx) { @@ -148,10 +147,9 @@ TiFlashMetrics::TiFlashMetrics() } for (size_t reason_idx = 0; reason_idx < remote_cache_reject_reason_labels.size(); ++reason_idx) { - remote_cache_reject_metrics[file_type_idx][reason_idx] - = ®istered_remote_cache_reject_family->Add( - {{"reason", std::string(remote_cache_reject_reason_labels[reason_idx])}, - {"file_type", std::string(remote_cache_file_type_labels[file_type_idx])}}); + remote_cache_reject_metrics[file_type_idx][reason_idx] = ®istered_remote_cache_reject_family->Add( + {{"reason", std::string(remote_cache_reject_reason_labels[reason_idx])}, + {"file_type", std::string(remote_cache_file_type_labels[file_type_idx])}}); } for (size_t stage_idx = 0; stage_idx < remote_cache_download_stage_labels.size(); ++stage_idx) { @@ -396,7 +394,7 @@ prometheus::Histogram & TiFlashMetrics::getRemoteCacheWaitOnDownloadingSecondsHi TiFlashMetrics::RemoteCacheWaitResultMetric result) { return *remote_cache_wait_on_downloading_seconds_metrics[static_cast(file_type)] - [static_cast(result)]; + [static_cast(result)]; } prometheus::Histogram & TiFlashMetrics::getRemoteCacheBgDownloadStageSecondsHistogram( diff --git a/dbms/src/Common/TiFlashMetrics.h b/dbms/src/Common/TiFlashMetrics.h index e3713f6b0fd..8a00e118e83 100644 --- a/dbms/src/Common/TiFlashMetrics.h +++ b/dbms/src/Common/TiFlashMetrics.h @@ -402,7 +402,6 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_bg_read, {{"type", "bg_read"}}, ExpBuckets{0.001, 2, 20}), \ F(type_fg_write, {{"type", "fg_write"}}, ExpBuckets{0.001, 2, 20}), \ F(type_bg_write, {{"type", "bg_write"}}, ExpBuckets{0.001, 2, 20}), \ - F(type_s3_read_stream, {{"type", 
"s3_read_stream"}}, ExpBuckets{0.001, 2, 20}), \ F(type_s3_read_byte, {{"type", "s3_read_byte"}}, ExpBuckets{0.001, 2, 20})) \ M(tiflash_storage_io_limiter_pending_count, \ "I/O limiter pending count", \ @@ -411,8 +410,7 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_bg_read, {"type", "bg_read"}), \ F(type_fg_write, {"type", "fg_write"}), \ F(type_bg_write, {"type", "bg_write"}), \ - F(type_s3_read_byte, {"type", "s3_read_byte"}), \ - F(type_s3_read_stream, {"type", "s3_read_stream"})) \ + F(type_s3_read_byte, {"type", "s3_read_byte"})) \ M(tiflash_storage_rough_set_filter_rate, \ "Bucketed histogram of rough set filter rate", \ Histogram, \ @@ -813,11 +811,6 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_head_object, {{"type", "head_object"}}, ExpBuckets{0.001, 2, 20}), \ F(type_read_stream, {{"type", "read_stream"}}, ExpBuckets{0.0001, 2, 20}), \ F(type_read_stream_err, {{"type", "read_stream_err"}}, ExpBuckets{0.0001, 2, 20})) \ - M(tiflash_storage_s3_read_limiter_status, \ - "S3 read limiter status", \ - Gauge, \ - F(type_active_get_object_streams, {{"type", "active_get_object_streams"}}), \ - F(type_max_get_object_streams, {{"type", "max_get_object_streams"}})) \ M(tiflash_storage_s3_http_request_seconds, \ "S3 request duration breakdown in seconds", \ Histogram, \ @@ -931,8 +924,8 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva M(tiflash_storage_remote_cache_status, \ "Remote cache status", \ Gauge, \ - F(type_bg_downloading_count, {{"type", "bg_downloading_count"}}), \ - F(type_bg_download_queue_count, {{"type", "bg_download_queue_count"}})) \ + F(type_bg_downloading_count, {{"type", "bg_downloading_count"}}), \ + F(type_bg_download_queue_count, {{"type", "bg_download_queue_count"}})) \ M(tiflash_system_seconds, \ "system calls duration in seconds", \ Histogram, \ diff --git a/dbms/src/IO/BaseFile/IORateLimitConfig.cpp 
b/dbms/src/IO/BaseFile/IORateLimitConfig.cpp index bbbe45d5b21..8a44d4f91d1 100644 --- a/dbms/src/IO/BaseFile/IORateLimitConfig.cpp +++ b/dbms/src/IO/BaseFile/IORateLimitConfig.cpp @@ -53,7 +53,6 @@ void IORateLimitConfig::parse(const String & storage_io_rate_limit, const Logger readConfig(config, "max_read_bytes_per_sec", max_read_bytes_per_sec); readConfig(config, "max_write_bytes_per_sec", max_write_bytes_per_sec); readConfig(config, "s3_max_read_bytes_per_sec", s3_max_read_bytes_per_sec); - readConfig(config, "s3_max_get_object_streams", s3_max_get_object_streams); readConfig(config, "foreground_write_weight", fg_write_weight); readConfig(config, "background_write_weight", bg_write_weight); readConfig(config, "foreground_read_weight", fg_read_weight); @@ -74,7 +73,7 @@ std::string IORateLimitConfig::toString() const { return fmt::format( "IORateLimitConfig{{max_bytes_per_sec={} max_read_bytes_per_sec={} max_write_bytes_per_sec={} " - "s3_max_read_bytes_per_sec={} s3_max_get_object_streams={} " + "s3_max_read_bytes_per_sec={} " "use_max_bytes_per_sec={} " "fg_write_weight={} bg_write_weight={} fg_read_weight={} bg_read_weight={} " "fg_write_max_bytes_per_sec={} bg_write_max_bytes_per_sec={} " @@ -84,7 +83,6 @@ std::string IORateLimitConfig::toString() const max_read_bytes_per_sec, max_write_bytes_per_sec, s3_max_read_bytes_per_sec, - s3_max_get_object_streams, use_max_bytes_per_sec, fg_write_weight, bg_write_weight, @@ -171,8 +169,7 @@ bool IORateLimitConfig::operator==(const IORateLimitConfig & config) const { return config.max_bytes_per_sec == max_bytes_per_sec && config.max_read_bytes_per_sec == max_read_bytes_per_sec && config.max_write_bytes_per_sec == max_write_bytes_per_sec - && config.s3_max_read_bytes_per_sec == s3_max_read_bytes_per_sec - && config.s3_max_get_object_streams == s3_max_get_object_streams // + && config.s3_max_read_bytes_per_sec == s3_max_read_bytes_per_sec // && config.bg_write_weight == bg_write_weight && config.fg_write_weight == 
fg_write_weight && config.bg_read_weight == bg_read_weight && config.fg_read_weight == fg_read_weight && config.emergency_pct == emergency_pct && config.high_pct == high_pct && config.medium_pct == medium_pct diff --git a/dbms/src/IO/BaseFile/IORateLimitConfig.h b/dbms/src/IO/BaseFile/IORateLimitConfig.h index 6a75a3bf00c..68652622017 100644 --- a/dbms/src/IO/BaseFile/IORateLimitConfig.h +++ b/dbms/src/IO/BaseFile/IORateLimitConfig.h @@ -30,8 +30,6 @@ struct IORateLimitConfig UInt64 max_write_bytes_per_sec; // Node-level byte budget shared by all S3 direct reads and FileCache downloads. `0` disables byte throttling. UInt64 s3_max_read_bytes_per_sec; - // Node-level cap for concurrently active `GetObject` response bodies. `0` disables stream throttling. - UInt64 s3_max_get_object_streams; // only true when both max_read_bytes_per_sec and max_write_bytes_per_sec are 0 bool use_max_bytes_per_sec; @@ -59,7 +57,6 @@ struct IORateLimitConfig , max_read_bytes_per_sec(0) , max_write_bytes_per_sec(0) , s3_max_read_bytes_per_sec(0) - , s3_max_get_object_streams(0) , use_max_bytes_per_sec(true) // only limit background write by default , fg_write_weight(0) diff --git a/dbms/src/IO/BaseFile/RateLimiter.cpp b/dbms/src/IO/BaseFile/RateLimiter.cpp index f992de746eb..e1547450df8 100644 --- a/dbms/src/IO/BaseFile/RateLimiter.cpp +++ b/dbms/src/IO/BaseFile/RateLimiter.cpp @@ -527,18 +527,17 @@ void IORateLimiter::updateLimiterByConfig(const IORateLimitConfig & cfg) updateWriteLimiter(cfg.getBgWriteMaxBytesPerSec(), cfg.getFgWriteMaxBytesPerSec()); // updateS3ReadLimiter - if (cfg.s3_max_read_bytes_per_sec == 0 && cfg.s3_max_get_object_streams == 0) + if (cfg.s3_max_read_bytes_per_sec == 0) { s3_read_limiter = nullptr; } else if (s3_read_limiter == nullptr) { - s3_read_limiter - = std::make_shared(cfg.s3_max_read_bytes_per_sec, cfg.s3_max_get_object_streams); + s3_read_limiter = std::make_shared(cfg.s3_max_read_bytes_per_sec); } else { - 
s3_read_limiter->updateConfig(cfg.s3_max_read_bytes_per_sec, cfg.s3_max_get_object_streams); + s3_read_limiter->updateConfig(cfg.s3_max_read_bytes_per_sec); } } diff --git a/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp b/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp index 63761eb33fd..8d91acfe979 100644 --- a/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp +++ b/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp @@ -376,28 +376,9 @@ TEST(ReadLimiterTest, ReadMany) ASSERT_EQ(read_limiter.alloc_bytes, 100); } -TEST(S3ReadLimiterTest, StreamTokenBlocksUntilRelease) -{ - auto limiter = std::make_shared(0, 1); - auto token1 = limiter->acquireStream(); - ASSERT_NE(token1, nullptr); - ASSERT_EQ(limiter->activeStreams(), 1); - - // The second reader should block until the first stream token is released. - auto future = std::async(std::launch::async, [&]() { return limiter->acquireStream(); }); - ASSERT_EQ(future.wait_for(50ms), std::future_status::timeout); - - token1.reset(); - auto token2 = future.get(); - ASSERT_NE(token2, nullptr); - ASSERT_EQ(limiter->activeStreams(), 1); - token2.reset(); - ASSERT_EQ(limiter->activeStreams(), 0); -} - TEST(S3ReadLimiterTest, ByteRequestsWaitForRefill) { - S3::S3ReadLimiter limiter(1000, 0, 100); + S3::S3ReadLimiter limiter(1000, 100); // Consume the initial 100-byte burst, then verify the next request waits for at least one refill period. limiter.requestBytes(100, S3::S3ReadSource::DirectRead); AtomicStopwatch watch; @@ -405,27 +386,9 @@ TEST(S3ReadLimiterTest, ByteRequestsWaitForRefill) ASSERT_GE(watch.elapsedMilliseconds(), 80); } -TEST(S3ReadLimiterTest, UpdateConfigDisablesWaitingStream) -{ - auto limiter = std::make_shared(0, 1); - auto token1 = limiter->acquireStream(); - ASSERT_NE(token1, nullptr); - - // Once config reload disables stream limiting, a waiter should wake up and observe `nullptr` instead of hanging. 
- auto future = std::async(std::launch::async, [&]() { return limiter->acquireStream(); }); - ASSERT_EQ(future.wait_for(50ms), std::future_status::timeout); - - limiter->updateConfig(/*max_read_bytes_per_sec*/ 0, /*max_streams*/ 0); - auto token2 = future.get(); - ASSERT_EQ(token2, nullptr); - - token1.reset(); - ASSERT_EQ(limiter->activeStreams(), 0); -} - TEST(S3ReadLimiterTest, UpdateConfigDisablesWaitingBytes) { - S3::S3ReadLimiter limiter(1000, 0, 100); + S3::S3ReadLimiter limiter(1000, 100); // Exhaust the initial burst, then make sure disabling the byte limit wakes a waiting requester promptly. limiter.requestBytes(100, S3::S3ReadSource::DirectRead); @@ -436,40 +399,23 @@ TEST(S3ReadLimiterTest, UpdateConfigDisablesWaitingBytes) }); ASSERT_EQ(future.wait_for(50ms), std::future_status::timeout); - limiter.updateConfig(/*max_read_bytes_per_sec*/ 0, /*max_streams*/ 0); + limiter.updateConfig(/*max_read_bytes_per_sec*/ 0); ASSERT_LT(future.get(), 100); } TEST(S3ReadLimiterTest, SuggestedChunkSizeTracksBurstLimit) { // The suggested chunk size should never exceed one refill-period burst when byte limiting is enabled. 
- S3::S3ReadLimiter limiter(/*max_read_bytes_per_sec*/ 1000, /*max_streams*/ 0, /*refill_period_ms*/ 100); + S3::S3ReadLimiter limiter(/*max_read_bytes_per_sec*/ 1000, /*refill_period_ms*/ 100); ASSERT_EQ(limiter.getSuggestedChunkSize(128 * 1024), 100); - limiter.updateConfig(/*max_read_bytes_per_sec*/ 5000, /*max_streams*/ 0); + limiter.updateConfig(/*max_read_bytes_per_sec*/ 5000); ASSERT_EQ(limiter.getSuggestedChunkSize(128 * 1024), 500); - limiter.updateConfig(/*max_read_bytes_per_sec*/ 0, /*max_streams*/ 0); + limiter.updateConfig(/*max_read_bytes_per_sec*/ 0); ASSERT_EQ(limiter.getSuggestedChunkSize(4096), 4096); } -TEST(S3ReadLimiterTest, StreamTokenMoveDoesNotDoubleRelease) -{ - auto limiter = std::make_shared(0, 1); - auto token = limiter->acquireStream(); - ASSERT_NE(token, nullptr); - ASSERT_EQ(limiter->activeStreams(), 1); - - // Moving the token transfers ownership without releasing the stream slot. - auto moved = std::move(token); - ASSERT_EQ(token, nullptr); - ASSERT_EQ(limiter->activeStreams(), 1); - - // Releasing the moved token should drop the slot exactly once. 
- moved.reset(); - ASSERT_EQ(limiter->activeStreams(), 0); -} - #ifdef __linux__ TEST(IORateLimiterTest, IOStat) { diff --git a/dbms/src/Server/tests/gtest_storage_config.cpp b/dbms/src/Server/tests/gtest_storage_config.cpp index fefc45e3a31..6459fac75e6 100644 --- a/dbms/src/Server/tests/gtest_storage_config.cpp +++ b/dbms/src/Server/tests/gtest_storage_config.cpp @@ -596,7 +596,6 @@ max_bytes_per_sec=0 max_read_bytes_per_sec=0 max_write_bytes_per_sec=0 s3_max_read_bytes_per_sec=0 -s3_max_get_object_streams=0 foreground_write_weight=1 background_write_weight=2 foreground_read_weight=5 @@ -609,7 +608,6 @@ max_bytes_per_sec=1024000 max_read_bytes_per_sec=0 max_write_bytes_per_sec=0 s3_max_read_bytes_per_sec=2048000 -s3_max_get_object_streams=256 foreground_write_weight=1 background_write_weight=2 foreground_read_weight=5 @@ -622,7 +620,6 @@ max_bytes_per_sec=0 max_read_bytes_per_sec=1024000 max_write_bytes_per_sec=1024000 s3_max_read_bytes_per_sec=1024 -s3_max_get_object_streams=8 foreground_write_weight=1 background_write_weight=2 foreground_read_weight=5 @@ -635,7 +632,6 @@ max_bytes_per_sec=1024000 max_read_bytes_per_sec=1024000 max_write_bytes_per_sec=1024000 s3_max_read_bytes_per_sec=4096 -s3_max_get_object_streams=16 foreground_write_weight=1 background_write_weight=2 foreground_read_weight=5 @@ -647,7 +643,6 @@ background_read_weight=2 [storage.io_rate_limit] max_bytes_per_sec=1024000 s3_max_read_bytes_per_sec=8192 - s3_max_get_object_streams=32 foreground_write_weight=80 background_write_weight=20 foreground_read_weight=0 @@ -662,7 +657,6 @@ background_read_weight=2 ASSERT_EQ(io_config.max_read_bytes_per_sec, 0); ASSERT_EQ(io_config.max_write_bytes_per_sec, 0); ASSERT_EQ(io_config.s3_max_read_bytes_per_sec, 0); - ASSERT_EQ(io_config.s3_max_get_object_streams, 0); ASSERT_TRUE(io_config.use_max_bytes_per_sec); ASSERT_EQ(io_config.fg_write_weight, 0); ASSERT_EQ(io_config.bg_write_weight, 100); @@ -682,7 +676,6 @@ background_read_weight=2 
ASSERT_EQ(io_config.max_read_bytes_per_sec, 0); ASSERT_EQ(io_config.max_write_bytes_per_sec, 0); ASSERT_EQ(io_config.s3_max_read_bytes_per_sec, 0); - ASSERT_EQ(io_config.s3_max_get_object_streams, 0); ASSERT_TRUE(io_config.use_max_bytes_per_sec); ASSERT_EQ(io_config.fg_write_weight, 1); ASSERT_EQ(io_config.bg_write_weight, 2); @@ -702,7 +695,6 @@ background_read_weight=2 ASSERT_EQ(io_config.max_read_bytes_per_sec, 0); ASSERT_EQ(io_config.max_write_bytes_per_sec, 0); ASSERT_EQ(io_config.s3_max_read_bytes_per_sec, 2048000); - ASSERT_EQ(io_config.s3_max_get_object_streams, 256); ASSERT_TRUE(io_config.use_max_bytes_per_sec); ASSERT_EQ(io_config.fg_write_weight, 1); ASSERT_EQ(io_config.bg_write_weight, 2); @@ -722,7 +714,6 @@ background_read_weight=2 ASSERT_EQ(io_config.max_read_bytes_per_sec, 1024000); ASSERT_EQ(io_config.max_write_bytes_per_sec, 1024000); ASSERT_EQ(io_config.s3_max_read_bytes_per_sec, 1024); - ASSERT_EQ(io_config.s3_max_get_object_streams, 8); ASSERT_FALSE(io_config.use_max_bytes_per_sec); // use max_read_bytes_per_sec and max_write_bytes_per_sec ASSERT_EQ(io_config.fg_write_weight, 1); ASSERT_EQ(io_config.bg_write_weight, 2); @@ -744,7 +735,6 @@ background_read_weight=2 ASSERT_EQ(io_config.max_read_bytes_per_sec, 1024000); ASSERT_EQ(io_config.max_write_bytes_per_sec, 1024000); ASSERT_EQ(io_config.s3_max_read_bytes_per_sec, 4096); - ASSERT_EQ(io_config.s3_max_get_object_streams, 16); ASSERT_FALSE(io_config.use_max_bytes_per_sec); // use max_read_bytes_per_sec and max_write_bytes_per_sec ASSERT_EQ(io_config.fg_write_weight, 1); ASSERT_EQ(io_config.bg_write_weight, 2); @@ -766,7 +756,6 @@ background_read_weight=2 ASSERT_EQ(io_config.max_read_bytes_per_sec, 0); ASSERT_EQ(io_config.max_write_bytes_per_sec, 0); ASSERT_EQ(io_config.s3_max_read_bytes_per_sec, 8192); - ASSERT_EQ(io_config.s3_max_get_object_streams, 32); ASSERT_TRUE(io_config.use_max_bytes_per_sec); ASSERT_EQ(io_config.fg_write_weight, 80); ASSERT_EQ(io_config.bg_write_weight, 20); diff --git 
a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 4b0dc9cb675..85472d76cbc 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -1292,8 +1292,6 @@ void FileCache::downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, c Aws::S3::Model::GetObjectRequest req; client->setBucketAndKeyWithRoot(req, s3_key); ProfileEvents::increment(ProfileEvents::S3GetObject); - // Limit live background-download streams with the same token used by direct readers. - auto stream_token = s3_read_limiter != nullptr ? s3_read_limiter->acquireStream() : nullptr; auto outcome = client->GetObject(req); if (!outcome.IsSuccess()) { diff --git a/dbms/src/Storages/S3/FileCache.h b/dbms/src/Storages/S3/FileCache.h index 31238a3f159..700d36167fc 100644 --- a/dbms/src/Storages/S3/FileCache.h +++ b/dbms/src/Storages/S3/FileCache.h @@ -150,7 +150,10 @@ class FileSegment } private: - Status waitForNotEmptyImpl(std::optional timeout, bool log_progress, bool throw_on_timeout); + Status waitForNotEmptyImpl( + std::optional timeout, + bool log_progress, + bool throw_on_timeout); mutable std::mutex mtx; const String local_fname; diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.cpp b/dbms/src/Storages/S3/S3RandomAccessFile.cpp index 49a50514de4..604c2a672ba 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.cpp +++ b/dbms/src/Storages/S3/S3RandomAccessFile.cpp @@ -86,7 +86,6 @@ S3RandomAccessFile::S3RandomAccessFile( S3RandomAccessFile::~S3RandomAccessFile() { - resetReadStreamToken(); CurrentMetrics::sub(CurrentMetrics::S3RandomAccessFile); } @@ -165,7 +164,11 @@ ssize_t S3RandomAccessFile::readChunked(char * buf, size_t size) return finalizeRead(size, total_gcount, sw, istr); } -ssize_t S3RandomAccessFile::finalizeRead(size_t requested_size, size_t actual_size, const Stopwatch & sw, std::istream & istr) +ssize_t S3RandomAccessFile::finalizeRead( + size_t requested_size, + size_t actual_size, + const Stopwatch & sw, + 
std::istream & istr) { // Keep the post-read handling shared so limiter and non-limiter paths emit identical retries, logging and // observability signals. @@ -176,7 +179,8 @@ ssize_t S3RandomAccessFile::finalizeRead(size_t requested_size, size_t actual_si // Theoretically, `istr.eof()` is equivalent to `cur_offset + actual_size != static_cast(content_length)`. // It's just a double check for more safety. - if (actual_size < requested_size && (!istr.eof() || cur_offset + actual_size != static_cast(content_length))) + if (actual_size < requested_size + && (!istr.eof() || cur_offset + actual_size != static_cast(content_length))) { ProfileEvents::increment(ProfileEvents::S3IOReadError); auto state = istr.rdstate(); @@ -253,8 +257,7 @@ off_t S3RandomAccessFile::seekImpl(off_t offset_, int whence) if (offset_ < cur_offset) { ProfileEvents::increment(ProfileEvents::S3IOSeekBackward, 1); - // The current body stream is forward-only. Re-open from the target offset and release the old stream slot first. - resetReadStreamToken(); + // The current body stream is forward-only. Re-open from the target offset. cur_offset = offset_; cur_retry = 0; initialize("seek backward"); @@ -365,9 +368,6 @@ void S3RandomAccessFile::initialize(std::string_view action) { while (cur_retry < max_retry) { - // Hold the token for the whole body lifetime so the stream cap reflects live `GetObject` responses, - // including callers that read slowly or perform forward seeks. - auto next_stream_token = read_limiter != nullptr ? 
read_limiter->acquireStream() : nullptr; Stopwatch sw_get_object; SCOPE_EXIT({ auto elapsed_secs = sw_get_object.elapsedSeconds(); @@ -398,7 +398,6 @@ void S3RandomAccessFile::initialize(std::string_view action) }); if (!outcome.IsSuccess()) { - next_stream_token.reset(); Int64 delay_ms = details::calculateDelayForNextRetry(cur_retry); cur_retry += 1; auto el = sw_get_object.elapsedSeconds(); @@ -424,7 +423,6 @@ void S3RandomAccessFile::initialize(std::string_view action) } read_result = outcome.GetResultWithOwnership(); RUNTIME_CHECK(read_result.GetBody(), remote_fname, strerror(errno)); - read_stream_token = std::move(next_stream_token); return; // init successfully } // exceed max retry times @@ -435,12 +433,6 @@ void S3RandomAccessFile::initialize(std::string_view action) remote_fname); } -void S3RandomAccessFile::resetReadStreamToken() -{ - if (read_stream_token != nullptr) - read_stream_token.reset(); -} - inline static RandomAccessFilePtr tryOpenCachedFile(const String & remote_fname, std::optional filesize) { try diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.h b/dbms/src/Storages/S3/S3RandomAccessFile.h index 68ef80bac95..51ccd323412 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.h +++ b/dbms/src/Storages/S3/S3RandomAccessFile.h @@ -24,7 +24,6 @@ #include #include - #include /// Remove the population of thread_local from Poco @@ -100,9 +99,13 @@ class S3RandomAccessFile final : public RandomAccessFile String readRangeOfObject(); ssize_t readChunked(char * buf, size_t size); ssize_t finalizeRead(size_t requested_size, size_t actual_size, const Stopwatch & sw, std::istream & istr); - off_t finalizeSeek(off_t target_offset, size_t requested_size, size_t actual_size, const Stopwatch & sw, std::istream & istr); + off_t finalizeSeek( + off_t target_offset, + size_t requested_size, + size_t actual_size, + const Stopwatch & sw, + std::istream & istr); off_t seekChunked(off_t offset); - void resetReadStreamToken(); // When reading, it is necessary to 
pass the extra information of file, such file size, to S3RandomAccessFile::create. // It is troublesome to pass parameters layer by layer. So currently, use thread_local global variable to pass parameters. @@ -116,7 +119,6 @@ class S3RandomAccessFile final : public RandomAccessFile Aws::S3::Model::GetObjectResult read_result; Int64 content_length = 0; std::shared_ptr read_limiter; - std::unique_ptr read_stream_token; DB::LoggerPtr log; bool is_close = false; diff --git a/dbms/src/Storages/S3/S3ReadLimiter.cpp b/dbms/src/Storages/S3/S3ReadLimiter.cpp index b10c6935b58..a67700d8d63 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.cpp +++ b/dbms/src/Storages/S3/S3ReadLimiter.cpp @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include #include #include @@ -34,40 +33,23 @@ void recordWaitIfNeeded(bool waited, const Stopwatch & sw, F && observe) } } // namespace -S3ReadLimiter::StreamToken::~StreamToken() -{ - reset(); -} - -void S3ReadLimiter::StreamToken::reset() -{ - if (owner == nullptr) - return; - owner->releaseStream(); - owner = nullptr; -} - -S3ReadLimiter::S3ReadLimiter(UInt64 max_read_bytes_per_sec_, UInt64 max_streams_, UInt64 refill_period_ms_) +DB::S3::S3ReadLimiter::S3ReadLimiter(UInt64 max_read_bytes_per_sec_, UInt64 refill_period_ms_) : refill_period_ms(refill_period_ms_) , max_read_bytes_per_sec(max_read_bytes_per_sec_) - , max_streams(max_streams_) - , active_streams(0) , available_bytes(static_cast(burstBytesPerPeriod(max_read_bytes_per_sec_))) , last_refill_time(Clock::now()) , stop(false) , log(Logger::get("S3ReadLimiter")) { GET_METRIC(tiflash_storage_io_limiter_curr, type_s3_read_bytes).Set(max_read_bytes_per_sec_); - GET_METRIC(tiflash_storage_s3_read_limiter_status, type_max_get_object_streams).Set(max_streams_); - GET_METRIC(tiflash_storage_s3_read_limiter_status, type_active_get_object_streams).Set(0); } -S3ReadLimiter::~S3ReadLimiter() 
+DB::S3::S3ReadLimiter::~S3ReadLimiter() { setStop(); } -void S3ReadLimiter::updateConfig(UInt64 max_read_bytes_per_sec_, UInt64 max_streams_) +void DB::S3::S3ReadLimiter::updateConfig(UInt64 max_read_bytes_per_sec_) { { std::lock_guard lock(bytes_mutex); @@ -77,50 +59,11 @@ void S3ReadLimiter::updateConfig(UInt64 max_read_bytes_per_sec_, UInt64 max_stre available_bytes = 0; last_refill_time = Clock::now(); } - { - std::lock_guard lock(stream_mutex); - max_streams.store(max_streams_, std::memory_order_relaxed); - } GET_METRIC(tiflash_storage_io_limiter_curr, type_s3_read_bytes).Set(max_read_bytes_per_sec_); - GET_METRIC(tiflash_storage_s3_read_limiter_status, type_max_get_object_streams).Set(max_streams_); bytes_cv.notify_all(); - stream_cv.notify_all(); } -std::unique_ptr S3ReadLimiter::acquireStream() -{ - const auto limit = max_streams.load(std::memory_order_relaxed); - if (limit == 0) - return nullptr; - - Stopwatch sw; - bool waited = false; - std::unique_lock lock(stream_mutex); - // A token is held for the whole lifetime of one `GetObject` body, not just the initial request. 
- while (!stop && max_streams.load(std::memory_order_relaxed) != 0 - && active_streams.load(std::memory_order_relaxed) >= max_streams.load(std::memory_order_relaxed)) - { - if (!waited) - { - GET_METRIC(tiflash_storage_io_limiter_pending_count, type_s3_read_stream).Increment(); - waited = true; - } - stream_cv.wait(lock); - } - - recordWaitIfNeeded(waited, sw, [](double seconds) { - GET_METRIC(tiflash_storage_io_limiter_pending_seconds, type_s3_read_stream).Observe(seconds); - }); - - if (stop || max_streams.load(std::memory_order_relaxed) == 0) - return nullptr; - - auto cur = active_streams.fetch_add(1, std::memory_order_relaxed) + 1; - GET_METRIC(tiflash_storage_s3_read_limiter_status, type_active_get_object_streams).Set(cur); - return std::make_unique(this); -} - -void S3ReadLimiter::requestBytes(UInt64 bytes, S3ReadSource source) +void DB::S3::S3ReadLimiter::requestBytes(UInt64 bytes, S3ReadSource source) { if (bytes == 0) return; @@ -177,7 +120,7 @@ void S3ReadLimiter::requestBytes(UInt64 bytes, S3ReadSource source) } } -UInt64 S3ReadLimiter::getSuggestedChunkSize(UInt64 preferred_chunk_size) const +UInt64 DB::S3::S3ReadLimiter::getSuggestedChunkSize(UInt64 preferred_chunk_size) const { const auto limit = max_read_bytes_per_sec.load(std::memory_order_relaxed); if (limit == 0) @@ -185,27 +128,18 @@ UInt64 S3ReadLimiter::getSuggestedChunkSize(UInt64 preferred_chunk_size) const return std::max(1, std::min(preferred_chunk_size, burstBytesPerPeriod(limit))); } -void S3ReadLimiter::setStop() +void DB::S3::S3ReadLimiter::setStop() { { - std::lock_guard lock_stream(stream_mutex); std::lock_guard lock_bytes(bytes_mutex); if (stop) return; stop = true; } - stream_cv.notify_all(); bytes_cv.notify_all(); } -void S3ReadLimiter::releaseStream() -{ - auto cur = active_streams.fetch_sub(1, std::memory_order_relaxed) - 1; - GET_METRIC(tiflash_storage_s3_read_limiter_status, type_active_get_object_streams).Set(cur); - stream_cv.notify_one(); -} - -void 
S3ReadLimiter::refillBytesLocked(Clock::time_point now) +void DB::S3::S3ReadLimiter::refillBytesLocked(Clock::time_point now) { const auto current_limit = max_read_bytes_per_sec.load(std::memory_order_relaxed); if (current_limit == 0) @@ -227,7 +161,7 @@ void S3ReadLimiter::refillBytesLocked(Clock::time_point now) last_refill_time = now; } -UInt64 S3ReadLimiter::burstBytesPerPeriod(UInt64 max_read_bytes_per_sec_) const +UInt64 DB::S3::S3ReadLimiter::burstBytesPerPeriod(UInt64 max_read_bytes_per_sec_) const { if (max_read_bytes_per_sec_ == 0) return 0; diff --git a/dbms/src/Storages/S3/S3ReadLimiter.h b/dbms/src/Storages/S3/S3ReadLimiter.h index 3597bd5af6c..995fa8efe31 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.h +++ b/dbms/src/Storages/S3/S3ReadLimiter.h @@ -20,7 +20,6 @@ #include #include #include -#include #include namespace DB::S3 @@ -31,77 +30,34 @@ enum class S3ReadSource FileCacheDownload, }; -class S3ReadLimiter : public std::enable_shared_from_this +class S3ReadLimiter { public: - /// RAII handle for one live `GetObject` body stream. + /// Stream-based limiting looks attractive because a token could track one live `GetObject` + /// body stream. /// - /// The token is acquired before the request body starts being consumed and released when the - /// response stream is destroyed or re-opened. This keeps the stream limiter aligned with the - /// actual number of concurrent remote-read streams instead of just the number of requests sent. - /// - /// Important: the token must not be interpreted as a safe upper bound for the number of + /// Important: such a token must not be interpreted as a safe upper bound for the number of /// `S3RandomAccessFile` objects. One reader can hold a response body open while being idle in a /// pipeline stage, so limiting tokens too aggressively can stall unrelated readers even when /// there is little ongoing S3 network I/O. 
- class StreamToken - { - public: - explicit StreamToken(S3ReadLimiter * owner_) - : owner(owner_) - {} - - ~StreamToken(); - - StreamToken(const StreamToken &) = delete; - StreamToken & operator=(const StreamToken &) = delete; - - StreamToken(StreamToken && other) noexcept - : owner(other.owner) - { - other.owner = nullptr; - } - - StreamToken & operator=(StreamToken && other) noexcept - { - if (this == &other) - return *this; - reset(); - owner = other.owner; - other.owner = nullptr; - return *this; - } - - /// Releases one active stream slot early. Destruction does the same automatically. - void reset(); - - private: - S3ReadLimiter * owner; - }; + /// + /// Stream-based limiting is therefore removed for now. Keep this note here so future changes do + /// not accidentally re-introduce the same unsafe hard cap on `S3RandomAccessFile` concurrency. /// A lightweight node-level limiter for S3 remote reads. /// - /// It limits two dimensions together: - /// - concurrently active `GetObject` body streams + /// It currently limits one dimension: /// - total remote-read bytes consumed by direct reads and FileCache downloads /// /// The stream dimension is best-effort protection against too many live response bodies, not a /// replacement for byte throttling and not a safe cap on reader object count. In TiFlash a /// `S3RandomAccessFile` may keep its body stream open across scheduling gaps, so a low stream /// limit can block forward progress even when the node is no longer transferring many bytes. - explicit S3ReadLimiter(UInt64 max_read_bytes_per_sec_ = 0, UInt64 max_streams_ = 0, UInt64 refill_period_ms_ = 100); + explicit S3ReadLimiter(UInt64 max_read_bytes_per_sec_ = 0, UInt64 refill_period_ms_ = 100); ~S3ReadLimiter(); - /// Update both byte-rate and stream limits. `0` disables the corresponding limit. 
- void updateConfig(UInt64 max_read_bytes_per_sec_, UInt64 max_streams_); - - /// Acquire a token that must live as long as the `GetObject` body stream remains active. - /// Returns `nullptr` when the stream limit is disabled. - /// - /// Callers should use this to bound live response bodies, but should not assume it models only - /// the time spent actively reading bytes from S3. - [[nodiscard]] std::unique_ptr acquireStream(); + void updateConfig(UInt64 max_read_bytes_per_sec_); /// Charge remote-read bytes. The call blocks when the current node-level budget is exhausted. void requestBytes(UInt64 bytes, S3ReadSource source); @@ -110,16 +66,12 @@ class S3ReadLimiter : public std::enable_shared_from_this UInt64 getSuggestedChunkSize(UInt64 preferred_chunk_size) const; UInt64 maxReadBytesPerSec() const { return max_read_bytes_per_sec.load(std::memory_order_relaxed); } - UInt64 maxStreams() const { return max_streams.load(std::memory_order_relaxed); } - UInt64 activeStreams() const { return active_streams.load(std::memory_order_relaxed); } void setStop(); private: using Clock = std::chrono::steady_clock; - /// Return one `GetObject` stream slot back to the limiter and wake one waiter. - void releaseStream(); /// Refill the token bucket according to elapsed wall time. Caller must hold `bytes_mutex`. void refillBytesLocked(Clock::time_point now); /// Limit the instantaneous burst so long reads are naturally split into small limiter-aware chunks. 
@@ -127,11 +79,6 @@ class S3ReadLimiter : public std::enable_shared_from_this const UInt64 refill_period_ms; std::atomic max_read_bytes_per_sec; - std::atomic max_streams; - std::atomic active_streams; - - mutable std::mutex stream_mutex; - std::condition_variable stream_cv; mutable std::mutex bytes_mutex; std::condition_variable bytes_cv; diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index 80235d159c5..1c82e60b6b9 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include -#include #include #include #include @@ -1278,16 +1278,13 @@ TEST_F(FileCacheTest, BgDownloadRespectsS3StreamLimiter) ASSERT_EQ(file_cache.get(S3FilenameView::fromKey(objects[0].key), objects[0].size), nullptr); sp_download.waitAndPause(); - ASSERT_EQ(limiter->activeStreams(), 1); ASSERT_EQ(file_cache.get(S3FilenameView::fromKey(objects[1].key), objects[1].size), nullptr); std::this_thread::sleep_for(50ms); - ASSERT_EQ(limiter->activeStreams(), 1); sp_download.next(); sp_download.disable(); waitForBgDownload(file_cache); - ASSERT_EQ(limiter->activeStreams(), 0); } TEST_F(FileCacheTest, GetWaitOnDownloadingSupportsColDataAndOther) diff --git a/dbms/src/Storages/S3/tests/gtest_s3client.cpp b/dbms/src/Storages/S3/tests/gtest_s3client.cpp index 66bbad412ba..99c40f168a9 100644 --- a/dbms/src/Storages/S3/tests/gtest_s3client.cpp +++ b/dbms/src/Storages/S3/tests/gtest_s3client.cpp @@ -263,7 +263,6 @@ TEST_F(S3ClientTest, PublishS3ReadLimiter) IORateLimiter io_rate_limiter; IORateLimitConfig cfg; cfg.s3_max_read_bytes_per_sec = 8192; - cfg.s3_max_get_object_streams = 9; io_rate_limiter.updateLimiterByConfig(cfg); auto published = io_rate_limiter.getS3ReadLimiter(); @@ -271,7 +270,6 @@ TEST_F(S3ClientTest, PublishS3ReadLimiter) 
ClientFactory::instance().setS3ReadLimiter(published); ASSERT_EQ(ClientFactory::instance().sharedTiFlashClient()->getS3ReadLimiter(), published); ASSERT_EQ(published->maxReadBytesPerSec(), 8192); - ASSERT_EQ(published->maxStreams(), 9); } TEST_F(S3ClientTest, ListPrefixEarlyStopOnTruncatedResult) diff --git a/dbms/src/Storages/S3/tests/gtest_s3file.cpp b/dbms/src/Storages/S3/tests/gtest_s3file.cpp index 2e9ad7b127d..6ba148d0a99 100644 --- a/dbms/src/Storages/S3/tests/gtest_s3file.cpp +++ b/dbms/src/Storages/S3/tests/gtest_s3file.cpp @@ -355,35 +355,6 @@ try } CATCH -TEST_P(S3FileTest, StreamLimiterBlocksSecondDirectReader) -try -{ - const String key1 = "/a/b/c/stream_limit_1"; - const String key2 = "/a/b/c/stream_limit_2"; - writeFile(key1, 4096, WriteSettings{}); - writeFile(key2, 4096, WriteSettings{}); - - auto limiter = std::make_shared(0, 1); - s3_client->setS3ReadLimiter(limiter); - SCOPE_EXIT({ s3_client->setS3ReadLimiter(nullptr); }); - - auto file1 = std::make_shared(s3_client, key1, nullptr); - ASSERT_EQ(limiter->activeStreams(), 1); - - auto future = std::async(std::launch::async, [&]() { - return std::make_shared(s3_client, key2, nullptr); - }); - ASSERT_EQ(future.wait_for(50ms), std::future_status::timeout); - - file1.reset(); - auto file2 = future.get(); - ASSERT_NE(file2, nullptr); - ASSERT_EQ(limiter->activeStreams(), 1); - file2.reset(); - ASSERT_EQ(limiter->activeStreams(), 0); -} -CATCH - TEST_P(S3FileTest, WriteRead) try { From 9a91c34f99ad598ff460b8d5fcfcc62609ab84a6 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 21:08:56 +0800 Subject: [PATCH 16/36] docs: add English disagg S3 backpressure design --- ...-level-backpressure-and-filecache-dedup.md | 661 ++++++++++++++++++ 1 file changed, 661 insertions(+) create mode 100644 docs/design/2026-03-24-disagg-s3-node-level-backpressure-and-filecache-dedup.md diff --git a/docs/design/2026-03-24-disagg-s3-node-level-backpressure-and-filecache-dedup.md 
b/docs/design/2026-03-24-disagg-s3-node-level-backpressure-and-filecache-dedup.md new file mode 100644 index 00000000000..00c1614be81 --- /dev/null +++ b/docs/design/2026-03-24-disagg-s3-node-level-backpressure-and-filecache-dedup.md @@ -0,0 +1,661 @@ +# TiFlash Disaggregated Read Path: Node-Level S3 Backpressure and FileCache Same-Key Deduplication + +Purpose: define a production-oriented design with deliberately constrained implementation scope to solve uncontrolled S3 traffic on compute nodes in disaggregated read workloads caused by cold reads and amplified cache misses. The target risk scenario is: a large portion of one table is cold on the current node and therefore absent from local `FileCache`; the system receives concurrent queries in a short time window; multiple queries scan roughly the same set of files on TiFlash at about the same time; S3 access latency keeps rising; node network becomes congested; queries queue for a long time; and the situation eventually turns into query failures or obvious timeouts. + +Date: 2026-03-24 + +## Summary + +This proposal recommends rolling out in two incremental phases: + +1. Introduce a node-level `S3ReadLimiter` to cap total remote-read bandwidth on one node. +2. In `FileCache::get(...)`, add one bounded wait only when the same physical key already has an in-flight download, so followers prefer to reuse the existing download result instead of immediately falling back to `S3RandomAccessFile` direct reads. + +The first version deliberately does **not** do the following: + +- no query-level async read scheduler +- no new global miss coordination table +- no sleep-retry on `RejectTooManyDownloading` +- no complicated logic-file-type-specific branching inside `FileCache` + +The reason is straightforward: the only stable, reusable, and easily rollbackable seams in the current code are `S3RandomAccessFile`, `FileCache`, `ClientFactory/TiFlashS3Client`, and `IORateLimitConfig` / `IORateLimiter`. 
Once we put a hard node-level traffic guardrail in place and deduplicate same-key miss fan-out, we are already able to turn “an occasional network blow-up that makes queries fail” into “a bounded, observable, tunable problem”. + +## Context + +### 1. Current direct S3 reads issue `GetObject` immediately + +The current S3 open path is: + +```text +FileProvider::newRandomAccessFile + -> S3RandomAccessFile::create + -> FileCache::getRandomAccessFile + -> cache hit: return local file + -> cache miss: return nullptr + -> fallback: construct S3RandomAccessFile + -> initialize() immediately in constructor + -> client->GetObject(req) +``` + +Verified code locations: + +- `FileProvider::newRandomAccessFile` goes directly to `S3RandomAccessFile::create(...)` on the S3 path and does not use the incoming `read_limiter`: + `dbms/src/IO/FileProvider/FileProvider.cpp:35` +- `S3RandomAccessFile::create(...)` first tries `FileCache::getRandomAccessFile(...)`, then immediately constructs a remote reader on miss: + `dbms/src/Storages/S3/S3RandomAccessFile.cpp:345` +- `S3RandomAccessFile` calls `initialize("init file")` in its constructor: + `dbms/src/Storages/S3/S3RandomAccessFile.cpp:65` +- `initialize(...)` directly calls `client_ptr->GetObject(req)`: + `dbms/src/Storages/S3/S3RandomAccessFile.cpp:278` + +This means: + +- once `FileCache` misses, the caller immediately consumes a new S3 stream +- there is currently no node-level hard cap on total bandwidth or total remote-read concurrency + +### 2. 
Current `FileCache::get(...)` only inserts a background download and returns immediately on miss + +`FileCache::get(...)` currently has three key branches: + +- hit an existing `Complete` segment and return it directly: + `dbms/src/Storages/S3/FileCache.cpp:342` +- hit an existing segment that is not ready yet, record a miss, return `nullptr` immediately: + `dbms/src/Storages/S3/FileCache.cpp:351` +- key does not exist and caching is allowed: insert an `Empty` segment, schedule background download, return `nullptr` immediately: + `dbms/src/Storages/S3/FileCache.cpp:366` + +Current behavior can be summarized as: + +```text +The same key is already being downloaded + -> get() still returns nullptr + -> caller keeps direct-falling back to S3 + +First miss of one key + -> get() only inserts Empty + submits bg download + -> caller keeps direct-falling back to S3 +``` + +That is, today `FileCache` does not help query hot paths reuse an already running download. + +### 3. A wait primitive already exists, but only for the foreground-download path + +`FileSegment::waitForNotEmpty()` already exists, but it is a 30-second-granularity loop and is used only by `getOrWait(...)`: + +- `dbms/src/Storages/S3/FileCache.cpp:76` +- `dbms/src/Storages/S3/FileCache.cpp:411` + +`getOrWait(...)` means: + +- if key already exists: wait for the download result +- if key does not exist: perform a foreground download directly + +This API is suitable for vector index / inverted index / full-text index paths that must land on disk and then `mmap`, but not suitable for ordinary query read paths, because it turns the very first miss on the current thread into a synchronous download. + +### 4. 
Existing `IORateLimiter` cannot directly limit remote S3 reads + +Current read-side `IORateLimiter` implementation depends on `/proc//io` and `/proc//task//io` to sample real local-disk bytes: + +- `dbms/src/IO/BaseFile/RateLimiter.cpp:523` +- `dbms/src/IO/BaseFile/RateLimiter.cpp:601` +- `dbms/src/IO/BaseFile/RateLimiter.cpp:644` + +This works for local disks, not remote S3 traffic, for two reasons: + +1. S3 bytes never show up in `/proc/.../io` disk statistics. +2. For S3, the application already knows how many bytes it plans to read and how many bytes it actually read, so `/proc` sampling is unnecessary. + +Therefore remote reads need a dedicated limiter implementation instead of reusing the existing local-disk `ReadLimiter`. + +### 5. `S3RandomAccessFile` does not have access to `Context` + +Today `S3RandomAccessFile` only holds: + +- `TiFlashS3Client` +- `remote_fname` +- `ScanContext` + +Code location: + +- `dbms/src/Storages/S3/S3RandomAccessFile.h:43` + +It does not have a `Context` or `IORateLimiter` reference. Therefore, if we want a node-level remote limiter, we must first solve how the runtime object becomes reachable from the S3 read path. + +### 6. Under MetaV2, many logical small files are mapped to the physical `.merged` key + +This is the key fact that shapes the phase-2 design. 
+ +Under MetaV2 / DMFile V3: + +- min-max index is written into the merged file: + `dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp:312` +- mark is written into the merged file: + `dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp:345` +- some small data files are also merged into the merged file: + `dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.cpp:321` + +Correspondingly, `getReadFileSize(...)` returns the size of the entire merged object, not the logical sub-file size: + +- `dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.cpp:395` + +So the real access chain becomes: + +```text +Logically read mark / index / small data + -> resolve to merged_sub_file_infos + -> actually open N.merged + -> FileCache only sees the physical key: N.merged +``` + +The direct conclusion is: + +- `FileCache::get(...)` cannot reliably tell, from the physical key alone, whether this is a small mark/index read or a large merged-data read +- therefore the first version should not introduce complicated wait policies at the `FileCache` layer based on logical file categories + +## Goals + +- Prevent one TiFlash compute node from saturating its NIC because of cold S3 reads and thereby causing query failures. +- Put direct S3 reads and FileCache downloads under the same node-level backpressure instead of allowing the two paths to amplify independently. +- Reduce miss fan-out for the same physical key without changing the current synchronous read model. +- Keep the cache-hit fast path almost unchanged, especially with no new heap allocations or major CPU overhead when the limiter is disabled. +- Make every new behavior switchable, observable, and rollbackable online. + +## Non-Goals + +- No query-level async read scheduler. +- No cross-node coordination or global download registry. +- No sleep-retry for `RejectTooManyDownloading` in the first version. +- Do not turn the first miss into a foreground download in the first version; only deduplicate followers. 
+- Do not separately optimize metadata-burst during read-task build; the node-level limiter in phase 1 automatically covers these S3 reads. +- Do not add a separate timeout/cancel semantic for limiter waits in the first version. + +## Design + +### Phase 1: node-level S3 remote-read limiter + +#### Design decision + +Add a dedicated `S3ReadLimiter` that constrains: + +- total remote-read bytes per second on one node + +It covers two paths: + +- `S3RandomAccessFile` direct reads +- `FileCache::downloadImpl(...)` foreground/background downloads + +#### Why the first version keeps only byte-rate limiting + +We experimented with modeling the `GetObject` body lifetime as a `StreamToken`, and with using `s3_max_get_object_streams` to cap concurrent active streams. However, under real workloads and replay validation, this turned out to be an unsafe upper bound for the number of `S3RandomAccessFile` objects: + +- one `S3RandomAccessFile` may keep a remote body stream open across pipeline / scheduler gaps while not continuously transferring bytes +- for semantic consistency, the token must cover the entire body lifetime; once the limit is set too tightly, readers that hold a stream but are temporarily idle are still counted under the hard cap +- the result is that active streams stay pinned near the cap while real S3 throughput has already dropped, and subsequent readers remain blocked, which manifests as permanently stuck queries or no forward progress + +Therefore the first version keeps `s3_max_read_bytes_per_sec` as the deterministic node-level guardrail and removes `s3_max_get_object_streams` from both config and metrics. If we want to reintroduce a stream-dimension cap in the future, we first need a model that is closer to real network occupancy than a reader-lifetime token. 
+ +#### Configuration surface + +Continue to use `[storage.io_rate_limit]` and add one new field: + +- `s3_max_read_bytes_per_sec` + - node-level upper bound on total remote-read bandwidth + - uniformly covers `S3RandomAccessFile` read/forward seek and `FileCache` download + - `0` means disabled + +Suggested TOML form: + +```toml +[storage.io_rate_limit] +# Existing disk IO limiter configs... +# max_bytes_per_sec = 0 + +# Limit total S3 remote-read bandwidth on one compute node. +# 0 means disabled. +# s3_max_read_bytes_per_sec = 0 +``` + +#### Runtime ownership and integration points + +Recommended runtime object relationship: + +```text +storage.io_rate_limit + -> IORateLimitConfig + -> IORateLimiter handles parse / reload + -> owns a dedicated S3ReadLimiter + -> Server publishes it to ClientFactory during initialization + -> TiFlashS3Client holds shared_ptr + -> used directly by S3RandomAccessFile / FileCache::downloadImpl +``` + +Key constraints: + +- `S3ReadLimiter` is not a subclass of the current `ReadLimiter` + - it does not depend on `/proc/.../io` + - it throttles directly from application-known byte counts +- `IORateLimiter` still owns config reload entry + - but the remote limiter does not participate in current disk IO auto tune +- `ClientFactory` is responsible for publishing the current limiter to all `TiFlashS3Client` instances + - including the client created immediately at startup + - and clients lazily initialized later by compute nodes through write nodes + +This avoids introducing an unnatural `Context` dependency into `S3RandomAccessFile`. + +#### Behavioral semantics + +##### 1. 
Semantics of byte budget + +`s3_max_read_bytes_per_sec` accounts and limits real remote traffic: + +- bytes read by `S3RandomAccessFile::readImpl(...)` +- bytes discarded by forward skip in `S3RandomAccessFile::seekImpl(...)` +- bytes downloaded to local storage by `FileCache::downloadImpl(...)` + +It is a throttling mechanism, not fail-fast: + +- wait when the budget is exhausted +- continue once budget becomes available again + +#### Key implementation details + +##### 1. `S3RandomAccessFile` switches to chunked mode only when the limiter is enabled + +Current implementation: + +- `readImpl(...)` performs one `istream.read(buf, size)`: + `dbms/src/Storages/S3/S3RandomAccessFile.cpp:121` +- `seekImpl(...)` performs one `istream.ignore(delta)` on forward skip: + `dbms/src/Storages/S3/S3RandomAccessFile.cpp:195` + +If we simply add one `request(size)` / `request(delta)` before these calls, there are three problems: + +1. one read can accumulate an overly large burst +2. retry / EOF / stream error can make reserved budget diverge from actual bytes +3. forward seek can burst a large amount of traffic after obtaining one large budget chunk + +Therefore the first version should: + +- keep the current one-shot `read/ignore` fast path when the limiter is disabled +- switch to fixed-size chunk loops when the limiter is enabled + - for example 64 KiB or 128 KiB + - wait for budget before each chunk advances the stream + - no new heap allocation + +This requires: + +- `readImpl(...)` to become chunked `istream.read(...)` when the limiter is enabled +- `seekImpl(...)` forward skip to become chunked `istream.ignore(...)` when the limiter is enabled +- backward seek to keep the current semantics: re-`GetObject` plus a new body stream + +##### 2. `FileCache::downloadImpl(...)` reuses the same limiter without introducing a new abstraction layer + +Current download flow: + +```text +downloadImpl + -> client->GetObject(req) + -> downloadToLocal(result.GetBody(), ...) 
+ -> ReadBufferFromIStream + copyData(...) +``` + +Code locations: + +- `dbms/src/Storages/S3/FileCache.cpp:1013` +- `dbms/src/Storages/S3/FileCache.cpp:989` + +The first version does not need a new IO framework here. Keep it simple: + +- if limiter is disabled, continue to use the current download implementation +- if limiter is enabled, change `downloadToLocal(...)` into a fixed-buffer local loop + - read a chunk + - write to local file + - no extra heap allocations throughout the path + +This path is not the cache-hit hot path of a query, so one additional tiny branch relative to current `copyData(...)` is acceptable, but it should not introduce a new object graph. + +#### Compatibility and invariants + +- cache-hit read path stays unchanged +- when the limiter is disabled, behavior must remain identical to today +- existing `S3RandomAccessFile::initialize(...)` retry logic must be preserved +- remote limiter does not depend on query context +- remote limiter does not participate in current disk IO auto tune + +#### Observability + +At minimum, add these node-level metrics: + +- count and duration of waiting for byte budget +- charged bytes of direct reads +- charged bytes of FileCache downloads + +Keep and continue to use existing metrics: + +- `tiflash_storage_s3_request_seconds` +- `ProfileEvents::S3GetObject` +- `ProfileEvents::S3ReadBytes` +- existing disaggregated S3 statistics in `ScanContext` +- `CurrentMetrics::S3RandomAccessFile` + - this can continue to serve as an approximate guardrail for direct-read pressure + +### Phase 2: same-key deduplication in `FileCache::get(...)` + +#### Design decision + +The first version does exactly one thing: + +- when `FileCache::get(...)` finds that the same physical key already has an `Empty` segment, meaning someone is downloading it, allow the current thread to perform one very short bounded wait and try to reuse that result + +The first version intentionally does not do two things: + +- no wait strategy by 
logical file type +- no sleep-retry on `RejectTooManyDownloading` + +This is an intentional narrowing of scope: + +- `FileCache` can reliably see only physical keys +- under MetaV2, mark/index/small data often map to `.merged` +- queue-congestion retry is not very helpful for wide-table scenarios with many unique keys missing together +- phase 1 already solves the root cause of “the node network gets blown up”, so phase 2 can focus on the most reliable same-key deduplication only + +#### Configuration surface + +Add one dynamic setting near the existing `dt_filecache_*` settings: + +- `dt_filecache_wait_on_downloading_ms` + - `0` means disabled + - when non-zero, `FileCache::get(...)` waits up to this duration when it hits an existing `Empty` segment + +The recommended code default for the first version is `0`; when enabling in canary, start from a very small value such as `1~2ms`. + +#### Behavioral semantics + +Current behavior: + +```text +get(key) + -> found Empty segment + -> return nullptr + -> caller directly falls back to S3 +``` + +Behavior after the change: + +```text +get(key) + -> found Empty segment + -> release FileCache::mtx + -> waitForNotEmptyFor(dt_filecache_wait_on_downloading_ms) + -> Complete: return cached segment + -> Failed / timeout: return nullptr +``` + +Note two points: + +1. wait only when the same key already has an in-flight download +2. for the first miss where the current thread just inserted the `Empty` segment, keep the current behavior in the first version and return `nullptr` immediately + +In other words, the first version deduplicates followers, not leaders. 
+ +#### Local primitive to add + +Add on `FileSegment`: + +- `waitForNotEmptyFor(timeout_ms)` + +Code locations: + +- `dbms/src/Storages/S3/FileCache.h:90` +- `dbms/src/Storages/S3/FileCache.cpp:76` + +Requirements: + +- return the state observed at timeout end +- hold no `FileCache::mtx` during the entire wait +- allow existing `waitForNotEmpty()` to share internal logic with it and avoid duplication + +#### Why the first version does not add sleep-retry on `RejectTooManyDownloading` + +We do not adopt this because: + +1. it mainly helps when the queue is only briefly jittering +2. it is not very helpful for the main problem of wide-table cold reads where many unique keys miss together +3. it expands `FileCache::get(...)` from “same-key reuse” to “congestion guess + retry”, increasing complexity and tuning surface + +Therefore the first version keeps only the most robust layer: when there is already an owner downloading one key, let followers wait briefly. + +#### Compatibility and invariants + +- `dt_filecache_wait_on_downloading_ms = 0` must preserve current behavior +- no `FileCache::mtx` is held while waiting +- only `Complete` is treated as hit; both `Failed` and timeout are treated as miss +- existing `getOrWait(...)`, vector-index / inverted-index / full-text-index foreground download paths are out of scope for this change + +#### Observability + +At least add the following counters: + +- `wait_on_downloading` +- `wait_on_downloading_hit` +- `wait_on_downloading_timeout` +- `wait_on_downloading_failed` + +To support finer-grained benefit analysis and tuning, also add a low-cardinality metric set: + +- `wait_on_downloading_result{result, file_type}` +- `wait_on_downloading_bytes{result, file_type}` +- `wait_on_downloading_wait_seconds{result, file_type}` +- `bg_download_stage_seconds{stage, file_type}` +- `bg_downloading_count` + +Suggested fixed labels: + +- `result`: `hit` / `timeout` / `failed` +- `file_type`: `merged` / `coldata` / `other` +- `stage`: 
`queue_wait` / `download` + +Among them: + +- `wait_on_downloading_result{...}` is used for count-based hit ratio +- `wait_on_downloading_bytes{...}` is used for bytes-weighted hit ratio, to avoid overestimating benefit from many tiny merged hits +- `wait_on_downloading_wait_seconds{...}` is used to measure actual follower waiting cost +- `bg_download_stage_seconds{stage="queue_wait"}` is the time from `bgDownload(...)` submission until `bgDownloadExecutor(...)` actually starts running +- `bg_download_stage_seconds{stage="download"}` is the time from executor start until download completes or fails +- `bg_downloading_count` is the number of currently active background downloads + +This metric set must strictly avoid high cardinality: + +- no labels by key, table name, or DMFile id +- keep only coarse-grained categories such as `Merged vs ColData vs Other`, enough to support tuning + +Current code already has natural hook points and state for these metrics: + +- the queue-full decision that causes `RejectTooManyDownloading`: + `dbms/src/Storages/S3/FileCache.cpp:887` +- current background download count `bg_downloading_count`: + `dbms/src/Storages/S3/FileCache.h:471` + `dbms/src/Storages/S3/FileCache.cpp:1106` +- stopwatch around one download: + `dbms/src/Storages/S3/FileCache.cpp:1010` + +This metric set is sufficient to answer four questions: + +- whether bounded wait really absorbs direct fallback +- whether the benefit mainly comes from `Merged` or from `ColData` +- when count hit ratio looks promising, whether bytes-weighted hit ratio still holds +- whether timeout mainly comes from queueing too long or from the object download itself being too slow + +## Rejected Directions + +### 1. Distinguish wait policy at the `FileCache` layer by `Meta / Index / Mark / Merged / ColData` + +Not adopted. 
Under MetaV2, this mapping does not reliably hold at the `FileCache` layer: + +- mark/index/small data often map to the physical `.merged` +- `FileCache` only sees the physical key + +If we force this into the first version, the document would become more precise than the implementation, but the implementation still would not have accurate-enough information, and the final policy and metrics would both become misleading. + +### 2. Sleep-retry on `RejectTooManyDownloading` + +Not adopted because: + +- it cannot replace a node-level traffic cap +- it is not very useful for cold reads with many unique keys +- it adds hot-path branches and new tuning knobs + +If phase 2 later shows that same-key follower dedup already works well but queue jitter is still obvious, we can evaluate a third incremental change separately. + +## Incremental Plan + +1. Add S3 remote-read config into `IORateLimitConfig`, disabled by default. +2. Add `S3ReadLimiter`; let `IORateLimiter` handle reload and `ClientFactory` publish it to `TiFlashS3Client`. +3. Wire it into `S3RandomAccessFile` and `FileCache::downloadImpl(...)`, together with phase-1 metrics and unit tests. +4. Canary phase 1 on compute nodes first, with real machine-specific limits. +5. Add bounded wait to `FileSegment` / `FileCache::get(...)`, disabled by default. +6. After phase 1 metrics become stable, enable a very small `dt_filecache_wait_on_downloading_ms` in canary. + +### Recommended initial parameters + +For the first canary, the first goal is to make sure a node cannot saturate its network because of cold-read fan-out, rather than to maximize throughput from day one. + +- `s3_max_read_bytes_per_sec` + - start from `30%~50%` of the node's sustainable outbound bandwidth budget rather than the theoretical NIC peak + - for `10GbE` nodes, start from `300~500 MiB/s`, i.e. `314572800~524288000` + - for `25GbE` nodes, start from `800~1200 MiB/s`, i.e. 
`838860800~1258291200`
+ - if the same node also serves obvious MPP exchange, page-storage, S3 background tasks, or other outbound traffic, prefer the lower end of the range
+ - when increasing, change by only `10%~20%` each step
+
+- `dt_filecache_wait_on_downloading_ms`
+ - keep `0` in phase 1
+ - for the first phase-2 canary, start from `1`
+ - increase to `2` only after `wait_on_downloading_hit / wait_on_downloading` is clearly non-zero and timeout ratio is still low
+ - do not exceed `5` in the first version
+
+In phase 1, there is only one knob to stabilize: `s3_max_read_bytes_per_sec`. It directly determines how much sustained pressure a compute node can apply to S3 during a cold-read burst.
+
+### Rollout sequence
+
+Recommended four-step rollout, changing only one main variable each time:
+
+1. Single-node canary, phase 1 only
+ - suggested starting point:
+ - `s3_max_read_bytes_per_sec` = `30%~50%` of node budget
+ - `dt_filecache_wait_on_downloading_ms = 0`
+ - must cover at least one real cold-read peak or one reproducible wide-table cold-cache pressure test
+
+2. Small-batch canary, limited to one AZ, one tenant group, or no more than `5%` of compute nodes
+ - keep the config identical to the single-node canary in step 1
+ - if tuning is required, change only one parameter each time and observe at least one complete business peak window before the next change
+
+3. Expand to `20%~30%` of compute nodes
+ - only expand when phase 1 has already shown clear error reduction, node outbound traffic no longer sticks to the line, and S3 latency no longer keeps worsening
+
+4. 
Enable phase 2 on the same canary + - keep phase-1 config unchanged and add only `dt_filecache_wait_on_downloading_ms = 1` + - if phase-2 hit ratio is not obvious or timeout ratio is high, roll back to `0` without blocking phase-1 full rollout + +### Rollout criteria and rollback thresholds + +The following thresholds are recommendations for the first rollout, not contractual guarantees; their goal is to turn “whether to continue rollout” into an actionable decision. + +Before expanding phase 1, all of the following are recommended: + +- `CurrentMetrics::S3RandomAccessFile` no longer grows almost linearly with query concurrency + - “continuously above `900` for several minutes” is recommended as an alert line + - this is a trend/guardrail signal, not an exact token count +- P95/P99 of `tiflash_storage_s3_request_seconds{type="get_object"}` and `tiflash_storage_s3_request_seconds{type="read_stream"}` no longer keep worsening during bursts + - some slowdown relative to baseline is acceptable + - but if P99 degrades by more than `30%` at similar throughput and query errors do not clearly improve, rollout should not continue +- 1-minute peak node outbound traffic should stay below `85%` of the node budget + - if it still sticks near the link limit or shows obvious saw-tooth oscillation, the byte budget is still too high +- query errors caused by node network saturation should drop to `0` or near `0` + - if the error type does not clearly improve, the limiter likely did not hit the real bottleneck +- `tiflash_storage_remote_cache{type="dtfile_download_failed"}` should not rise significantly after enabling + +Phase 1 should be rolled back immediately or at least stop further rollout under any of the following conditions: + +- query error rate rises obviously, or new timeout/cancel errors increase significantly +- `CurrentMetrics::S3RandomAccessFile` still frequently approaches `1000`, indicating that direct-read pressure is still too high or there are unprotected paths 
+- P99 of `tiflash_storage_s3_request_seconds{type="get_object"}` or `{type="read_stream"}` keeps rising to around `2x` baseline and does not recover +- node outbound traffic still sticks to the line or turns into a new severe oscillation + +Before expanding phase 2, all of the following are recommended: + +- `wait_on_downloading_hit / wait_on_downloading >= 10%` + - if it stays below `5%` for a long time, the same-key follower dedup benefit is weak and keeping `0` is likely better +- bytes-weighted hit ratio should be stably non-zero; start with `5%` as an empirical threshold + - can be computed as `sum(wait_on_downloading_bytes{result="hit"}) / sum(wait_on_downloading_bytes)` + - if count hit ratio is high but bytes-weighted hit ratio stays below `5%`, the benefit mainly comes from small metadata / merged files and should not be expected to significantly reduce the main data traffic +- `wait_on_downloading_timeout / wait_on_downloading <= 20%` + - if timeout ratio is high, the wait window is larger than the real reusable window +- `wait_on_downloading_failed` remains at a very low level +- after enabling `1ms` wait, query P99 does not worsen by more than `10%` +- `tiflash_storage_remote_cache{type="dtfile_too_many_download"}` does not continue to rise, or direct-fallback spikes show signs of dropping +- split results between `Merged` and `ColData` match workload expectations + - if `merged` hit is high but `coldata` is low, phase 2 mainly absorbs MetaV2 small-file fallback; this still has value, but network relief is usually smaller than the count ratio suggests + - if the goal is to further reduce main data traffic, focus on `coldata` hit/timeout/bytes rather than only total hit ratio +- `bg_download_stage_seconds{stage="queue_wait"}` should not stay significantly above `stage="download"}` for a long time + - if queue wait dominates timeout, first inspect download queue size, concurrent-download config, and `RejectTooManyDownloading` pressure instead of 
increasing `dt_filecache_wait_on_downloading_ms` + - if actual download dominates timeout and timeout is concentrated on `coldata`, the benefit ceiling of phase 2 is inherently limited and the wait window should not keep growing in order to force more benefit + +Existing code locations for the current observability basis: + +- `CurrentMetrics::S3RandomAccessFile`: `dbms/src/Common/CurrentMetrics.cpp:88` +- `tiflash_storage_s3_request_seconds`: `dbms/src/Common/TiFlashMetrics.h:779` +- `tiflash_storage_remote_cache`: `dbms/src/Common/TiFlashMetrics.h:875` + +## Validation Strategy + +### Unit tests + +- Phase 1 + - `s3_max_read_bytes_per_sec` limits both direct reads and downloads + - behavior remains unchanged when the limiter is disabled + - lazily created S3 clients on compute nodes still pick up the current limiter + +- Phase 2 + - timeout / success / failed semantics of `waitForNotEmptyFor(...)` + - `FileCache::get(...)` can directly return the cached file via bounded wait when the key already has an `Empty` segment + - behavior remains unchanged when `dt_filecache_wait_on_downloading_ms = 0` + - no `FileCache::mtx` is held during waiting + - leader behavior on the first miss stays unchanged + - observability classification for `Merged` / `ColData` / `Other` is correct + - `wait_on_downloading_bytes{...}` matches actual waited file size accounting + - `bg_download_stage_seconds{stage="queue_wait|download"}` is accounted correctly on both success and failure paths + +### Integration tests + +- cold cache, wide table, many columns read simultaneously +- multiple threads reading the same physical `.merged` key concurrently +- node-level limiter still caps peak traffic when direct read and FileCache download coexist +- under MetaV2, mark / index / data reads all map to the `.merged` key path + +### Production success criteria + +- after phase 1 rollout, node peak outbound traffic stays stably below the budget and no longer continuously sticks near the link limit +- 
query failures caused by node network saturation drop to `0` or near `0` +- P95 / P99 of `tiflash_storage_s3_request_seconds{type="get_object"}` and `{type="read_stream"}` no longer keep worsening during bursts +- if phase 2 is enabled, `wait_on_downloading_hit / wait_on_downloading` should at least reach a two-digit percentage and timeout ratio should remain low +- if phase 2 is enabled, bytes-weighted hit ratio should stay stably non-zero, and the `Merged vs ColData` split should explain where the benefit comes from rather than leaving only one total hit ratio +- if timeout mainly comes from queue wait, optimize download queueing first; if timeout mainly comes from actual download, accept the benefit ceiling of phase 2 on large `ColData` files instead of treating it as a replacement for phase 1 + +## Risks and Mitigations + +- limit set too low and query latency rises + - mitigation: disabled by default; size per node; roll out in phases + +- direct read and FileCache download share the same limit and may squeeze each other + - mitigation: share one budget in the first version and do not add more layering yet; decide whether to split only after observing `source`-dimension metrics + +- bounded wait hit ratio is low and only adds tail latency + - mitigation: default `0`; start from a tiny value such as `1~2ms`; trigger only when the same key already has an in-flight download + +- new CPU overhead after enabling the limiter + - mitigation: use chunked paths only when the limiter is enabled; keep current fast path when disabled + +## Open Questions + +- if we want to constrain “active remote body streams” again in the future, what model is closer to actual network occupancy than a reader-lifetime token? +- if phase 2 hit ratio is already respectable but direct-fallback spikes still happen after `RejectTooManyDownloading`, do we need a third phase with queue-full retry? 
From 50a5e62e2af45673d913f9c030943110188c8f26 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 21:32:55 +0800 Subject: [PATCH 17/36] disagg: clarify shared limiter test semantics --- dbms/src/Common/TiFlashMetrics.h | 4 ++-- dbms/src/Storages/S3/FileCache.cpp | 3 +-- dbms/src/Storages/S3/tests/gtest_filecache.cpp | 9 ++++++++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/dbms/src/Common/TiFlashMetrics.h b/dbms/src/Common/TiFlashMetrics.h index 8a00e118e83..656ccc6f328 100644 --- a/dbms/src/Common/TiFlashMetrics.h +++ b/dbms/src/Common/TiFlashMetrics.h @@ -385,8 +385,8 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_fg_write_alloc_bytes, {"type", "fg_write_alloc_bytes"}), \ F(type_bg_write_req_bytes, {"type", "bg_write_req_bytes"}), \ F(type_bg_write_alloc_bytes, {"type", "bg_write_alloc_bytes"}), \ - F(type_s3_direct_read_bytes, {{"type", "s3_direct_read_bytes"}}), \ - F(type_s3_filecache_download_bytes, {{"type", "s3_filecache_download_bytes"}})) \ + F(type_s3_direct_read_bytes, {"type", "s3_direct_read_bytes"}), \ + F(type_s3_filecache_download_bytes, {"type", "s3_filecache_download_bytes"})) \ M(tiflash_storage_io_limiter_curr, \ "Current limit bytes per second of Storage I/O limiter", \ Gauge, \ diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 85472d76cbc..c4e38438c3b 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -47,7 +47,6 @@ #include #include #include -#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" @@ -639,7 +638,7 @@ FileSegmentPtr FileCache::get(const S3::S3FilenameView & s3_fname, const std::op file_type); table.set(s3_key, file_seg); } - } // Release the lock before submiting bg download task. Because bgDownload may be blocked when the queue is full. + } // Release the lock before submitting bg download task. 
Because bgDownload may be blocked when the queue is full. if (wait_ms != 0) { diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index 1c82e60b6b9..dca46f8bb6c 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -1256,7 +1256,7 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingReturnsMissWhenDownloaderFails) ASSERT_EQ(file_seg->getSize(), objects[0].size); } -TEST_F(FileCacheTest, BgDownloadRespectsS3StreamLimiter) +TEST_F(FileCacheTest, BgDownloadWorksWithSharedS3ReadLimiter) { auto cache_dir = fmt::format("{}/bg_download_limiter", tmp_dir); StorageRemoteCacheConfig cache_config{.dir = cache_dir, .capacity = cache_capacity, .dtfile_level = 100}; @@ -1269,6 +1269,10 @@ TEST_F(FileCacheTest, BgDownloadRespectsS3StreamLimiter) settings.dt_filecache_max_downloading_count_scale = 2.0; file_cache.updateConfig(settings); + // This test is kept as a regression check for the shared-limiter plumbing on the FileCache download + // path. Stream-based limiting has been removed, and the byte limit is intentionally disabled here, + // so the expectation is simply that background downloads still make progress and do not deadlock when + // a shared S3ReadLimiter object is attached to the client. auto limiter = std::make_shared(0, 1); s3_client->setS3ReadLimiter(limiter); SCOPE_EXIT({ s3_client->setS3ReadLimiter(nullptr); }); @@ -1276,9 +1280,12 @@ TEST_F(FileCacheTest, BgDownloadRespectsS3StreamLimiter) auto objects = genObjects(/*store_count*/ 1, /*table_count*/ 1, /*file_count*/ 1, {"3.merged", "4.merged"}); auto sp_download = SyncPointCtl::enableInScope("before_FileCache::downloadImpl_download_to_local"); + // Start one background download and pause it at the download-to-local boundary. 
ASSERT_EQ(file_cache.get(S3FilenameView::fromKey(objects[0].key), objects[0].size), nullptr); sp_download.waitAndPause(); + // Submit a second download while the first one is paused. The test passes as long as both downloads + // complete normally after the pause is released. ASSERT_EQ(file_cache.get(S3FilenameView::fromKey(objects[1].key), objects[1].size), nullptr); std::this_thread::sleep_for(50ms); From 97036bf21093cf2e143e04085e46792a83304cd2 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sat, 4 Apr 2026 21:48:35 +0800 Subject: [PATCH 18/36] disagg: keep S3 limiter object across disable reload --- dbms/src/IO/BaseFile/RateLimiter.cpp | 9 +++------ dbms/src/Storages/S3/tests/gtest_s3client.cpp | 6 ++++++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/dbms/src/IO/BaseFile/RateLimiter.cpp b/dbms/src/IO/BaseFile/RateLimiter.cpp index e1547450df8..18cf8e89dfb 100644 --- a/dbms/src/IO/BaseFile/RateLimiter.cpp +++ b/dbms/src/IO/BaseFile/RateLimiter.cpp @@ -527,13 +527,10 @@ void IORateLimiter::updateLimiterByConfig(const IORateLimitConfig & cfg) updateWriteLimiter(cfg.getBgWriteMaxBytesPerSec(), cfg.getFgWriteMaxBytesPerSec()); // updateS3ReadLimiter - if (cfg.s3_max_read_bytes_per_sec == 0) + if (s3_read_limiter == nullptr) { - s3_read_limiter = nullptr; - } - else if (s3_read_limiter == nullptr) - { - s3_read_limiter = std::make_shared(cfg.s3_max_read_bytes_per_sec); + if (cfg.s3_max_read_bytes_per_sec != 0) + s3_read_limiter = std::make_shared(cfg.s3_max_read_bytes_per_sec); } else { diff --git a/dbms/src/Storages/S3/tests/gtest_s3client.cpp b/dbms/src/Storages/S3/tests/gtest_s3client.cpp index 99c40f168a9..977b5509cd0 100644 --- a/dbms/src/Storages/S3/tests/gtest_s3client.cpp +++ b/dbms/src/Storages/S3/tests/gtest_s3client.cpp @@ -270,6 +270,12 @@ TEST_F(S3ClientTest, PublishS3ReadLimiter) ClientFactory::instance().setS3ReadLimiter(published); ASSERT_EQ(ClientFactory::instance().sharedTiFlashClient()->getS3ReadLimiter(), published); 
ASSERT_EQ(published->maxReadBytesPerSec(), 8192); + + cfg.s3_max_read_bytes_per_sec = 0; + io_rate_limiter.updateLimiterByConfig(cfg); + auto disabled = io_rate_limiter.getS3ReadLimiter(); + ASSERT_EQ(disabled, published); + ASSERT_EQ(disabled->maxReadBytesPerSec(), 0); } TEST_F(S3ClientTest, ListPrefixEarlyStopOnTruncatedResult) From ff09ede1516f34b33fe8665ba63b5c6e0d5144fc Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sun, 5 Apr 2026 01:51:53 +0800 Subject: [PATCH 19/36] disagg: address review follow-up fixes --- dbms/src/Common/TiFlashMetrics.cpp | 11 +++++++++++ dbms/src/IO/BaseFile/RateLimiter.cpp | 6 ++++++ .../src/IO/BaseFile/tests/gtest_rate_limiter.cpp | 5 +++++ dbms/src/Storages/S3/S3RandomAccessFile.cpp | 3 ++- dbms/src/Storages/S3/tests/gtest_filecache.cpp | 16 ++++++++++------ dbms/src/Storages/S3/tests/gtest_s3client.cpp | 3 +++ ...ode-level-backpressure-and-filecache-dedup.md | 2 +- 7 files changed, 38 insertions(+), 8 deletions(-) diff --git a/dbms/src/Common/TiFlashMetrics.cpp b/dbms/src/Common/TiFlashMetrics.cpp index cda98983c5e..dc5332610d4 100644 --- a/dbms/src/Common/TiFlashMetrics.cpp +++ b/dbms/src/Common/TiFlashMetrics.cpp @@ -30,6 +30,17 @@ constexpr std::array remote_cache_reject_reason_labels = {"too_many_download"}; constexpr std::array remote_cache_download_stage_labels = {"queue_wait", "download"}; constexpr auto remote_cache_wait_on_downloading_buckets = ExpBuckets{0.0001, 2, 20}; constexpr auto remote_cache_bg_download_stage_buckets = ExpBuckets{0.0001, 2, 20}; + +static_assert( + remote_cache_file_type_labels.size() == static_cast(TiFlashMetrics::RemoteCacheFileTypeMetric::Count)); +static_assert( + remote_cache_wait_result_labels.size() == static_cast(TiFlashMetrics::RemoteCacheWaitResultMetric::Count)); +static_assert( + remote_cache_reject_reason_labels.size() + == static_cast(TiFlashMetrics::RemoteCacheRejectReasonMetric::Count)); +static_assert( + remote_cache_download_stage_labels.size() + == 
static_cast(TiFlashMetrics::RemoteCacheDownloadStageMetric::Count)); } // namespace TiFlashMetrics & TiFlashMetrics::instance() diff --git a/dbms/src/IO/BaseFile/RateLimiter.cpp b/dbms/src/IO/BaseFile/RateLimiter.cpp index 18cf8e89dfb..ab6b2c9af87 100644 --- a/dbms/src/IO/BaseFile/RateLimiter.cpp +++ b/dbms/src/IO/BaseFile/RateLimiter.cpp @@ -530,7 +530,11 @@ void IORateLimiter::updateLimiterByConfig(const IORateLimitConfig & cfg) if (s3_read_limiter == nullptr) { if (cfg.s3_max_read_bytes_per_sec != 0) + { s3_read_limiter = std::make_shared(cfg.s3_max_read_bytes_per_sec); + if (stop.load(std::memory_order_relaxed)) + s3_read_limiter->setStop(); + } } else { @@ -703,6 +707,8 @@ void IORateLimiter::setStop() auto sz = fg_read_limiter->setStop(); LOG_DEBUG(log, "fg_read_limiter setStop request size {}", sz); } + if (s3_read_limiter != nullptr) + s3_read_limiter->setStop(); } void IORateLimiter::runAutoTune() diff --git a/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp b/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp index 8d91acfe979..0a438da8ba0 100644 --- a/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp +++ b/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp @@ -392,11 +392,16 @@ TEST(S3ReadLimiterTest, UpdateConfigDisablesWaitingBytes) // Exhaust the initial burst, then make sure disabling the byte limit wakes a waiting requester promptly. 
limiter.requestBytes(100, S3::S3ReadSource::DirectRead); + std::promise waiter_started; + auto waiter_started_future = waiter_started.get_future(); auto future = std::async(std::launch::async, [&]() { AtomicStopwatch watch; + waiter_started.set_value(); limiter.requestBytes(100, S3::S3ReadSource::DirectRead); return watch.elapsedMilliseconds(); }); + + ASSERT_EQ(waiter_started_future.wait_for(1s), std::future_status::ready); ASSERT_EQ(future.wait_for(50ms), std::future_status::timeout); limiter.updateConfig(/*max_read_bytes_per_sec*/ 0); diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.cpp b/dbms/src/Storages/S3/S3RandomAccessFile.cpp index 604c2a672ba..181cefbd8ee 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.cpp +++ b/dbms/src/Storages/S3/S3RandomAccessFile.cpp @@ -75,11 +75,12 @@ S3RandomAccessFile::S3RandomAccessFile( : client_ptr(std::move(client_ptr_)) , remote_fname(remote_fname_) , cur_offset(0) - , read_limiter(client_ptr->getS3ReadLimiter()) + , read_limiter(nullptr) , log(Logger::get(remote_fname)) , scan_context(scan_context_) { RUNTIME_CHECK(client_ptr != nullptr); + read_limiter = client_ptr->getS3ReadLimiter(); initialize("init file"); CurrentMetrics::add(CurrentMetrics::S3RandomAccessFile); } diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index dca46f8bb6c..5039ce81c09 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -34,6 +34,8 @@ #include #include +#include + #include #include #include @@ -1230,12 +1232,14 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingReturnsMissWhenDownloaderFails) // The follower reaches `get()` while the same key is still being downloaded. Inject a failure right before // the downloader starts copying the body so the follower wakes up with `Status::Failed` and returns miss. 
- FailPointHelper::enableFailPoint(FailPoints::file_cache_bg_download_fail); - auto wait_failed = std::async(std::launch::async, [&]() { return file_cache.get(key, objects[0].size); }); - std::this_thread::sleep_for(20ms); - sp_download.next(); - ASSERT_EQ(wait_failed.get(), nullptr); - FailPointHelper::disableFailPoint(FailPoints::file_cache_bg_download_fail); + { + FailPointHelper::enableFailPoint(FailPoints::file_cache_bg_download_fail); + SCOPE_EXIT({ FailPointHelper::disableFailPoint(FailPoints::file_cache_bg_download_fail); }); + auto wait_failed = std::async(std::launch::async, [&]() { return file_cache.get(key, objects[0].size); }); + std::this_thread::sleep_for(20ms); + sp_download.next(); + ASSERT_EQ(wait_failed.get(), nullptr); + } sp_download.disable(); waitForBgDownload(file_cache); diff --git a/dbms/src/Storages/S3/tests/gtest_s3client.cpp b/dbms/src/Storages/S3/tests/gtest_s3client.cpp index 977b5509cd0..b18daa41c3f 100644 --- a/dbms/src/Storages/S3/tests/gtest_s3client.cpp +++ b/dbms/src/Storages/S3/tests/gtest_s3client.cpp @@ -256,6 +256,9 @@ CATCH TEST_F(S3ClientTest, PublishS3ReadLimiter) { + auto prev_limiter = ClientFactory::instance().sharedTiFlashClient()->getS3ReadLimiter(); + SCOPE_EXIT({ ClientFactory::instance().setS3ReadLimiter(prev_limiter); }); + auto limiter = std::make_shared(4096, 7); ClientFactory::instance().setS3ReadLimiter(limiter); ASSERT_EQ(client->getS3ReadLimiter(), limiter); diff --git a/docs/design/2026-03-24-disagg-s3-node-level-backpressure-and-filecache-dedup.md b/docs/design/2026-03-24-disagg-s3-node-level-backpressure-and-filecache-dedup.md index 00c1614be81..9d9bebfc691 100644 --- a/docs/design/2026-03-24-disagg-s3-node-level-backpressure-and-filecache-dedup.md +++ b/docs/design/2026-03-24-disagg-s3-node-level-backpressure-and-filecache-dedup.md @@ -596,7 +596,7 @@ Before expanding phase 2, all of the following are recommended: - split results between `Merged` and `ColData` match workload expectations - if `merged` 
hit is high but `coldata` is low, phase 2 mainly absorbs MetaV2 small-file fallback; this still has value, but network relief is usually smaller than the count ratio suggests - if the goal is to further reduce main data traffic, focus on `coldata` hit/timeout/bytes rather than only total hit ratio -- `bg_download_stage_seconds{stage="queue_wait"}` should not stay significantly above `stage="download"}` for a long time +- `bg_download_stage_seconds{stage="queue_wait"}` should not stay significantly above `bg_download_stage_seconds{stage="download"}` for a long time - if queue wait dominates timeout, first inspect download queue size, concurrent-download config, and `RejectTooManyDownloading` pressure instead of increasing `dt_filecache_wait_on_downloading_ms` - if actual download dominates timeout and timeout is concentrated on `coldata`, the benefit ceiling of phase 2 is inherently limited and the wait window should not keep growing in order to force more benefit From 88f63dc3be87eb60224cb40e6c837be8ebf7e9c2 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sun, 5 Apr 2026 02:05:48 +0800 Subject: [PATCH 20/36] disagg: avoid large-request stalls in S3 limiter --- .../IO/BaseFile/tests/gtest_rate_limiter.cpp | 11 +++++++++++ dbms/src/Storages/S3/S3RandomAccessFile.cpp | 5 +++++ dbms/src/Storages/S3/S3ReadLimiter.cpp | 18 +++++++++++++++--- dbms/src/Storages/S3/S3ReadLimiter.h | 12 +++++++++++- 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp b/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp index 0a438da8ba0..c3dee082cb9 100644 --- a/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp +++ b/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp @@ -421,6 +421,17 @@ TEST(S3ReadLimiterTest, SuggestedChunkSizeTracksBurstLimit) ASSERT_EQ(limiter.getSuggestedChunkSize(4096), 4096); } +TEST(S3ReadLimiterTest, LargeRequestDoesNotWaitForever) +{ + S3::S3ReadLimiter limiter(/*max_read_bytes_per_sec*/ 1000, 
/*refill_period_ms*/ 100); + + // The initial burst is only 100 bytes, but callers that request a larger chunk should still make + // forward progress instead of waiting forever for a budget that can never accumulate. + AtomicStopwatch watch; + limiter.requestBytes(128 * 1024, S3::S3ReadSource::DirectRead); + ASSERT_LT(watch.elapsedMilliseconds(), 200); +} + #ifdef __linux__ TEST(IORateLimiterTest, IOStat) { diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.cpp b/dbms/src/Storages/S3/S3RandomAccessFile.cpp index 181cefbd8ee..3e60efc1702 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.cpp +++ b/dbms/src/Storages/S3/S3RandomAccessFile.cpp @@ -145,6 +145,9 @@ ssize_t S3RandomAccessFile::readChunked(char * buf, size_t size) ProfileEvents::increment(ProfileEvents::S3IORead, 1); auto & istr = read_result.GetBody(); + // Use the limiter-suggested step so one large logical read is split into smoother refill-period- + // sized chunks. That keeps `requestBytes()` on its strict path for normal reads and only falls + // back to borrowing semantics for requests that are unavoidably larger than one burst. const auto chunk_size = read_limiter->getSuggestedChunkSize(s3_read_limiter_preferred_chunk_size); size_t total_gcount = 0; while (total_gcount < size) @@ -282,6 +285,8 @@ off_t S3RandomAccessFile::seekChunked(off_t offset) Stopwatch sw; ProfileEvents::increment(ProfileEvents::S3IOSeek, 1); auto & istr = read_result.GetBody(); + // Use the same chunk heuristic as readChunked() so forward seeks do not turn into one oversized + // limiter request when skipping a large remote range. 
const auto chunk_size = read_limiter->getSuggestedChunkSize(s3_read_limiter_preferred_chunk_size); size_t total_ignored = 0; const auto bytes_to_ignore = static_cast(offset - cur_offset); diff --git a/dbms/src/Storages/S3/S3ReadLimiter.cpp b/dbms/src/Storages/S3/S3ReadLimiter.cpp index a67700d8d63..da982323d0a 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.cpp +++ b/dbms/src/Storages/S3/S3ReadLimiter.cpp @@ -99,9 +99,21 @@ void DB::S3::S3ReadLimiter::requestBytes(UInt64 bytes, S3ReadSource source) const auto now = Clock::now(); refillBytesLocked(now); - if (available_bytes >= static_cast(bytes)) + const auto requested_bytes = static_cast(bytes); + const auto burst_bytes = static_cast(burstBytesPerPeriod(current_limit)); + if (available_bytes >= requested_bytes) { - available_bytes -= static_cast(bytes); + available_bytes -= requested_bytes; + return; + } + + // Preserve the strict token-bucket behavior for requests that fit into one burst. When one + // caller asks for more than the bucket can ever accumulate, allow it to borrow once some + // budget is available so the request still makes forward progress. Upper layers are expected + // to call getSuggestedChunkSize() and keep this branch rare. + if (requested_bytes > burst_bytes && available_bytes > 0) + { + available_bytes -= requested_bytes; return; } @@ -113,7 +125,7 @@ void DB::S3::S3ReadLimiter::requestBytes(UInt64 bytes, S3ReadSource source) // Sleep only for the missing budget instead of a fixed interval so large readers converge quickly // after budget becomes available again. 
- const auto missing = static_cast(bytes) - available_bytes; + const auto missing = requested_bytes - available_bytes; const auto wait_us = std::max(1, static_cast(missing * 1000000.0 / static_cast(current_limit))); bytes_cv.wait_for(lock, std::chrono::microseconds(wait_us)); diff --git a/dbms/src/Storages/S3/S3ReadLimiter.h b/dbms/src/Storages/S3/S3ReadLimiter.h index 995fa8efe31..72c0a3701c5 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.h +++ b/dbms/src/Storages/S3/S3ReadLimiter.h @@ -60,9 +60,19 @@ class S3ReadLimiter void updateConfig(UInt64 max_read_bytes_per_sec_); /// Charge remote-read bytes. The call blocks when the current node-level budget is exhausted. + /// + /// Requests that fit within one refill-period burst keep strict token-bucket semantics. If one + /// caller asks for more than a single burst can ever accumulate, the limiter allows that request + /// to borrow against future refills once some positive budget is available so the caller does not + /// wait forever. void requestBytes(UInt64 bytes, S3ReadSource source); - /// Suggest a chunk size that keeps limiter-enabled readers from creating large bursts. + /// Suggest a chunk size for limiter-aware loops in upper layers. + /// + /// Callers should prefer this value before each `read()` / `ignore()` / buffer refill so large + /// remote reads are naturally split into refill-period-sized steps. Keeping chunks near one burst + /// preserves smooth throttling and makes the large-request borrowing path in `requestBytes()` a + /// rare fallback instead of the common case. 
UInt64 getSuggestedChunkSize(UInt64 preferred_chunk_size) const; UInt64 maxReadBytesPerSec() const { return max_read_bytes_per_sec.load(std::memory_order_relaxed); } From fe5b6d942d395c29bd2590c6bde65a16a34d1b86 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sun, 5 Apr 2026 02:49:34 +0800 Subject: [PATCH 21/36] Format codes Signed-off-by: JaySon-Huang --- dbms/src/Storages/S3/tests/gtest_filecache.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index 5039ce81c09..0b8d62e4823 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -34,10 +34,9 @@ #include #include -#include - #include #include +#include #include #include #include From a1a397dcd8cd2ca9290fa9cce55f53a94d844b74 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sun, 5 Apr 2026 13:05:34 +0800 Subject: [PATCH 22/36] disagg: decouple S3 read accounting from throttling --- dbms/src/Storages/S3/FileCache.cpp | 16 +++++++++-- dbms/src/Storages/S3/MockS3Client.h | 6 ++-- dbms/src/Storages/S3/S3Common.cpp | 31 ++++++++++++++++---- dbms/src/Storages/S3/S3Common.h | 32 ++++++++++++++++++--- dbms/src/Storages/S3/S3RandomAccessFile.cpp | 6 ++++ dbms/src/Storages/S3/S3RandomAccessFile.h | 4 +-- dbms/src/Storages/S3/S3ReadLimiter.cpp | 28 +++++++++++------- dbms/src/Storages/S3/S3ReadLimiter.h | 9 +++++- dbms/src/Storages/S3/S3ReadLimiter_fwd.h | 25 ++++++++++++++++ 9 files changed, 130 insertions(+), 27 deletions(-) create mode 100644 dbms/src/Storages/S3/S3ReadLimiter_fwd.h diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index c4e38438c3b..526e257ac79 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -1254,7 +1254,8 @@ void downloadToLocal( const String & fname, Int64 content_length, const WriteLimiterPtr & write_limiter, - const std::shared_ptr & 
s3_read_limiter) + const std::shared_ptr & s3_read_limiter, + const std::shared_ptr & s3_read_metrics_recorder) { // create an empty file with write_limiter // each time `ofile.write` is called, the write speed will be controlled by the write_limiter. @@ -1271,6 +1272,8 @@ void downloadToLocal( ReadBufferFromIStream rbuf(istr, buffer_size); WriteBufferFromWritableFile wbuf(ofile, buffer_size); copyData(rbuf, wbuf, content_length); + if (s3_read_metrics_recorder != nullptr) + s3_read_metrics_recorder->recordBytes(rbuf.count(), S3::S3ReadSource::FileCacheDownload); wbuf.sync(); return; } @@ -1280,6 +1283,8 @@ void downloadToLocal( ReadBufferFromIStreamWithLimiter rbuf(istr, buffer_size, s3_read_limiter, S3::S3ReadSource::FileCacheDownload); WriteBufferFromWritableFile wbuf(ofile, buffer_size); copyData(rbuf, wbuf, content_length); + if (s3_read_metrics_recorder != nullptr) + s3_read_metrics_recorder->recordBytes(rbuf.count(), S3::S3ReadSource::FileCacheDownload); wbuf.sync(); } @@ -1288,6 +1293,7 @@ void FileCache::downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, c Stopwatch sw; auto client = S3::ClientFactory::instance().sharedTiFlashClient(); auto s3_read_limiter = client->getS3ReadLimiter(); + auto s3_read_metrics_recorder = client->getS3ReadMetricsRecorder(); Aws::S3::Model::GetObjectRequest req; client->setBucketAndKeyWithRoot(req, s3_key); ProfileEvents::increment(ProfileEvents::S3GetObject); @@ -1320,7 +1326,13 @@ void FileCache::downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, c auto temp_fname = toTemporaryFilename(local_fname); SYNC_FOR("before_FileCache::downloadImpl_download_to_local"); FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::file_cache_bg_download_fail); - downloadToLocal(result.GetBody(), temp_fname, content_length, write_limiter, s3_read_limiter); + downloadToLocal( + result.GetBody(), + temp_fname, + content_length, + write_limiter, + s3_read_limiter, + s3_read_metrics_recorder); std::filesystem::rename(temp_fname, 
local_fname); #ifndef NDEBUG diff --git a/dbms/src/Storages/S3/MockS3Client.h b/dbms/src/Storages/S3/MockS3Client.h index 2a22168a4c8..4a93a5b69fe 100644 --- a/dbms/src/Storages/S3/MockS3Client.h +++ b/dbms/src/Storages/S3/MockS3Client.h @@ -30,7 +30,8 @@ class MockS3Client final : public S3::TiFlashS3Client const String & root, const Aws::Auth::AWSCredentials & cred, const Aws::Client::ClientConfiguration & cfg, - std::shared_ptr s3_read_limiter = nullptr) + std::shared_ptr s3_read_limiter = nullptr, + std::shared_ptr s3_read_metrics_recorder = nullptr) : TiFlashS3Client( bucket, root, @@ -38,7 +39,8 @@ class MockS3Client final : public S3::TiFlashS3Client cfg, Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, false, - std::move(s3_read_limiter)) + std::move(s3_read_limiter), + std::move(s3_read_metrics_recorder)) {} ~MockS3Client() override = default; diff --git a/dbms/src/Storages/S3/S3Common.cpp b/dbms/src/Storages/S3/S3Common.cpp index acbc39573f3..8c46828b56e 100644 --- a/dbms/src/Storages/S3/S3Common.cpp +++ b/dbms/src/Storages/S3/S3Common.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -183,11 +184,13 @@ TiFlashS3Client::TiFlashS3Client( const Aws::Client::ClientConfiguration & clientConfiguration, Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy signPayloads, bool useVirtualAddressing, - std::shared_ptr s3_read_limiter_) + std::shared_ptr s3_read_limiter_, + std::shared_ptr s3_read_metrics_recorder_) : Aws::S3::S3Client(credentials, clientConfiguration, signPayloads, useVirtualAddressing) , bucket_name(bucket_name_) , key_root(normalizedRoot(root_)) , s3_read_limiter(std::move(s3_read_limiter_)) + , s3_read_metrics_recorder(std::move(s3_read_metrics_recorder_)) , log(Logger::get(fmt::format("bucket={} root={}", bucket_name, key_root))) {} @@ -195,11 +198,13 @@ TiFlashS3Client::TiFlashS3Client( const String & bucket_name_, const String & root_, std::unique_ptr && raw_client, - std::shared_ptr 
s3_read_limiter_) + std::shared_ptr s3_read_limiter_, + std::shared_ptr s3_read_metrics_recorder_) : Aws::S3::S3Client(std::move(*raw_client)) , bucket_name(bucket_name_) , key_root(normalizedRoot(root_)) , s3_read_limiter(std::move(s3_read_limiter_)) + , s3_read_metrics_recorder(std::move(s3_read_metrics_recorder_)) , log(Logger::get(fmt::format("bucket={} root={}", bucket_name, key_root))) {} @@ -336,6 +341,8 @@ void ClientFactory::init(const StorageS3Config & config_, bool mock_s3_) return; config = config_; + if (shared_s3_read_metrics_recorder == nullptr) + shared_s3_read_metrics_recorder = std::make_shared(); RUNTIME_CHECK(!config.root.starts_with("//"), config.root); config.root = normalizedRoot(config.root); @@ -353,7 +360,8 @@ void ClientFactory::init(const StorageS3Config & config_, bool mock_s3_) config.bucket, config.root, std::move(s3_client), - shared_s3_read_limiter); + shared_s3_read_limiter, + shared_s3_read_metrics_recorder); } else { @@ -362,7 +370,13 @@ void ClientFactory::init(const StorageS3Config & config_, bool mock_s3_) cfg.region = Aws::Region::US_EAST_1; // default region Aws::Auth::AWSCredentials cred("mock_access_key", "mock_secret_key"); shared_tiflash_client - = std::make_unique(config.bucket, config.root, cred, cfg, shared_s3_read_limiter); + = std::make_unique( + config.bucket, + config.root, + cred, + cfg, + shared_s3_read_limiter, + shared_s3_read_metrics_recorder); } client_is_inited = true; // init finish } @@ -384,6 +398,8 @@ std::shared_ptr ClientFactory::initClientFromWriteNode() assert(kv_cluster != nullptr); const auto disagg_config = getDisaggConfigFromDisaggWriteNodes(kv_cluster, log); + if (shared_s3_read_metrics_recorder == nullptr) + shared_s3_read_metrics_recorder = std::make_shared(); // update connection fields and leave other fields unchanged config.endpoint = disagg_config.s3_config().endpoint(); config.root = normalizedRoot(disagg_config.s3_config().root()); @@ -393,7 +409,12 @@ std::shared_ptr 
ClientFactory::initClientFromWriteNode() auto [s3_client, vendor] = create(config, log); cloud_vendor = vendor; shared_tiflash_client - = std::make_shared(config.bucket, config.root, std::move(s3_client), shared_s3_read_limiter); + = std::make_shared( + config.bucket, + config.root, + std::move(s3_client), + shared_s3_read_limiter, + shared_s3_read_metrics_recorder); client_is_inited = true; // init finish return shared_tiflash_client; } diff --git a/dbms/src/Storages/S3/S3Common.h b/dbms/src/Storages/S3/S3Common.h index b5e2c21ce4c..9f85bc9f928 100644 --- a/dbms/src/Storages/S3/S3Common.h +++ b/dbms/src/Storages/S3/S3Common.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -43,8 +44,6 @@ extern const int S3_ERROR; namespace DB::S3 { -class S3ReadLimiter; - inline String S3ErrorMessage(const Aws::S3::S3Error & e) { return fmt::format( @@ -74,13 +73,15 @@ class TiFlashS3Client : public Aws::S3::S3Client const Aws::Client::ClientConfiguration & clientConfiguration, Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy signPayloads, bool useVirtualAddressing, - std::shared_ptr s3_read_limiter_ = nullptr); + std::shared_ptr s3_read_limiter_ = nullptr, + std::shared_ptr s3_read_metrics_recorder_ = nullptr); TiFlashS3Client( const String & bucket_name_, const String & root_, std::unique_ptr && raw_client, - std::shared_ptr s3_read_limiter_ = nullptr); + std::shared_ptr s3_read_limiter_ = nullptr, + std::shared_ptr s3_read_metrics_recorder_ = nullptr); const String & bucket() const { return bucket_name; } @@ -108,11 +109,25 @@ class TiFlashS3Client : public Aws::S3::S3Client s3_read_limiter = std::move(limiter); } + std::shared_ptr getS3ReadMetricsRecorder() const + { + std::lock_guard lock(s3_read_metrics_recorder_mutex); + return s3_read_metrics_recorder; + } + + void setS3ReadMetricsRecorder(std::shared_ptr recorder) + { + std::lock_guard lock(s3_read_metrics_recorder_mutex); + s3_read_metrics_recorder = std::move(recorder); + } + 
private: const String bucket_name; String key_root; mutable std::mutex s3_read_limiter_mutex; std::shared_ptr s3_read_limiter; + mutable std::mutex s3_read_metrics_recorder_mutex; + std::shared_ptr s3_read_metrics_recorder; public: LoggerPtr log; @@ -175,6 +190,14 @@ class ClientFactory shared_tiflash_client->setS3ReadLimiter(shared_s3_read_limiter); } + void setS3ReadMetricsRecorder(const std::shared_ptr & recorder) + { + std::unique_lock lock_init(mtx_init); + shared_s3_read_metrics_recorder = recorder; + if (shared_tiflash_client != nullptr) + shared_tiflash_client->setS3ReadMetricsRecorder(shared_s3_read_metrics_recorder); + } + S3GCMethod gc_method = S3GCMethod::Lifecycle; CloudVendor cloud_vendor = CloudVendor::Unknown; @@ -201,6 +224,7 @@ class ClientFactory StorageS3Config config; std::shared_ptr shared_tiflash_client; std::shared_ptr shared_s3_read_limiter; + std::shared_ptr shared_s3_read_metrics_recorder; pingcap::kv::Cluster * kv_cluster = nullptr; LoggerPtr log; diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.cpp b/dbms/src/Storages/S3/S3RandomAccessFile.cpp index 3e60efc1702..34b8a500546 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.cpp +++ b/dbms/src/Storages/S3/S3RandomAccessFile.cpp @@ -76,11 +76,13 @@ S3RandomAccessFile::S3RandomAccessFile( , remote_fname(remote_fname_) , cur_offset(0) , read_limiter(nullptr) + , read_metrics_recorder(nullptr) , log(Logger::get(remote_fname)) , scan_context(scan_context_) { RUNTIME_CHECK(client_ptr != nullptr); read_limiter = client_ptr->getS3ReadLimiter(); + read_metrics_recorder = client_ptr->getS3ReadMetricsRecorder(); initialize("init file"); CurrentMetrics::add(CurrentMetrics::S3RandomAccessFile); } @@ -225,6 +227,8 @@ ssize_t S3RandomAccessFile::finalizeRead( } cur_offset += actual_size; ProfileEvents::increment(ProfileEvents::S3ReadBytes, actual_size); + if (read_metrics_recorder != nullptr) + read_metrics_recorder->recordBytes(actual_size, S3ReadSource::DirectRead); return actual_size; } @@ 
-351,6 +355,8 @@ off_t S3RandomAccessFile::finalizeSeek( elapsed_secs); } ProfileEvents::increment(ProfileEvents::S3ReadBytes, actual_size); + if (read_metrics_recorder != nullptr) + read_metrics_recorder->recordBytes(actual_size, S3ReadSource::DirectRead); cur_offset = target_offset; return cur_offset; } diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.h b/dbms/src/Storages/S3/S3RandomAccessFile.h index 51ccd323412..2a6d5244014 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.h +++ b/dbms/src/Storages/S3/S3RandomAccessFile.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -34,7 +34,6 @@ namespace DB::S3 { class TiFlashS3Client; -class S3ReadLimiter; } // namespace DB::S3 namespace DB::ErrorCodes @@ -119,6 +118,7 @@ class S3RandomAccessFile final : public RandomAccessFile Aws::S3::Model::GetObjectResult read_result; Int64 content_length = 0; std::shared_ptr read_limiter; + std::shared_ptr read_metrics_recorder; DB::LoggerPtr log; bool is_close = false; diff --git a/dbms/src/Storages/S3/S3ReadLimiter.cpp b/dbms/src/Storages/S3/S3ReadLimiter.cpp index da982323d0a..013703cb563 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.cpp +++ b/dbms/src/Storages/S3/S3ReadLimiter.cpp @@ -33,6 +33,22 @@ void recordWaitIfNeeded(bool waited, const Stopwatch & sw, F && observe) } } // namespace +void DB::S3::S3ReadMetricsRecorder::recordBytes(UInt64 bytes, S3ReadSource source) const +{ + if (bytes == 0) + return; + + switch (source) + { + case S3ReadSource::DirectRead: + GET_METRIC(tiflash_storage_io_limiter, type_s3_direct_read_bytes).Increment(bytes); + break; + case S3ReadSource::FileCacheDownload: + GET_METRIC(tiflash_storage_io_limiter, type_s3_filecache_download_bytes).Increment(bytes); + break; + } +} + DB::S3::S3ReadLimiter::S3ReadLimiter(UInt64 max_read_bytes_per_sec_, UInt64 refill_period_ms_) : refill_period_ms(refill_period_ms_) , max_read_bytes_per_sec(max_read_bytes_per_sec_) @@ -63,21 +79,11 @@ void 
DB::S3::S3ReadLimiter::updateConfig(UInt64 max_read_bytes_per_sec_) bytes_cv.notify_all(); } -void DB::S3::S3ReadLimiter::requestBytes(UInt64 bytes, S3ReadSource source) +void DB::S3::S3ReadLimiter::requestBytes(UInt64 bytes, S3ReadSource /*source*/) { if (bytes == 0) return; - switch (source) - { - case S3ReadSource::DirectRead: - GET_METRIC(tiflash_storage_io_limiter, type_s3_direct_read_bytes).Increment(bytes); - break; - case S3ReadSource::FileCacheDownload: - GET_METRIC(tiflash_storage_io_limiter, type_s3_filecache_download_bytes).Increment(bytes); - break; - } - const auto limit = max_read_bytes_per_sec.load(std::memory_order_relaxed); if (limit == 0) return; diff --git a/dbms/src/Storages/S3/S3ReadLimiter.h b/dbms/src/Storages/S3/S3ReadLimiter.h index 72c0a3701c5..bb8db363451 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.h +++ b/dbms/src/Storages/S3/S3ReadLimiter.h @@ -24,12 +24,19 @@ namespace DB::S3 { -enum class S3ReadSource +enum class S3ReadSource : UInt8 { DirectRead, FileCacheDownload, }; +class S3ReadMetricsRecorder +{ +public: + /// Record remote-read bytes regardless of whether byte throttling is enabled. + void recordBytes(UInt64 bytes, S3ReadSource source) const; +}; + class S3ReadLimiter { public: diff --git a/dbms/src/Storages/S3/S3ReadLimiter_fwd.h b/dbms/src/Storages/S3/S3ReadLimiter_fwd.h new file mode 100644 index 00000000000..d9423796a92 --- /dev/null +++ b/dbms/src/Storages/S3/S3ReadLimiter_fwd.h @@ -0,0 +1,25 @@ +// Copyright 2023 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace DB::S3 +{ +enum class S3ReadSource : UInt8; + +class S3ReadLimiter; +class S3ReadMetricsRecorder; +} // namespace DB::S3 From 3b9fcb80199680956f7265ffe1440fcb0c7daac0 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sun, 5 Apr 2026 13:41:05 +0800 Subject: [PATCH 23/36] disagg: add mark and minmax cache eviction APIs --- .../KVStore/FFI/ProxyFFIStatusService.cpp | 93 ++++++ .../KVStore/FFI/ProxyFFIStatusService.h | 15 + .../KVStore/FFI/tests/gtest_status_server.cpp | 78 +++++ ...4-05-mark-and-minmax-cache-eviction-api.md | 305 ++++++++++++++++++ docs/tiflash_http_api.md | 52 +++ 5 files changed, 543 insertions(+) create mode 100644 docs/design/2026-04-05-mark-and-minmax-cache-eviction-api.md diff --git a/dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.cpp b/dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.cpp index 5817d2820dd..c81e6c4a28a 100644 --- a/dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.cpp +++ b/dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.cpp @@ -579,6 +579,97 @@ RemoteCacheEvictRequest parseEvictRequest( return req; } +std::optional parseCacheEvictType( + std::string_view path, + std::string_view api_name, + String & err_msg) +{ + auto trim_path = path.substr(api_name.size()); + if (trim_path == "/mark") + return CacheEvictType::Mark; + if (trim_path == "/minmax") + return CacheEvictType::MinMax; + + err_msg = fmt::format("invalid cache evict request: {}", path); + return std::nullopt; +} + +namespace +{ +String buildCacheEvictOkBody(std::string_view cache_name, std::optional message = std::nullopt) +{ + if (message.has_value()) + return fmt::format(R"json({{"status":"ok","cache":"{}","message":"{}"}})json", cache_name, *message); + return fmt::format(R"json({{"status":"ok","cache":"{}"}})json", cache_name); +} + +void evictLocalCacheOrThrow(Context & global_ctx, CacheEvictType 
cache_type) +{ + switch (cache_type) + { + case CacheEvictType::Mark: + global_ctx.dropMarkCache(); + return; + case CacheEvictType::MinMax: + global_ctx.dropMinMaxIndexCache(); + return; + } + __builtin_unreachable(); +} + +std::string_view cacheTypeName(CacheEvictType cache_type) +{ + switch (cache_type) + { + case CacheEvictType::Mark: + return "mark"; + case CacheEvictType::MinMax: + return "minmax"; + } + __builtin_unreachable(); +} +} // namespace + +HttpRequestRes HandleHttpRequestLocalCacheEvict( + EngineStoreServerWrap * server, + std::string_view path, + const std::string & api_name, + std::string_view, + std::string_view) +{ + auto & global_ctx = server->tmt->getContext(); + auto log = Logger::get("HandleHttpRequestLocalCacheEvict"); + + String err_msg; + auto cache_type = parseCacheEvictType(path, api_name, err_msg); + if (!cache_type.has_value()) + { + auto body = fmt::format(R"json({{"status":"error","message":"{}"}})json", err_msg); + LOG_WARNING(log, "invalid local cache evict request, path={} api_name={}", path, api_name); + return buildRespWithCode(HttpRequestStatus::BadRequest, api_name, std::move(body)); + } + + const auto cache_name = cacheTypeName(*cache_type); + const bool cache_enabled = [&] { + switch (*cache_type) + { + case CacheEvictType::Mark: + return global_ctx.getMarkCache() != nullptr; + case CacheEvictType::MinMax: + return global_ctx.getMinMaxIndexCache() != nullptr; + } + __builtin_unreachable(); + }(); + + // `drop*Cache()` eventually calls `LRUCache::reset()`, which clears the registry under the + // cache mutex while leaving already-held `shared_ptr` values valid for in-flight readers. + evictLocalCacheOrThrow(global_ctx, *cache_type); + LOG_INFO(log, "manual cache eviction, action=evict cache={} result={}", cache_name, cache_enabled ? "ok" : "noop"); + return buildOkResp( + api_name, + cache_enabled ? 
buildCacheEvictOkBody(cache_name) : buildCacheEvictOkBody(cache_name, "cache not enabled")); +} + HttpRequestRes HandleHttpRequestRemoteCacheEvict( EngineStoreServerWrap * server, std::string_view path, @@ -817,6 +908,7 @@ using HANDLE_HTTP_URI_METHOD = HttpRequestRes (*)( std::string_view); // A registry of available HTTP URI prefix (API name) and their handler methods. +// Keep `docs/tiflash_http_api.md` in sync whenever adding or changing a public HTTP API here. static const std::map AVAILABLE_HTTP_URI = { {"/tiflash/sync-status/", HandleHttpRequestSyncStatus}, {"/tiflash/sync-region/", HandleHttpRequestSyncRegion}, @@ -836,6 +928,7 @@ static const std::map AVAILABLE_HTTP_URI = {"/tiflash/remote/gc", HandleHttpRequestRemoteGC}, {"/tiflash/remote/upload", HandleHttpRequestRemoteReUpload}, {"/tiflash/remote/info", HandleHttpRequestRemoteInfo}, + {"/tiflash/cache/evict", HandleHttpRequestLocalCacheEvict}, {"/tiflash/remote/cache/evict", HandleHttpRequestRemoteCacheEvict}, {"/tiflash/remote/cache/info", HandleHttpRequestRemoteCacheInfo}, }; diff --git a/dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.h b/dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.h index 72c00a846dc..8d0851c9ff6 100644 --- a/dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.h +++ b/dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.h @@ -16,6 +16,8 @@ #include +#include + namespace DB { @@ -28,6 +30,12 @@ enum class EvictMethod ByEvictSize, }; +enum class CacheEvictType +{ + Mark = 0, + MinMax, +}; + struct RemoteCacheEvictRequest { EvictMethod evict_method; @@ -40,6 +48,13 @@ struct RemoteCacheEvictRequest RemoteCacheEvictRequest parseEvictRequest(std::string_view path, std::string_view api_name, std::string_view query); +/// Parse `/tiflash/cache/evict/` and resolve the target node-local cache type. +/// Returns `std::nullopt` and sets `err_msg` when the path suffix is invalid. 
+std::optional parseCacheEvictType( + std::string_view path, + std::string_view api_name, + String & err_msg); + std::tuple, String> parseStoreIds(std::string_view path); } // namespace DB diff --git a/dbms/src/Storages/KVStore/FFI/tests/gtest_status_server.cpp b/dbms/src/Storages/KVStore/FFI/tests/gtest_status_server.cpp index 84586a59629..883e9972d4b 100644 --- a/dbms/src/Storages/KVStore/FFI/tests/gtest_status_server.cpp +++ b/dbms/src/Storages/KVStore/FFI/tests/gtest_status_server.cpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -32,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -588,4 +591,79 @@ TEST_F(StatusServerTest, TestParseRemoteCacheEvictRequest) } } +TEST_F(StatusServerTest, TestParseLocalCacheEvictType) +{ + String err_msg; + + auto cache_type = parseCacheEvictType("/tiflash/cache/evict/mark", "/tiflash/cache/evict", err_msg); + ASSERT_TRUE(cache_type.has_value()); + ASSERT_EQ(*cache_type, CacheEvictType::Mark); + ASSERT_TRUE(err_msg.empty()); + + err_msg.clear(); + cache_type = parseCacheEvictType("/tiflash/cache/evict/minmax", "/tiflash/cache/evict", err_msg); + ASSERT_TRUE(cache_type.has_value()); + ASSERT_EQ(*cache_type, CacheEvictType::MinMax); + ASSERT_TRUE(err_msg.empty()); + + err_msg.clear(); + cache_type = parseCacheEvictType("/tiflash/cache/evict/unknown", "/tiflash/cache/evict", err_msg); + ASSERT_FALSE(cache_type.has_value()); + ASSERT_FALSE(err_msg.empty()); +} + +TEST_F(StatusServerTest, TestLocalCacheEvict) +{ + auto ctx = TiFlashTestEnv::getContext(); + EngineStoreServerWrap store_server_wrap{}; + store_server_wrap.tmt = &ctx->getTMTContext(); + auto helper = GetEngineStoreServerHelper(&store_server_wrap); + + if (!ctx->getMarkCache()) + ctx->setMarkCache(1024 * 1024); + if (!ctx->getMinMaxIndexCache()) + ctx->setMinMaxIndexCache(1024 * 1024); + + { + auto mark_cache = ctx->getMarkCache(); + ASSERT_NE(mark_cache, nullptr); + auto marks = 
std::make_shared(); + marks->push_back(MarkInCompressedFile{1, 2}); + mark_cache->set("mark-key", marks); + ASSERT_EQ(mark_cache->count(), 1); + + String path = "/tiflash/cache/evict/mark"; + auto res = helper.fn_handle_http_request( + &store_server_wrap, + BaseBuffView{path.data(), path.length()}, + BaseBuffView{"", 0}, + BaseBuffView{"", 0}); + EXPECT_EQ(res.status, HttpRequestStatus::Ok); + EXPECT_EQ(std::string_view(res.res.view.data, res.res.view.len), R"json({"status":"ok","cache":"mark"})json"); + EXPECT_EQ(mark_cache->count(), 0); + releaseResp(helper, std::move(res)); + } + + { + auto minmax_cache = ctx->getMinMaxIndexCache(); + ASSERT_NE(minmax_cache, nullptr); + auto index = std::make_shared(DataTypeInt64()); + minmax_cache->set("minmax-key", index); + ASSERT_EQ(minmax_cache->count(), 1); + + String path = "/tiflash/cache/evict/minmax"; + auto res = helper.fn_handle_http_request( + &store_server_wrap, + BaseBuffView{path.data(), path.length()}, + BaseBuffView{"", 0}, + BaseBuffView{"", 0}); + EXPECT_EQ(res.status, HttpRequestStatus::Ok); + EXPECT_EQ( + std::string_view(res.res.view.data, res.res.view.len), + R"json({"status":"ok","cache":"minmax"})json"); + EXPECT_EQ(minmax_cache->count(), 0); + releaseResp(helper, std::move(res)); + } +} + } // namespace DB::tests diff --git a/docs/design/2026-04-05-mark-and-minmax-cache-eviction-api.md b/docs/design/2026-04-05-mark-and-minmax-cache-eviction-api.md new file mode 100644 index 00000000000..7622b15d75d --- /dev/null +++ b/docs/design/2026-04-05-mark-and-minmax-cache-eviction-api.md @@ -0,0 +1,305 @@ +# Mark and MinMax Cache Eviction HTTP API + +Purpose: propose a small operational API that clears the node-local `MarkCache` +and `MinMaxIndexCache` on demand, primarily for controlled experiments, +diagnostics, and cache-state reset during disaggregated read investigations. 
+ +Date: 2026-04-05 + +## Summary + +Add two TiFlash HTTP endpoints under `ProxyFFIStatusService`: + +- `/tiflash/cache/evict/mark` +- `/tiflash/cache/evict/minmax` + +Each endpoint clears one node-local cache by calling the existing `Context` +cache-drop methods: + +- `Context::dropMarkCache()` +- `Context::dropMinMaxIndexCache()` + +The proposal intentionally keeps the first version narrow: + +- node-local scope only +- no table-level or key-level eviction +- no extra query parameters +- no new cache implementation or refactor + +This is the simplest production-safe design because both caches are already +owned by `Context`, already support `reset()`, and the existing `LRUCache` +semantics are compatible with concurrent readers that still hold cached values. + +## Context + +### Current state + +TiFlash already exposes several operational HTTP endpoints from +`ProxyFFIStatusService`, including readiness probes and remote cache eviction: + +- `dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.cpp:839` + +The service has direct access to the global TiFlash context through +`EngineStoreServerWrap` / `TMTContext`, so it is a natural location for +small node-local management actions. + +The two caches in scope are already globally owned by `Context`: + +- `MarkCache` + - definition: `dbms/src/Storages/MarkCache.h:64` + - `Context` API: `dbms/src/Interpreters/Context.h:394` + - getter implementation: `dbms/src/Interpreters/Context.cpp:1208` + - drop implementation: `dbms/src/Interpreters/Context.cpp:1215` +- `MinMaxIndexCache` + - definition: `dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.h:158` + - `Context` API: `dbms/src/Interpreters/Context.h:398` + - getter implementation: `dbms/src/Interpreters/Context.cpp:1233` + - drop implementation: `dbms/src/Interpreters/Context.cpp:1239` + +Both drop methods already reset the corresponding cache instance rather than +destroying and recreating the whole cache object. 
+ +### Why this API is needed + +Remote-cache eviction alone is not sufficient for disaggregated read +experiments. Mark and min-max metadata may stay warm even after remote cache is +cleared, which makes it difficult to reproduce a fully cold metadata path. + +Today there is no dedicated operational endpoint to clear these caches without +restarting TiFlash or relying on more invasive test-only machinery. + +### Verified cache reset semantics + +The proposal depends on `LRUCache::reset()` being safe while concurrent queries +still hold cached values. + +Relevant implementation details: + +- cached values are stored as `std::shared_ptr` + - `dbms/src/Common/LRUCache.h:52` +- `reset()` clears internal registry state under the cache mutex + - `dbms/src/Common/LRUCache.h:186` +- `getOrSet()` already explicitly handles concurrent `reset()` on + `insert_tokens` + - `dbms/src/Common/LRUCache.h:138` + +This means: + +- in-flight readers that already obtained a cached value keep owning their + `shared_ptr` +- new readers observe a cold-cache view after reset +- internal cache structures remain synchronized by the same mutex used for + `get`, `set`, `getOrSet`, `remove`, and `reset` + +Therefore node-local cache eviction is compatible with concurrent query +execution and does not require stopping traffic first. + +## Goals + +- Provide a simple node-local API to clear `MarkCache` on demand. +- Provide a simple node-local API to clear `MinMaxIndexCache` on demand. +- Keep the semantics safe for concurrent readers that already hold cached + values. +- Reuse existing `Context` and cache-reset logic instead of inventing a new + eviction mechanism. +- Make the API shape extensible for future cache types. + +## Non-Goals + +- No table-level, region-level, or key-level eviction. +- No cluster-wide fan-out or cross-node coordination. +- No new cache implementation, ownership redesign, or lifecycle refactor. +- No authentication or authorization redesign in this proposal. 
+- No automatic background eviction or policy-based eviction changes. +- No attempt to preserve internal cache hit/miss statistics across reset. + +## Compatibility and Invariants + +- The API is node-local and only affects the TiFlash instance that receives the + HTTP request. +- Existing queries that already obtained cached values continue using those + values safely. +- Queries that look up the cache after eviction observe a miss and rebuild the + cache entry through the normal read path. +- The API must not destroy or replace `Context` ownership of the cache object; + it should only call the existing drop/reset path. +- The API returns success even if the target cache is not enabled; in that + case the operation is a no-op. + +## Design + +### API shape + +Add two new endpoints: + +- `/tiflash/cache/evict/mark` +- `/tiflash/cache/evict/minmax` + +The `cache/evict/` pattern is preferred over ad-hoc endpoint names +because it leaves a clean extension path for future node-local cache eviction +APIs such as `remote`, `column`, or `vector`. + +### Request and response behavior + +Requests are simple HTTP calls with no body and no query parameters. + +Suggested response body for the first version: + +```json +{"status":"ok","cache":"mark"} +``` + +and: + +```json +{"status":"ok","cache":"minmax"} +``` + +If the target cache is not enabled, still return success with a small message, +for example: + +```json +{"status":"ok","cache":"mark","message":"cache not enabled"} +``` + +This keeps the API operationally convenient and idempotent. + +### Logging and response conventions + +The first version should emit an INFO audit log for every manual cache-evict +request. The log should at least include: + +- action: `evict` +- cache: `mark` or `minmax` +- result: `ok`, `noop`, or `error` + +This is sufficient for operational traceability and experiment timeline +correlation. The first version does not need a dedicated Prometheus metric for +manual cache eviction. 
+ +Response bodies should use a consistent small JSON shape such as: + +```json +{"status":"ok","cache":"mark"} +``` + +or: + +```json +{"status":"ok","cache":"minmax","message":"cache not enabled"} +``` + +The proposal intentionally does not introduce a shared helper for cache-evict +responses yet. With only two endpoints, keeping the JSON shape consistent in +the handlers is simpler than adding a new abstraction layer. + +### Handler implementation + +Implement the new handlers in `ProxyFFIStatusService.cpp` and route them from +the same path-dispatch table that already owns remote-cache eviction. + +The handler logic should reuse the existing `Context` methods directly: + +- `global_ctx.dropMarkCache()` +- `global_ctx.dropMinMaxIndexCache()` + +This is preferable to calling `reset()` on the cache object directly because it +keeps all cache-management logic behind the `Context` API boundary. + +### Execution model + +The operation is synchronous and lightweight: + +1. parse the path +2. resolve the cache type +3. obtain `Context` from `server->tmt` +4. call the corresponding `drop*Cache()` method +5. return a small success response + +No asynchronous job or background task is required. + +### ASCII flow + +```text +HTTP request + -> ProxyFFIStatusService + -> parse /tiflash/cache/evict/ + -> resolve global Context + -> Context::dropMarkCache() or dropMinMaxIndexCache() + -> LRUCache::reset() + -> clear cache registry under mutex + -> preserve already-held shared_ptr values in running queries + -> return {"status":"ok", ...} +``` + +## Alternatives Considered + +### 1. Add per-table or per-key eviction + +Rejected for the first version. + +That would require new cache key parsing and more precise ownership semantics, +while the immediate operational need is only to force a node-local cold path. + +### 2. Recreate the cache object instead of resetting it + +Rejected. 
+ +`Context` already exposes `dropMarkCache()` and `dropMinMaxIndexCache()` with + the current reset semantics. Replacing the object instance adds risk without +clear benefit. + +### 3. Put the API somewhere outside `ProxyFFIStatusService` + +Rejected. + +`ProxyFFIStatusService` already hosts similar operational endpoints and already +has the right access to `Context` and `TMTContext`. + +## Incremental Plan + +1. Add two new handlers in `ProxyFFIStatusService.cpp`. +2. Register both endpoints in the existing route table. +3. Return small JSON success responses. +4. Add unit tests for path dispatch and behavior when the cache is enabled or + absent. +5. Validate manually on a dev cluster by evicting the cache and confirming the + next query rebuilds the corresponding cache entries. + +## Validation Strategy + +### Unit tests + +- request path dispatch resolves `/tiflash/cache/evict/mark` +- request path dispatch resolves `/tiflash/cache/evict/minmax` +- handler returns success when the cache exists +- handler returns success when the cache is absent +- repeated calls remain safe and idempotent + +### Manual validation + +Suggested manual checks: + +1. Warm `MarkCache` / `MinMaxIndexCache` with one query. +2. Confirm cache bytes / files through existing asynchronous metrics. +3. Call the new eviction endpoint. +4. Confirm cache size drops to zero or near zero. +5. Re-run the same query and confirm cache entries are rebuilt. + +## Risks and Mitigations + +- **Risk:** cache reset causes unexpected failures for in-flight queries. + - **Mitigation:** current `LRUCache` implementation stores values as + `shared_ptr` and already tolerates concurrent `reset()` during `getOrSet()`. +- **Risk:** operators use the API in production at the wrong time and cause + avoidable cold-path latency spikes. + - **Mitigation:** document the API as an operational/debug endpoint rather + than a routine user-facing action. 
+- **Risk:** internal hit/miss counters reset together with cache state. + - **Mitigation:** document that cache reset also clears internal cache stats. + +## Open Questions + +- If more node-local cache-evict endpoints are added later, at what point does + it become worthwhile to introduce a shared response/log helper instead of + keeping the format duplicated in a few handlers? diff --git a/docs/tiflash_http_api.md b/docs/tiflash_http_api.md index a618f6e4f29..119e6221049 100644 --- a/docs/tiflash_http_api.md +++ b/docs/tiflash_http_api.md @@ -292,3 +292,55 @@ curl "http://${TIFLASH_IP}:${TIFLASH_STATUS_PORT}/tiflash/remote/cache/evict/typ "released_size":"21380742440" } ``` + +## Evict the mark cache on one TiFlash node + +```bash +curl "http://${TIFLASH_IP}:${TIFLASH_STATUS_PORT}/tiflash/cache/evict/mark" +``` + +### Response + +```json +{ + "status": "ok", + "cache": "mark" +} +``` + +If the mark cache is not enabled on the target node, the request is still +treated as a successful no-op: + +```json +{ + "status": "ok", + "cache": "mark", + "message": "cache not enabled" +} +``` + +## Evict the minmax index cache on one TiFlash node + +```bash +curl "http://${TIFLASH_IP}:${TIFLASH_STATUS_PORT}/tiflash/cache/evict/minmax" +``` + +### Response + +```json +{ + "status": "ok", + "cache": "minmax" +} +``` + +If the minmax index cache is not enabled on the target node, the request is +still treated as a successful no-op: + +```json +{ + "status": "ok", + "cache": "minmax", + "message": "cache not enabled" +} +``` From da96fff00b28f76fc445851a6ef5f956cfec9c96 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sun, 5 Apr 2026 14:04:20 +0800 Subject: [PATCH 24/36] Remove unnecessary design doc Signed-off-by: JaySon-Huang --- ...4-05-mark-and-minmax-cache-eviction-api.md | 305 ------------------ 1 file changed, 305 deletions(-) delete mode 100644 docs/design/2026-04-05-mark-and-minmax-cache-eviction-api.md diff --git 
a/docs/design/2026-04-05-mark-and-minmax-cache-eviction-api.md b/docs/design/2026-04-05-mark-and-minmax-cache-eviction-api.md deleted file mode 100644 index 7622b15d75d..00000000000 --- a/docs/design/2026-04-05-mark-and-minmax-cache-eviction-api.md +++ /dev/null @@ -1,305 +0,0 @@ -# Mark and MinMax Cache Eviction HTTP API - -Purpose: propose a small operational API that clears the node-local `MarkCache` -and `MinMaxIndexCache` on demand, primarily for controlled experiments, -diagnostics, and cache-state reset during disaggregated read investigations. - -Date: 2026-04-05 - -## Summary - -Add two TiFlash HTTP endpoints under `ProxyFFIStatusService`: - -- `/tiflash/cache/evict/mark` -- `/tiflash/cache/evict/minmax` - -Each endpoint clears one node-local cache by calling the existing `Context` -cache-drop methods: - -- `Context::dropMarkCache()` -- `Context::dropMinMaxIndexCache()` - -The proposal intentionally keeps the first version narrow: - -- node-local scope only -- no table-level or key-level eviction -- no extra query parameters -- no new cache implementation or refactor - -This is the simplest production-safe design because both caches are already -owned by `Context`, already support `reset()`, and the existing `LRUCache` -semantics are compatible with concurrent readers that still hold cached values. - -## Context - -### Current state - -TiFlash already exposes several operational HTTP endpoints from -`ProxyFFIStatusService`, including readiness probes and remote cache eviction: - -- `dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.cpp:839` - -The service has direct access to the global TiFlash context through -`EngineStoreServerWrap` / `TMTContext`, so it is a natural location for -small node-local management actions. 
- -The two caches in scope are already globally owned by `Context`: - -- `MarkCache` - - definition: `dbms/src/Storages/MarkCache.h:64` - - `Context` API: `dbms/src/Interpreters/Context.h:394` - - getter implementation: `dbms/src/Interpreters/Context.cpp:1208` - - drop implementation: `dbms/src/Interpreters/Context.cpp:1215` -- `MinMaxIndexCache` - - definition: `dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.h:158` - - `Context` API: `dbms/src/Interpreters/Context.h:398` - - getter implementation: `dbms/src/Interpreters/Context.cpp:1233` - - drop implementation: `dbms/src/Interpreters/Context.cpp:1239` - -Both drop methods already reset the corresponding cache instance rather than -destroying and recreating the whole cache object. - -### Why this API is needed - -Remote-cache eviction alone is not sufficient for disaggregated read -experiments. Mark and min-max metadata may stay warm even after remote cache is -cleared, which makes it difficult to reproduce a fully cold metadata path. - -Today there is no dedicated operational endpoint to clear these caches without -restarting TiFlash or relying on more invasive test-only machinery. - -### Verified cache reset semantics - -The proposal depends on `LRUCache::reset()` being safe while concurrent queries -still hold cached values. 
- -Relevant implementation details: - -- cached values are stored as `std::shared_ptr` - - `dbms/src/Common/LRUCache.h:52` -- `reset()` clears internal registry state under the cache mutex - - `dbms/src/Common/LRUCache.h:186` -- `getOrSet()` already explicitly handles concurrent `reset()` on - `insert_tokens` - - `dbms/src/Common/LRUCache.h:138` - -This means: - -- in-flight readers that already obtained a cached value keep owning their - `shared_ptr` -- new readers observe a cold-cache view after reset -- internal cache structures remain synchronized by the same mutex used for - `get`, `set`, `getOrSet`, `remove`, and `reset` - -Therefore node-local cache eviction is compatible with concurrent query -execution and does not require stopping traffic first. - -## Goals - -- Provide a simple node-local API to clear `MarkCache` on demand. -- Provide a simple node-local API to clear `MinMaxIndexCache` on demand. -- Keep the semantics safe for concurrent readers that already hold cached - values. -- Reuse existing `Context` and cache-reset logic instead of inventing a new - eviction mechanism. -- Make the API shape extensible for future cache types. - -## Non-Goals - -- No table-level, region-level, or key-level eviction. -- No cluster-wide fan-out or cross-node coordination. -- No new cache implementation, ownership redesign, or lifecycle refactor. -- No authentication or authorization redesign in this proposal. -- No automatic background eviction or policy-based eviction changes. -- No attempt to preserve internal cache hit/miss statistics across reset. - -## Compatibility and Invariants - -- The API is node-local and only affects the TiFlash instance that receives the - HTTP request. -- Existing queries that already obtained cached values continue using those - values safely. -- Queries that look up the cache after eviction observe a miss and rebuild the - cache entry through the normal read path. 
-- The API must not destroy or replace `Context` ownership of the cache object; - it should only call the existing drop/reset path. -- The API returns success even if the target cache is not enabled; in that - case the operation is a no-op. - -## Design - -### API shape - -Add two new endpoints: - -- `/tiflash/cache/evict/mark` -- `/tiflash/cache/evict/minmax` - -The `cache/evict/` pattern is preferred over ad-hoc endpoint names -because it leaves a clean extension path for future node-local cache eviction -APIs such as `remote`, `column`, or `vector`. - -### Request and response behavior - -Requests are simple HTTP calls with no body and no query parameters. - -Suggested response body for the first version: - -```json -{"status":"ok","cache":"mark"} -``` - -and: - -```json -{"status":"ok","cache":"minmax"} -``` - -If the target cache is not enabled, still return success with a small message, -for example: - -```json -{"status":"ok","cache":"mark","message":"cache not enabled"} -``` - -This keeps the API operationally convenient and idempotent. - -### Logging and response conventions - -The first version should emit an INFO audit log for every manual cache-evict -request. The log should at least include: - -- action: `evict` -- cache: `mark` or `minmax` -- result: `ok`, `noop`, or `error` - -This is sufficient for operational traceability and experiment timeline -correlation. The first version does not need a dedicated Prometheus metric for -manual cache eviction. - -Response bodies should use a consistent small JSON shape such as: - -```json -{"status":"ok","cache":"mark"} -``` - -or: - -```json -{"status":"ok","cache":"minmax","message":"cache not enabled"} -``` - -The proposal intentionally does not introduce a shared helper for cache-evict -responses yet. With only two endpoints, keeping the JSON shape consistent in -the handlers is simpler than adding a new abstraction layer. 
- -### Handler implementation - -Implement the new handlers in `ProxyFFIStatusService.cpp` and route them from -the same path-dispatch table that already owns remote-cache eviction. - -The handler logic should reuse the existing `Context` methods directly: - -- `global_ctx.dropMarkCache()` -- `global_ctx.dropMinMaxIndexCache()` - -This is preferable to calling `reset()` on the cache object directly because it -keeps all cache-management logic behind the `Context` API boundary. - -### Execution model - -The operation is synchronous and lightweight: - -1. parse the path -2. resolve the cache type -3. obtain `Context` from `server->tmt` -4. call the corresponding `drop*Cache()` method -5. return a small success response - -No asynchronous job or background task is required. - -### ASCII flow - -```text -HTTP request - -> ProxyFFIStatusService - -> parse /tiflash/cache/evict/ - -> resolve global Context - -> Context::dropMarkCache() or dropMinMaxIndexCache() - -> LRUCache::reset() - -> clear cache registry under mutex - -> preserve already-held shared_ptr values in running queries - -> return {"status":"ok", ...} -``` - -## Alternatives Considered - -### 1. Add per-table or per-key eviction - -Rejected for the first version. - -That would require new cache key parsing and more precise ownership semantics, -while the immediate operational need is only to force a node-local cold path. - -### 2. Recreate the cache object instead of resetting it - -Rejected. - -`Context` already exposes `dropMarkCache()` and `dropMinMaxIndexCache()` with - the current reset semantics. Replacing the object instance adds risk without -clear benefit. - -### 3. Put the API somewhere outside `ProxyFFIStatusService` - -Rejected. - -`ProxyFFIStatusService` already hosts similar operational endpoints and already -has the right access to `Context` and `TMTContext`. - -## Incremental Plan - -1. Add two new handlers in `ProxyFFIStatusService.cpp`. -2. 
Register both endpoints in the existing route table. -3. Return small JSON success responses. -4. Add unit tests for path dispatch and behavior when the cache is enabled or - absent. -5. Validate manually on a dev cluster by evicting the cache and confirming the - next query rebuilds the corresponding cache entries. - -## Validation Strategy - -### Unit tests - -- request path dispatch resolves `/tiflash/cache/evict/mark` -- request path dispatch resolves `/tiflash/cache/evict/minmax` -- handler returns success when the cache exists -- handler returns success when the cache is absent -- repeated calls remain safe and idempotent - -### Manual validation - -Suggested manual checks: - -1. Warm `MarkCache` / `MinMaxIndexCache` with one query. -2. Confirm cache bytes / files through existing asynchronous metrics. -3. Call the new eviction endpoint. -4. Confirm cache size drops to zero or near zero. -5. Re-run the same query and confirm cache entries are rebuilt. - -## Risks and Mitigations - -- **Risk:** cache reset causes unexpected failures for in-flight queries. - - **Mitigation:** current `LRUCache` implementation stores values as - `shared_ptr` and already tolerates concurrent `reset()` during `getOrSet()`. -- **Risk:** operators use the API in production at the wrong time and cause - avoidable cold-path latency spikes. - - **Mitigation:** document the API as an operational/debug endpoint rather - than a routine user-facing action. -- **Risk:** internal hit/miss counters reset together with cache state. - - **Mitigation:** document that cache reset also clears internal cache stats. - -## Open Questions - -- If more node-local cache-evict endpoints are added later, at what point does - it become worthwhile to introduce a shared response/log helper instead of - keeping the format duplicated in a few handlers? 
From ce2672de418798a6263c0f1ecdf6bb5c70d76ae9 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sun, 5 Apr 2026 20:31:36 +0800 Subject: [PATCH 25/36] disagg: address cache eviction and filecache review notes --- dbms/src/IO/Checksum/ChecksumBuffer.cpp | 21 +++++++++- dbms/src/Interpreters/Context.cpp | 18 +++++++++ dbms/src/Interpreters/Context.h | 4 ++ .../Storages/DeltaMerge/File/DMFileReader.cpp | 28 ++++++++++++-- .../KVStore/FFI/ProxyFFIStatusService.cpp | 38 ++++++++----------- .../KVStore/FFI/ProxyFFIStatusService.h | 5 +-- .../KVStore/FFI/tests/gtest_status_server.cpp | 12 +++--- dbms/src/Storages/S3/FileCache.cpp | 2 +- dbms/src/Storages/S3/S3Common.cpp | 30 +++++++-------- dbms/src/Storages/S3/S3RandomAccessFile.cpp | 8 +++- .../src/Storages/S3/tests/gtest_filecache.cpp | 14 +++++-- 11 files changed, 122 insertions(+), 58 deletions(-) diff --git a/dbms/src/IO/Checksum/ChecksumBuffer.cpp b/dbms/src/IO/Checksum/ChecksumBuffer.cpp index a8aa4e4b4da..99f0fad9fd2 100644 --- a/dbms/src/IO/Checksum/ChecksumBuffer.cpp +++ b/dbms/src/IO/Checksum/ChecksumBuffer.cpp @@ -13,6 +13,7 @@ // limitations under the License. 
#include +#include namespace DB { @@ -53,10 +54,26 @@ off_t FramedChecksumReadBuffer::doSeek(off_t offset, int whence) auto result = in->seek(static_cast(header_offset), SEEK_SET); if (result < 0) { + LOG_WARNING( + Logger::get("FramedChecksumReadBuffer"), + "failed to seek underlying reader while loading checksum frame, file={} whence={} target_frame={} " + "target_offset={} header_offset={} underlying_seek_ret={}", + in->getFileName(), + whence, + target_frame, + target_offset, + header_offset, + result); throw TiFlashException( Errors::Checksum::IOFailure, - "checksum framed file {} is not seekable", - in->getFileName()); + "failed to seek checksum framed file {} to frame boundary: underlying reader returned {} for " + "header_offset={} target_frame={} target_offset={} whence={}", + in->getFileName(), + result, + header_offset, + target_frame, + target_offset, + whence); } auto length = expectRead( working_buffer.begin() - sizeof(ChecksumFrame), diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index aa918cc192e..0cf47ad9123 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ -1219,6 +1219,15 @@ void Context::dropMarkCache() const shared->mark_cache->reset(); } +bool Context::dropMarkCacheAndReport() const +{ + auto lock = getLock(); + if (shared->mark_cache == nullptr) + return false; + shared->mark_cache->reset(); + return true; +} + void Context::setMinMaxIndexCache(size_t cache_size_in_bytes) { @@ -1243,6 +1252,15 @@ void Context::dropMinMaxIndexCache() const shared->minmax_index_cache->reset(); } +bool Context::dropMinMaxIndexCacheAndReport() const +{ + auto lock = getLock(); + if (shared->minmax_index_cache == nullptr) + return false; + shared->minmax_index_cache->reset(); + return true; +} + void Context::setLocalIndexCache(size_t light_local_index_cache, size_t heavy_cache_entities) { auto lock = getLock(); diff --git a/dbms/src/Interpreters/Context.h b/dbms/src/Interpreters/Context.h 
index b978ab66bdd..179f72763cc 100644 --- a/dbms/src/Interpreters/Context.h +++ b/dbms/src/Interpreters/Context.h @@ -394,10 +394,14 @@ class Context void setMarkCache(size_t cache_size_in_bytes); std::shared_ptr getMarkCache() const; void dropMarkCache() const; + /// Reset MarkCache and report whether it was enabled before the reset. + bool dropMarkCacheAndReport() const; void setMinMaxIndexCache(size_t cache_size_in_bytes); std::shared_ptr getMinMaxIndexCache() const; void dropMinMaxIndexCache() const; + /// Reset MinMaxIndexCache and report whether it was enabled before the reset. + bool dropMinMaxIndexCacheAndReport() const; void setLocalIndexCache(size_t light_local_index_cache, size_t heavy_cache_entities); std::shared_ptr getLightLocalIndexCache() const; diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileReader.cpp b/dbms/src/Storages/DeltaMerge/File/DMFileReader.cpp index 91f18c8836a..62ae9a3d34a 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileReader.cpp +++ b/dbms/src/Storages/DeltaMerge/File/DMFileReader.cpp @@ -544,9 +544,31 @@ ColumnPtr DMFileReader::readFromDisk( [&](const IDataType::SubstreamPath & substream_path) { const auto substream_name = DMFile::getFileNameBase(cd.id, substream_path); auto & sub_stream = column_streams.at(substream_name); - sub_stream->buf->seek( - sub_stream->getOffsetInFile(start_pack_id), - sub_stream->getOffsetInDecompressedBlock(start_pack_id)); + const auto offset_in_file = sub_stream->getOffsetInFile(start_pack_id); + const auto offset_in_decompressed_block = sub_stream->getOffsetInDecompressedBlock(start_pack_id); + try + { + sub_stream->buf->seek(offset_in_file, offset_in_decompressed_block); + } + catch (...) 
+ { + tryLogCurrentWarningException( + log, + fmt::format( + "DMFile substream seek failed, dmfile={} column_id={} type_on_disk={} stream_name={} " + "substream_name={} start_pack_id={} read_rows={} offset_in_file={} " + "offset_in_decompressed_block={}", + path(), + cd.id, + type_on_disk->getName(), + stream_name, + substream_name, + start_pack_id, + read_rows, + offset_in_file, + offset_in_decompressed_block)); + throw; + } return sub_stream->buf.get(); }, read_rows, diff --git a/dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.cpp b/dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.cpp index c81e6c4a28a..e736f7da3bc 100644 --- a/dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.cpp +++ b/dbms/src/Storages/KVStore/FFI/ProxyFFIStatusService.cpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -36,6 +38,7 @@ #include #include #include +#include #include namespace DB @@ -579,10 +582,7 @@ RemoteCacheEvictRequest parseEvictRequest( return req; } -std::optional parseCacheEvictType( - std::string_view path, - std::string_view api_name, - String & err_msg) +std::optional parseCacheEvictType(std::string_view path, std::string_view api_name, String & err_msg) { auto trim_path = path.substr(api_name.size()); if (trim_path == "/mark") @@ -603,16 +603,21 @@ String buildCacheEvictOkBody(std::string_view cache_name, std::optional` and resolve the target node-local cache type. /// Returns `std::nullopt` and sets `err_msg` when the path suffix is invalid. 
-std::optional parseCacheEvictType( - std::string_view path, - std::string_view api_name, - String & err_msg); +std::optional parseCacheEvictType(std::string_view path, std::string_view api_name, String & err_msg); std::tuple, String> parseStoreIds(std::string_view path); diff --git a/dbms/src/Storages/KVStore/FFI/tests/gtest_status_server.cpp b/dbms/src/Storages/KVStore/FFI/tests/gtest_status_server.cpp index 883e9972d4b..3760ce2a7ff 100644 --- a/dbms/src/Storages/KVStore/FFI/tests/gtest_status_server.cpp +++ b/dbms/src/Storages/KVStore/FFI/tests/gtest_status_server.cpp @@ -14,10 +14,10 @@ #include #include -#include -#include #include #include +#include +#include #include #include #include @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -34,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -627,6 +627,7 @@ TEST_F(StatusServerTest, TestLocalCacheEvict) { auto mark_cache = ctx->getMarkCache(); ASSERT_NE(mark_cache, nullptr); + mark_cache->reset(); auto marks = std::make_shared(); marks->push_back(MarkInCompressedFile{1, 2}); mark_cache->set("mark-key", marks); @@ -647,6 +648,7 @@ TEST_F(StatusServerTest, TestLocalCacheEvict) { auto minmax_cache = ctx->getMinMaxIndexCache(); ASSERT_NE(minmax_cache, nullptr); + minmax_cache->reset(); auto index = std::make_shared(DataTypeInt64()); minmax_cache->set("minmax-key", index); ASSERT_EQ(minmax_cache->count(), 1); @@ -658,9 +660,7 @@ TEST_F(StatusServerTest, TestLocalCacheEvict) BaseBuffView{"", 0}, BaseBuffView{"", 0}); EXPECT_EQ(res.status, HttpRequestStatus::Ok); - EXPECT_EQ( - std::string_view(res.res.view.data, res.res.view.len), - R"json({"status":"ok","cache":"minmax"})json"); + EXPECT_EQ(std::string_view(res.res.view.data, res.res.view.len), R"json({"status":"ok","cache":"minmax"})json"); EXPECT_EQ(minmax_cache->count(), 0); releaseResp(helper, std::move(res)); } diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 
526e257ac79..62d1ee41698 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -119,7 +119,7 @@ class ReadBufferFromIStreamWithLimiter : public BufferWithOwnMemory { if (istr.eof()) return false; - throw Exception("Cannot read from istream", ErrorCodes::CANNOT_READ_FROM_ISTREAM); + throw Exception(ErrorCodes::CANNOT_READ_FROM_ISTREAM, "Cannot read from istream"); } working_buffer.resize(gcount); diff --git a/dbms/src/Storages/S3/S3Common.cpp b/dbms/src/Storages/S3/S3Common.cpp index 8c46828b56e..cd6a559d0a3 100644 --- a/dbms/src/Storages/S3/S3Common.cpp +++ b/dbms/src/Storages/S3/S3Common.cpp @@ -34,8 +34,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -369,14 +369,13 @@ void ClientFactory::init(const StorageS3Config & config_, bool mock_s3_) Aws::Client::ClientConfiguration cfg(true, /*defaultMode=*/"standard", /*shouldDisableIMDS=*/true); cfg.region = Aws::Region::US_EAST_1; // default region Aws::Auth::AWSCredentials cred("mock_access_key", "mock_secret_key"); - shared_tiflash_client - = std::make_unique( - config.bucket, - config.root, - cred, - cfg, - shared_s3_read_limiter, - shared_s3_read_metrics_recorder); + shared_tiflash_client = std::make_unique( + config.bucket, + config.root, + cred, + cfg, + shared_s3_read_limiter, + shared_s3_read_metrics_recorder); } client_is_inited = true; // init finish } @@ -408,13 +407,12 @@ std::shared_ptr ClientFactory::initClientFromWriteNode() auto [s3_client, vendor] = create(config, log); cloud_vendor = vendor; - shared_tiflash_client - = std::make_shared( - config.bucket, - config.root, - std::move(s3_client), - shared_s3_read_limiter, - shared_s3_read_metrics_recorder); + shared_tiflash_client = std::make_shared( + config.bucket, + config.root, + std::move(s3_client), + shared_s3_read_limiter, + shared_s3_read_metrics_recorder); client_is_inited = true; // init finish return shared_tiflash_client; } diff --git 
a/dbms/src/Storages/S3/S3RandomAccessFile.cpp b/dbms/src/Storages/S3/S3RandomAccessFile.cpp index 34b8a500546..30930ab8811 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.cpp +++ b/dbms/src/Storages/S3/S3RandomAccessFile.cpp @@ -326,10 +326,16 @@ off_t S3RandomAccessFile::finalizeSeek( GET_METRIC(tiflash_storage_s3_request_seconds, type_read_stream_err).Observe(elapsed_secs); LOG_WARNING( log, - "Cannot ignore from istream, state=0x{:02X}, ignored={} expected={} errno={} errmsg={} cost={:.6f}s", + "Cannot ignore from istream, state=0x{:02X}, ignored={} expected={} target_offset={} cur_offset={} " + "content_length={} limiter_enabled={} max_read_bytes_per_sec={} errno={} errmsg={} cost={:.6f}s", state, actual_size, requested_size, + target_offset, + cur_offset, + content_length, + read_limiter != nullptr, + read_limiter != nullptr ? read_limiter->maxReadBytesPerSec() : 0, errno, strerror(errno), elapsed_secs); diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index 0b8d62e4823..bb4412a088a 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -1178,6 +1178,7 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingHitAndTimeout) auto objects = genObjects(/*store_count*/ 1, /*table_count*/ 1, /*file_count*/ 1, {"1.merged", "2.merged"}); auto sp_download = SyncPointCtl::enableInScope("before_FileCache::downloadImpl_download_to_local"); + auto sp_wait = SyncPointCtl::enableInScope("before_FileSegment::waitForNotEmpty_wait"); auto first_key = S3FilenameView::fromKey(objects[0].key); // First request publishes the `Empty` placeholder and starts the background download. @@ -1186,8 +1187,9 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingHitAndTimeout) // With a generous bounded-wait budget, the follower should reuse the downloader result instead of returning miss. 
auto wait_hit = std::async(std::launch::async, [&]() { return file_cache.get(first_key, objects[0].size); }); - std::this_thread::sleep_for(20ms); + sp_wait.waitAndPause(); sp_download.next(); + sp_wait.next(); auto hit_seg = wait_hit.get(); ASSERT_NE(hit_seg, nullptr); ASSERT_TRUE(hit_seg->isReadyToRead()); @@ -1199,6 +1201,8 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingHitAndTimeout) ASSERT_EQ(file_cache.get(second_key, objects[1].size), nullptr); sp_download.waitAndPause(); auto wait_timeout = std::async(std::launch::async, [&]() { return file_cache.get(second_key, objects[1].size); }); + sp_wait.waitAndPause(); + sp_wait.next(); ASSERT_EQ(wait_timeout.get(), nullptr); sp_download.next(); sp_download.disable(); @@ -1223,6 +1227,7 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingReturnsMissWhenDownloaderFails) auto objects = genObjects(/*store_count*/ 1, /*table_count*/ 1, /*file_count*/ 1, {"1.merged"}); auto sp_download = SyncPointCtl::enableInScope("before_FileCache::downloadImpl_download_to_local"); + auto sp_wait = SyncPointCtl::enableInScope("before_FileSegment::waitForNotEmpty_wait"); auto key = S3FilenameView::fromKey(objects[0].key); // First caller creates the `Empty` placeholder and starts the background download. 
@@ -1235,8 +1240,9 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingReturnsMissWhenDownloaderFails) FailPointHelper::enableFailPoint(FailPoints::file_cache_bg_download_fail); SCOPE_EXIT({ FailPointHelper::disableFailPoint(FailPoints::file_cache_bg_download_fail); }); auto wait_failed = std::async(std::launch::async, [&]() { return file_cache.get(key, objects[0].size); }); - std::this_thread::sleep_for(20ms); + sp_wait.waitAndPause(); sp_download.next(); + sp_wait.next(); ASSERT_EQ(wait_failed.get(), nullptr); } sp_download.disable(); @@ -1314,6 +1320,7 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingSupportsColDataAndOther) auto objects = genObjects(/*store_count*/ 1, /*table_count*/ 1, /*file_count*/ 1, {"1.dat", "meta"}); auto sp_download = SyncPointCtl::enableInScope("before_FileCache::downloadImpl_download_to_local"); + auto sp_wait = SyncPointCtl::enableInScope("before_FileSegment::waitForNotEmpty_wait"); auto run_wait_hit_case = [&](const ObjectInfo & obj, FileType expected_file_type) { auto key = S3FilenameView::fromKey(obj.key); @@ -1323,8 +1330,9 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingSupportsColDataAndOther) sp_download.waitAndPause(); auto wait_hit = std::async(std::launch::async, [&]() { return file_cache.get(key, obj.size); }); - std::this_thread::sleep_for(20ms); + sp_wait.waitAndPause(); sp_download.next(); + sp_wait.next(); auto file_seg = wait_hit.get(); ASSERT_NE(file_seg, nullptr); From 2bda03a423a9dc155531ee42a54ee88e5ebac87c Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sun, 5 Apr 2026 21:26:41 +0800 Subject: [PATCH 26/36] Update grafana panel Signed-off-by: JaySon-Huang --- metrics/grafana/tiflash_summary.json | 1415 ++++++++++++++++++++------ 1 file changed, 1090 insertions(+), 325 deletions(-) diff --git a/metrics/grafana/tiflash_summary.json b/metrics/grafana/tiflash_summary.json index 402ef82a129..22c00c4a054 100644 --- a/metrics/grafana/tiflash_summary.json +++ b/metrics/grafana/tiflash_summary.json @@ -52,7 +52,7 @@ "gnetId": null, 
"graphTooltip": 1, "id": null, - "iteration": 1774097820692, + "iteration": 1775388239670, "links": [], "panels": [ { @@ -1217,7 +1217,7 @@ "h": 7, "w": 12, "x": 0, - "y": 2 + "y": 34 }, "hiddenSeries": false, "id": 141, @@ -1329,7 +1329,7 @@ "h": 7, "w": 12, "x": 12, - "y": 2 + "y": 34 }, "hiddenSeries": false, "id": 154, @@ -1459,7 +1459,7 @@ "h": 7, "w": 12, "x": 0, - "y": 9 + "y": 41 }, "hiddenSeries": false, "id": 145, @@ -1589,7 +1589,7 @@ "h": 7, "w": 12, "x": 12, - "y": 9 + "y": 41 }, "hiddenSeries": false, "id": 147, @@ -1719,7 +1719,7 @@ "h": 7, "w": 12, "x": 0, - "y": 16 + "y": 48 }, "hiddenSeries": false, "id": 155, @@ -1849,7 +1849,7 @@ "h": 7, "w": 12, "x": 12, - "y": 16 + "y": 48 }, "hiddenSeries": false, "id": 257, @@ -1979,7 +1979,7 @@ "h": 7, "w": 12, "x": 0, - "y": 23 + "y": 55 }, "hiddenSeries": false, "id": 151, @@ -2109,7 +2109,7 @@ "h": 7, "w": 12, "x": 12, - "y": 23 + "y": 55 }, "hiddenSeries": false, "id": 156, @@ -2239,7 +2239,7 @@ "h": 7, "w": 12, "x": 0, - "y": 30 + "y": 62 }, "hiddenSeries": false, "id": 149, @@ -2369,7 +2369,7 @@ "h": 7, "w": 12, "x": 12, - "y": 30 + "y": 62 }, "hiddenSeries": false, "id": 159, @@ -2499,7 +2499,7 @@ "h": 7, "w": 12, "x": 0, - "y": 37 + "y": 69 }, "hiddenSeries": false, "id": 161, @@ -2629,7 +2629,7 @@ "h": 7, "w": 12, "x": 12, - "y": 37 + "y": 69 }, "hiddenSeries": false, "id": 256, @@ -2759,7 +2759,7 @@ "h": 7, "w": 12, "x": 0, - "y": 44 + "y": 76 }, "hiddenSeries": false, "id": 153, @@ -2889,7 +2889,7 @@ "h": 7, "w": 12, "x": 12, - "y": 44 + "y": 76 }, "hiddenSeries": false, "id": 267, @@ -3019,7 +3019,7 @@ "h": 7, "w": 12, "x": 0, - "y": 51 + "y": 83 }, "hiddenSeries": false, "id": 295, @@ -3161,7 +3161,7 @@ "h": 7, "w": 12, "x": 12, - "y": 51 + "y": 83 }, "hiddenSeries": false, "id": 268, @@ -3302,7 +3302,7 @@ "h": 7, "w": 12, "x": 0, - "y": 3 + "y": 35 }, "hiddenSeries": false, "id": 329, @@ -3415,7 +3415,7 @@ "h": 7, "w": 12, "x": 12, - "y": 3 + "y": 35 }, "hiddenSeries": false, "id": 331, @@ 
-3520,7 +3520,7 @@ "h": 7, "w": 12, "x": 0, - "y": 10 + "y": 42 }, "hiddenSeries": false, "id": 333, @@ -3625,7 +3625,7 @@ "h": 7, "w": 12, "x": 12, - "y": 10 + "y": 42 }, "hiddenSeries": false, "id": 335, @@ -3744,7 +3744,7 @@ "h": 7, "w": 12, "x": 0, - "y": 4 + "y": 36 }, "hiddenSeries": false, "id": 9, @@ -3845,7 +3845,7 @@ "h": 7, "w": 12, "x": 12, - "y": 4 + "y": 36 }, "hiddenSeries": false, "id": 2, @@ -3944,7 +3944,7 @@ "h": 7, "w": 12, "x": 0, - "y": 11 + "y": 43 }, "hiddenSeries": false, "id": 11, @@ -4067,7 +4067,7 @@ "h": 7, "w": 12, "x": 12, - "y": 11 + "y": 43 }, "hiddenSeries": false, "id": 12, @@ -4164,7 +4164,7 @@ "h": 7, "w": 12, "x": 0, - "y": 18 + "y": 50 }, "hiddenSeries": false, "id": 13, @@ -4282,7 +4282,7 @@ "h": 7, "w": 12, "x": 12, - "y": 18 + "y": 50 }, "hiddenSeries": false, "id": 14, @@ -4383,7 +4383,7 @@ "h": 7, "w": 12, "x": 0, - "y": 25 + "y": 57 }, "hiddenSeries": false, "id": 63, @@ -4499,7 +4499,7 @@ "h": 7, "w": 12, "x": 12, - "y": 25 + "y": 57 }, "hiddenSeries": false, "id": 165, @@ -4600,7 +4600,7 @@ "h": 7, "w": 12, "x": 0, - "y": 32 + "y": 64 }, "hiddenSeries": false, "id": 100, @@ -4699,7 +4699,7 @@ "h": 7, "w": 12, "x": 12, - "y": 32 + "y": 64 }, "hiddenSeries": false, "id": 77, @@ -4797,7 +4797,7 @@ "h": 7, "w": 12, "x": 0, - "y": 39 + "y": 71 }, "hiddenSeries": false, "id": 102, @@ -4896,7 +4896,7 @@ "h": 7, "w": 12, "x": 12, - "y": 39 + "y": 71 }, "hiddenSeries": false, "id": 101, @@ -4996,7 +4996,7 @@ "h": 7, "w": 12, "x": 0, - "y": 46 + "y": 78 }, "hiddenSeries": false, "id": 157, @@ -5095,7 +5095,7 @@ "h": 7, "w": 12, "x": 12, - "y": 46 + "y": 78 }, "hiddenSeries": false, "id": 103, @@ -5194,7 +5194,7 @@ "h": 7, "w": 12, "x": 0, - "y": 53 + "y": 85 }, "hiddenSeries": false, "id": 199, @@ -5293,7 +5293,7 @@ "h": 7, "w": 12, "x": 12, - "y": 53 + "y": 85 }, "hiddenSeries": false, "id": 166, @@ -5394,7 +5394,7 @@ "h": 7, "w": 12, "x": 0, - "y": 60 + "y": 92 }, "hiddenSeries": false, "id": 297, @@ -5494,7 +5494,7 @@ "h": 7, 
"w": 12, "x": 12, - "y": 60 + "y": 92 }, "hiddenSeries": false, "id": 299, @@ -5613,7 +5613,7 @@ "h": 8, "w": 12, "x": 0, - "y": 5 + "y": 37 }, "hiddenSeries": false, "id": 107, @@ -5715,7 +5715,7 @@ "h": 8, "w": 12, "x": 12, - "y": 5 + "y": 37 }, "hiddenSeries": false, "id": 109, @@ -5853,7 +5853,7 @@ "h": 8, "w": 12, "x": 0, - "y": 13 + "y": 45 }, "hiddenSeries": false, "id": 111, @@ -5967,7 +5967,7 @@ "h": 8, "w": 12, "x": 12, - "y": 13 + "y": 45 }, "hiddenSeries": false, "id": 113, @@ -6081,7 +6081,7 @@ "h": 8, "w": 12, "x": 0, - "y": 21 + "y": 53 }, "hiddenSeries": false, "id": 117, @@ -6182,7 +6182,7 @@ "h": 8, "w": 12, "x": 12, - "y": 21 + "y": 53 }, "hiddenSeries": false, "id": 115, @@ -6316,7 +6316,7 @@ "h": 7, "w": 12, "x": 0, - "y": 6 + "y": 38 }, "hiddenSeries": false, "id": 19, @@ -6438,7 +6438,7 @@ "h": 7, "w": 12, "x": 12, - "y": 6 + "y": 38 }, "hiddenSeries": false, "id": 18, @@ -6536,7 +6536,7 @@ "h": 7, "w": 12, "x": 0, - "y": 13 + "y": 45 }, "hiddenSeries": false, "id": 20, @@ -6689,7 +6689,7 @@ "h": 8, "w": 12, "x": 0, - "y": 7 + "y": 39 }, "hiddenSeries": false, "id": 310, @@ -6816,7 +6816,7 @@ "h": 8, "w": 12, "x": 12, - "y": 7 + "y": 39 }, "hiddenSeries": false, "id": 309, @@ -6941,7 +6941,7 @@ "h": 8, "w": 12, "x": 0, - "y": 15 + "y": 47 }, "hiddenSeries": false, "id": 316, @@ -7045,7 +7045,7 @@ "h": 8, "w": 12, "x": 12, - "y": 15 + "y": 47 }, "height": "", "hiddenSeries": false, @@ -7172,7 +7172,7 @@ "h": 8, "w": 12, "x": 0, - "y": 23 + "y": 55 }, "hiddenSeries": false, "id": 318, @@ -7286,7 +7286,7 @@ "h": 8, "w": 12, "x": 12, - "y": 23 + "y": 55 }, "height": "", "hiddenSeries": false, @@ -7423,7 +7423,7 @@ "h": 8, "w": 12, "x": 0, - "y": 8 + "y": 40 }, "hiddenSeries": false, "id": 325, @@ -7523,7 +7523,7 @@ "h": 8, "w": 12, "x": 12, - "y": 8 + "y": 40 }, "hiddenSeries": false, "id": 324, @@ -7633,7 +7633,7 @@ "h": 8, "w": 12, "x": 0, - "y": 16 + "y": 48 }, "hiddenSeries": false, "id": 319, @@ -7743,7 +7743,7 @@ "h": 8, "w": 12, "x": 12, - 
"y": 16 + "y": 48 }, "hiddenSeries": false, "id": 323, @@ -7853,7 +7853,7 @@ "h": 8, "w": 12, "x": 0, - "y": 24 + "y": 56 }, "hiddenSeries": false, "id": 238, @@ -7979,7 +7979,7 @@ "h": 8, "w": 12, "x": 12, - "y": 24 + "y": 56 }, "hiddenSeries": false, "id": 169, @@ -8128,7 +8128,7 @@ "h": 8, "w": 12, "x": 0, - "y": 32 + "y": 64 }, "hiddenSeries": false, "id": 168, @@ -8252,7 +8252,7 @@ "h": 8, "w": 12, "x": 12, - "y": 32 + "y": 64 }, "hiddenSeries": false, "id": 337, @@ -8368,7 +8368,7 @@ "h": 8, "w": 12, "x": 0, - "y": 40 + "y": 72 }, "hiddenSeries": false, "id": 289, @@ -8493,7 +8493,7 @@ "h": 8, "w": 12, "x": 12, - "y": 40 + "y": 72 }, "hiddenSeries": false, "id": 272, @@ -8619,7 +8619,7 @@ "h": 8, "w": 12, "x": 0, - "y": 48 + "y": 80 }, "hiddenSeries": false, "id": 291, @@ -8728,7 +8728,7 @@ "h": 8, "w": 12, "x": 12, - "y": 48 + "y": 80 }, "hiddenSeries": false, "id": 263, @@ -8841,7 +8841,7 @@ "h": 8, "w": 12, "x": 0, - "y": 9 + "y": 41 }, "hiddenSeries": false, "id": 41, @@ -8954,7 +8954,7 @@ "h": 8, "w": 12, "x": 12, - "y": 9 + "y": 41 }, "hiddenSeries": false, "id": 38, @@ -9113,7 +9113,7 @@ "h": 8, "w": 12, "x": 0, - "y": 17 + "y": 49 }, "height": "", "hiddenSeries": false, @@ -9228,7 +9228,7 @@ "h": 8, "w": 12, "x": 12, - "y": 17 + "y": 49 }, "height": "", "hiddenSeries": false, @@ -9342,7 +9342,7 @@ "h": 5, "w": 12, "x": 0, - "y": 25 + "y": 57 }, "hiddenSeries": false, "id": 39, @@ -9445,7 +9445,7 @@ "h": 5, "w": 12, "x": 12, - "y": 25 + "y": 57 }, "hiddenSeries": false, "id": 42, @@ -9565,7 +9565,7 @@ "h": 5, "w": 12, "x": 0, - "y": 30 + "y": 62 }, "hiddenSeries": false, "id": 130, @@ -9668,7 +9668,7 @@ "h": 5, "w": 12, "x": 12, - "y": 30 + "y": 62 }, "hiddenSeries": false, "id": 131, @@ -9789,7 +9789,7 @@ "h": 7, "w": 24, "x": 0, - "y": 35 + "y": 67 }, "hiddenSeries": false, "id": 67, @@ -9903,7 +9903,7 @@ "h": 7, "w": 8, "x": 0, - "y": 42 + "y": 74 }, "hiddenSeries": false, "id": 50, @@ -10037,7 +10037,7 @@ "h": 7, "w": 8, "x": 8, - "y": 42 + "y": 74 
}, "hiddenSeries": false, "id": 22, @@ -10151,7 +10151,7 @@ "h": 7, "w": 8, "x": 16, - "y": 42 + "y": 74 }, "hiddenSeries": false, "id": 52, @@ -10268,7 +10268,7 @@ "h": 7, "w": 12, "x": 0, - "y": 49 + "y": 81 }, "hiddenSeries": false, "id": 46, @@ -10384,7 +10384,7 @@ "h": 7, "w": 12, "x": 12, - "y": 49 + "y": 81 }, "hiddenSeries": false, "id": 47, @@ -10501,7 +10501,7 @@ "h": 8, "w": 12, "x": 0, - "y": 56 + "y": 88 }, "height": "", "hiddenSeries": false, @@ -10623,7 +10623,7 @@ "h": 8, "w": 12, "x": 12, - "y": 56 + "y": 88 }, "height": "", "hiddenSeries": false, @@ -10744,7 +10744,7 @@ "h": 7, "w": 12, "x": 0, - "y": 64 + "y": 96 }, "hiddenSeries": false, "id": 294, @@ -10850,7 +10850,7 @@ "h": 7, "w": 12, "x": 12, - "y": 64 + "y": 96 }, "hiddenSeries": false, "id": 293, @@ -10965,7 +10965,7 @@ "h": 8, "w": 12, "x": 0, - "y": 10 + "y": 42 }, "hiddenSeries": false, "id": 40, @@ -11065,7 +11065,7 @@ "h": 8, "w": 12, "x": 12, - "y": 10 + "y": 42 }, "hiddenSeries": false, "id": 88, @@ -11252,7 +11252,7 @@ "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 50 }, "hiddenSeries": false, "id": 292, @@ -11381,7 +11381,7 @@ "h": 8, "w": 12, "x": 12, - "y": 18 + "y": 50 }, "hiddenSeries": false, "id": 269, @@ -11488,7 +11488,7 @@ "h": 8, "w": 8, "x": 0, - "y": 26 + "y": 58 }, "hiddenSeries": false, "id": 132, @@ -11613,7 +11613,7 @@ "h": 8, "w": 8, "x": 8, - "y": 26 + "y": 58 }, "hiddenSeries": false, "id": 361, @@ -11720,7 +11720,7 @@ "h": 8, "w": 8, "x": 16, - "y": 26 + "y": 58 }, "hiddenSeries": false, "id": 362, @@ -11839,7 +11839,7 @@ "h": 8, "w": 12, "x": 0, - "y": 34 + "y": 66 }, "hiddenSeries": false, "id": 301, @@ -11969,7 +11969,7 @@ "h": 8, "w": 12, "x": 12, - "y": 34 + "y": 66 }, "hiddenSeries": false, "id": 237, @@ -12090,7 +12090,7 @@ "h": 8, "w": 12, "x": 0, - "y": 11 + "y": 43 }, "hiddenSeries": false, "id": 128, @@ -12233,7 +12233,7 @@ "h": 8, "w": 12, "x": 12, - "y": 11 + "y": 43 }, "hiddenSeries": false, "id": 129, @@ -12350,7 +12350,7 @@ "h": 8, "w": 12, "x": 0, 
- "y": 19 + "y": 51 }, "heatmap": {}, "hideZeroBuckets": true, @@ -12412,7 +12412,7 @@ "h": 8, "w": 12, "x": 12, - "y": 19 + "y": 51 }, "hiddenSeries": false, "id": 158, @@ -12548,7 +12548,7 @@ "h": 8, "w": 12, "x": 0, - "y": 27 + "y": 59 }, "hiddenSeries": false, "id": 163, @@ -12653,7 +12653,7 @@ "h": 8, "w": 12, "x": 12, - "y": 27 + "y": 59 }, "hiddenSeries": false, "id": 162, @@ -12773,7 +12773,7 @@ "h": 8, "w": 12, "x": 0, - "y": 35 + "y": 67 }, "hiddenSeries": false, "id": 164, @@ -12889,7 +12889,7 @@ "h": 8, "w": 12, "x": 12, - "y": 35 + "y": 67 }, "hiddenSeries": false, "id": 231, @@ -12996,7 +12996,7 @@ "h": 8, "w": 12, "x": 0, - "y": 43 + "y": 75 }, "height": "", "hiddenSeries": false, @@ -13109,7 +13109,7 @@ "h": 8, "w": 12, "x": 12, - "y": 43 + "y": 75 }, "hiddenSeries": false, "id": 123, @@ -13240,7 +13240,7 @@ "h": 9, "w": 24, "x": 0, - "y": 51 + "y": 83 }, "hiddenSeries": false, "id": 232, @@ -13347,7 +13347,7 @@ "h": 9, "w": 24, "x": 0, - "y": 60 + "y": 92 }, "hiddenSeries": false, "id": 345, @@ -13588,8 +13588,6 @@ "min": false, "rightSide": true, "show": true, - "sort": "max", - "sortDesc": true, "total": false, "values": true }, @@ -13669,7 +13667,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The storage I/O limiter metrics.", + "description": "I/O Limiter current pending gauge.", "fieldConfig": { "defaults": {}, "overrides": [] @@ -13678,12 +13676,12 @@ "fillGradient": 0, "gridPos": { "h": 8, - "w": 12, + "w": 8, "x": 0, "y": 20 }, "hiddenSeries": false, - "id": 266, + "id": 86, "legend": { "alignAsTable": true, "avg": false, @@ -13701,7 +13699,7 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null as zero", + "nullPointMode": "null", "options": { "alertThreshold": true }, @@ -13712,7 +13710,7 @@ "renderer": "flot", "seriesOverrides": [ { - "alias": "", + "alias": "/pending/", "yaxis": 2 } ], @@ -13721,24 +13719,55 @@ "steppedLine": false, "targets": [ { - "exemplar": true, - 
"expr": "sum(rate(tiflash_storage_io_limiter_pending_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (type, instance)", + "expr": "avg(tiflash_system_current_metric_RateLimiterPendingWriteRequest{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", "format": "time_series", - "instant": false, + "hide": true, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{type}}-{{instance}}", + "intervalFactor": 1, + "legendFormat": "other-current-{{instance}}", "refId": "A" + }, + { + "exemplar": true, + "expr": "avg(tiflash_system_current_metric_IOLimiterPendingBgWriteReq{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", + "hide": false, + "interval": "", + "legendFormat": "bgwrite-current-{{instance}}", + "refId": "B" + }, + { + "exemplar": true, + "expr": "avg(tiflash_system_current_metric_IOLimiterPendingFgWriteReq{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", + "hide": false, + "interval": "", + "legendFormat": "fgwrite-current-{{instance}}", + "refId": "C" + }, + { + "exemplar": true, + "expr": "avg(tiflash_system_current_metric_IOLimiterPendingBgReadReq{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", + "hide": false, + "interval": "", + "legendFormat": "bgread-current-{{instance}}", + "refId": "D" + }, + { + "exemplar": true, + "expr": "avg(tiflash_system_current_metric_IOLimiterPendingFgReadReq{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", + "hide": false, + "interval": "", + "legendFormat": "fgread-current-{{instance}}", + "refId": "E" } ], "thresholds": [], 
"timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "I/O Limiter Pending Rate", + "title": "I/O Limiter Current Pending Gauge", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -13752,11 +13781,11 @@ "yaxes": [ { "decimals": 0, - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { @@ -13779,7 +13808,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "I/O Limiter current pending count.", + "description": "The storage I/O limiter metrics.", "fieldConfig": { "defaults": {}, "overrides": [] @@ -13788,16 +13817,16 @@ "fillGradient": 0, "gridPos": { "h": 8, - "w": 12, - "x": 12, + "w": 8, + "x": 8, "y": 20 }, "hiddenSeries": false, - "id": 86, + "id": 266, "legend": { "alignAsTable": true, "avg": false, - "current": false, + "current": true, "hideZero": true, "max": true, "min": false, @@ -13811,7 +13840,7 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "nullPointMode": "null as zero", "options": { "alertThreshold": true }, @@ -13822,7 +13851,7 @@ "renderer": "flot", "seriesOverrides": [ { - "alias": "/pending/", + "alias": "", "yaxis": 2 } ], @@ -13831,71 +13860,24 @@ "steppedLine": false, "targets": [ { - "expr": "avg(tiflash_system_current_metric_RateLimiterPendingWriteRequest{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", + "exemplar": true, + "expr": "sum(rate(tiflash_storage_io_limiter_pending_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (type, instance)", "format": "time_series", - "hide": true, + "instant": false, "interval": "", - "intervalFactor": 1, - "legendFormat": "other-current-{{instance}}", + "intervalFactor": 2, + "legendFormat": "{{type}}-{{instance}}", "refId": "A" - }, 
- { - "exemplar": true, - "expr": "avg(tiflash_system_current_metric_IOLimiterPendingBgWriteReq{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", - "hide": false, - "interval": "", - "legendFormat": "bgwrite-current-{{instance}}", - "refId": "B" - }, - { - "exemplar": true, - "expr": "avg(tiflash_system_current_metric_IOLimiterPendingFgWriteReq{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", - "hide": false, - "interval": "", - "legendFormat": "fgwrite-current-{{instance}}", - "refId": "C" - }, - { - "exemplar": true, - "expr": "avg(tiflash_system_current_metric_IOLimiterPendingBgReadReq{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", - "hide": false, - "interval": "", - "legendFormat": "bgread-current-{{instance}}", - "refId": "D" - }, - { - "exemplar": true, - "expr": "avg(tiflash_system_current_metric_IOLimiterPendingFgReadReq{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", - "hide": false, - "interval": "", - "legendFormat": "fgread-current-{{instance}}", - "refId": "E" - }, - { - "exemplar": true, - "expr": "histogram_quantile(1.00, sum(round(1000000000*rate(tiflash_storage_io_limiter_pending_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m]))) by (le, type) / 1000000000)", - "hide": false, - "interval": "", - "legendFormat": "{{type}}-pending-max", - "refId": "F" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tiflash_storage_io_limiter_pending_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (le, type))", - "hide": false, - 
"interval": "", - "legendFormat": "{{type}}-pending-P99", - "refId": "G" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "I/O Limiter Current Pending Count", + "title": "I/O Limiter Pending OPS", "tooltip": { "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, "type": "graph", @@ -13909,11 +13891,11 @@ "yaxes": [ { "decimals": 0, - "format": "short", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -13929,40 +13911,164 @@ "align": false, "alignLevel": null } - } - ], - "title": "Rate Limiter", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 12 - }, - "id": 64, - "panels": [ + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The stall duration of write and delete range", + "description": "I/O Limiter pending duration.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 20 + }, + "hiddenSeries": false, + "id": 369, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/pending/", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(1.00, sum(round(1000000000*rate(tiflash_storage_io_limiter_pending_seconds_bucket{k8s_cluster=\"$k8s_cluster\", 
tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m]))) by (le, type) / 1000000000)", + "hide": true, + "interval": "", + "legendFormat": "{{type}}-pending-max", + "refId": "F" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(tiflash_storage_io_limiter_pending_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (le, type))", + "hide": false, + "interval": "", + "legendFormat": "{{type}}-pending-P999", + "refId": "G" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tiflash_storage_io_limiter_pending_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (le, type))", + "hide": false, + "interval": "", + "legendFormat": "{{type}}-pending-P99", + "refId": "H" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "I/O Limiter Pending Duration", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Rate Limiter", + "type": "row" + }, + { + "collapsed": true, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 64, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The stall duration of write and delete range", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + 
"fill": 0, "fillGradient": 0, "gridPos": { "h": 8, "w": 24, "x": 0, - "y": 13 + "y": 61 }, "hiddenSeries": false, "id": 62, @@ -14081,7 +14187,7 @@ "h": 8, "w": 12, "x": 0, - "y": 21 + "y": 69 }, "height": "", "hiddenSeries": false, @@ -14200,7 +14306,7 @@ "h": 8, "w": 12, "x": 12, - "y": 21 + "y": 69 }, "height": "", "hiddenSeries": false, @@ -14317,7 +14423,7 @@ "h": 9, "w": 24, "x": 0, - "y": 29 + "y": 77 }, "height": "", "hiddenSeries": false, @@ -14439,7 +14545,7 @@ "h": 9, "w": 24, "x": 0, - "y": 38 + "y": 86 }, "hiddenSeries": false, "id": 90, @@ -14567,7 +14673,7 @@ "h": 7, "w": 12, "x": 0, - "y": 14 + "y": 62 }, "hiddenSeries": false, "id": 167, @@ -14667,7 +14773,7 @@ "h": 7, "w": 12, "x": 12, - "y": 14 + "y": 62 }, "hiddenSeries": false, "id": 35, @@ -14765,7 +14871,7 @@ "h": 7, "w": 12, "x": 0, - "y": 21 + "y": 69 }, "hiddenSeries": false, "id": 270, @@ -14865,7 +14971,7 @@ "h": 7, "w": 12, "x": 12, - "y": 21 + "y": 69 }, "hiddenSeries": false, "id": 271, @@ -14965,7 +15071,7 @@ "h": 7, "w": 12, "x": 0, - "y": 28 + "y": 76 }, "hiddenSeries": false, "id": 37, @@ -15109,7 +15215,7 @@ "h": 7, "w": 12, "x": 12, - "y": 28 + "y": 76 }, "hiddenSeries": false, "id": 36, @@ -15237,7 +15343,7 @@ "h": 7, "w": 24, "x": 0, - "y": 35 + "y": 83 }, "hiddenSeries": false, "id": 82, @@ -15396,7 +15502,7 @@ "h": 7, "w": 12, "x": 0, - "y": 42 + "y": 90 }, "heatmap": {}, "hideZeroBuckets": true, @@ -15466,7 +15572,7 @@ "h": 7, "w": 12, "x": 12, - "y": 42 + "y": 90 }, "heatmap": {}, "hideZeroBuckets": true, @@ -15536,7 +15642,7 @@ "h": 7, "w": 12, "x": 0, - "y": 49 + "y": 97 }, "heatmap": {}, "hideZeroBuckets": true, @@ -15608,7 +15714,7 @@ "h": 7, "w": 12, "x": 12, - "y": 49 + "y": 97 }, "heatmap": {}, "hideZeroBuckets": true, @@ -15670,7 +15776,7 @@ "h": 7, "w": 12, "x": 0, - "y": 56 + "y": 104 }, "hiddenSeries": false, "id": 235, @@ -15770,7 +15876,7 @@ "h": 7, "w": 12, "x": 12, - "y": 56 + "y": 104 }, "hiddenSeries": false, "id": 241, @@ -15878,7 +15984,7 @@ "h": 7, "w": 
12, "x": 0, - "y": 63 + "y": 111 }, "heatmap": {}, "hideZeroBuckets": true, @@ -15959,7 +16065,7 @@ "h": 7, "w": 12, "x": 12, - "y": 63 + "y": 111 }, "heatmap": {}, "hideZeroBuckets": true, @@ -16032,7 +16138,7 @@ "h": 7, "w": 12, "x": 0, - "y": 70 + "y": 118 }, "heatmap": {}, "hideZeroBuckets": true, @@ -16115,7 +16221,7 @@ "h": 7, "w": 12, "x": 12, - "y": 70 + "y": 118 }, "heatmap": {}, "hideZeroBuckets": true, @@ -16188,7 +16294,7 @@ "h": 7, "w": 24, "x": 0, - "y": 77 + "y": 125 }, "heatmap": {}, "hideZeroBuckets": true, @@ -16253,7 +16359,7 @@ "h": 7, "w": 12, "x": 0, - "y": 84 + "y": 132 }, "hiddenSeries": false, "id": 240, @@ -16357,7 +16463,7 @@ "h": 7, "w": 12, "x": 12, - "y": 84 + "y": 132 }, "hiddenSeries": false, "id": 239, @@ -16495,7 +16601,7 @@ "h": 7, "w": 24, "x": 0, - "y": 91 + "y": 139 }, "height": "", "hiddenSeries": false, @@ -16605,7 +16711,7 @@ "h": 7, "w": 24, "x": 0, - "y": 98 + "y": 146 }, "height": "", "hiddenSeries": false, @@ -16721,7 +16827,7 @@ "h": 7, "w": 12, "x": 0, - "y": 105 + "y": 153 }, "heatmap": {}, "hideZeroBuckets": true, @@ -16787,7 +16893,7 @@ "h": 7, "w": 12, "x": 12, - "y": 105 + "y": 153 }, "hiddenSeries": false, "id": 91, @@ -16915,7 +17021,7 @@ "h": 7, "w": 12, "x": 0, - "y": 112 + "y": 160 }, "hiddenSeries": false, "id": 296, @@ -17035,7 +17141,7 @@ "h": 7, "w": 24, "x": 0, - "y": 15 + "y": 63 }, "hiddenSeries": false, "id": 242, @@ -17139,7 +17245,7 @@ "h": 7, "w": 24, "x": 0, - "y": 22 + "y": 70 }, "hiddenSeries": false, "id": 75, @@ -17273,7 +17379,7 @@ "h": 7, "w": 12, "x": 0, - "y": 29 + "y": 77 }, "heatmap": {}, "hideZeroBuckets": true, @@ -17339,7 +17445,7 @@ "h": 7, "w": 12, "x": 12, - "y": 29 + "y": 77 }, "hiddenSeries": false, "id": 249, @@ -17445,7 +17551,7 @@ "h": 7, "w": 12, "x": 0, - "y": 36 + "y": 84 }, "heatmap": {}, "hideZeroBuckets": true, @@ -17517,7 +17623,7 @@ "h": 7, "w": 12, "x": 12, - "y": 36 + "y": 84 }, "heatmap": {}, "hideZeroBuckets": true, @@ -17587,7 +17693,7 @@ "h": 7, "w": 12, "x": 0, 
- "y": 43 + "y": 91 }, "heatmap": {}, "hideZeroBuckets": true, @@ -17659,7 +17765,7 @@ "h": 7, "w": 12, "x": 12, - "y": 43 + "y": 91 }, "heatmap": {}, "hideZeroBuckets": true, @@ -17729,7 +17835,7 @@ "h": 7, "w": 12, "x": 0, - "y": 50 + "y": 98 }, "heatmap": {}, "hideZeroBuckets": true, @@ -17802,7 +17908,7 @@ "h": 7, "w": 12, "x": 12, - "y": 50 + "y": 98 }, "heatmap": {}, "hideZeroBuckets": true, @@ -17871,7 +17977,7 @@ "h": 7, "w": 12, "x": 0, - "y": 57 + "y": 105 }, "heatmap": {}, "hideZeroBuckets": true, @@ -17948,7 +18054,7 @@ "h": 8, "w": 12, "x": 0, - "y": 16 + "y": 64 }, "hiddenSeries": false, "id": 99, @@ -18101,7 +18207,7 @@ "h": 8, "w": 12, "x": 12, - "y": 16 + "y": 64 }, "heatmap": {}, "hideZeroBuckets": true, @@ -18179,7 +18285,7 @@ "h": 8, "w": 12, "x": 0, - "y": 25 + "y": 65 }, "hiddenSeries": false, "id": 187, @@ -18299,7 +18405,7 @@ "h": 8, "w": 12, "x": 12, - "y": 25 + "y": 65 }, "height": "", "hiddenSeries": false, @@ -18418,7 +18524,7 @@ "h": 8, "w": 12, "x": 0, - "y": 33 + "y": 73 }, "height": "", "hiddenSeries": false, @@ -18528,7 +18634,7 @@ "h": 8, "w": 12, "x": 12, - "y": 33 + "y": 73 }, "height": "", "hiddenSeries": false, @@ -18641,7 +18747,7 @@ "h": 8, "w": 12, "x": 0, - "y": 41 + "y": 81 }, "hiddenSeries": false, "id": 176, @@ -18749,7 +18855,7 @@ "h": 8, "w": 12, "x": 12, - "y": 41 + "y": 81 }, "hiddenSeries": false, "id": 175, @@ -18876,7 +18982,7 @@ "h": 8, "w": 12, "x": 0, - "y": 49 + "y": 89 }, "hiddenSeries": false, "id": 189, @@ -18978,7 +19084,7 @@ "h": 8, "w": 12, "x": 12, - "y": 49 + "y": 89 }, "hiddenSeries": false, "id": 191, @@ -19083,7 +19189,7 @@ "h": 8, "w": 8, "x": 0, - "y": 57 + "y": 97 }, "hiddenSeries": false, "id": 365, @@ -19189,7 +19295,7 @@ "h": 8, "w": 9, "x": 8, - "y": 57 + "y": 97 }, "hiddenSeries": false, "id": 193, @@ -19316,7 +19422,7 @@ "h": 8, "w": 7, "x": 17, - "y": 57 + "y": 97 }, "hiddenSeries": false, "id": 195, @@ -19427,7 +19533,7 @@ "h": 8, "w": 12, "x": 0, - "y": 65 + "y": 105 }, "hiddenSeries": 
false, "id": 363, @@ -19533,7 +19639,7 @@ "h": 8, "w": 12, "x": 12, - "y": 65 + "y": 105 }, "hiddenSeries": false, "id": 364, @@ -19636,7 +19742,7 @@ "h": 8, "w": 12, "x": 0, - "y": 73 + "y": 113 }, "hiddenSeries": false, "id": 251, @@ -19743,7 +19849,7 @@ "h": 8, "w": 12, "x": 12, - "y": 73 + "y": 113 }, "hiddenSeries": false, "id": 252, @@ -19850,7 +19956,7 @@ "h": 8, "w": 12, "x": 0, - "y": 81 + "y": 121 }, "hiddenSeries": false, "id": 254, @@ -19957,7 +20063,7 @@ "h": 8, "w": 12, "x": 12, - "y": 81 + "y": 121 }, "hiddenSeries": false, "id": 253, @@ -20403,7 +20509,6 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "Remote Cache Usage", "fieldConfig": { "defaults": {}, "overrides": [] @@ -20417,31 +20522,30 @@ "y": 34 }, "hiddenSeries": false, - "id": 188, + "id": 373, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideZero": true, - "max": false, + "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", + "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.11", - "pointradius": 5, + "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], @@ -20451,43 +20555,36 @@ "targets": [ { "exemplar": true, - "expr": "sum(tiflash_system_current_metric_DTFileCacheCapacity{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", - "format": "time_series", - "hide": false, + "expr": "histogram_quantile(0.999, sum(rate(tiflash_storage_remote_cache_bg_download_stage_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (le, stage, file_type, $additional_groupby))", + "hide": true, 
"interval": "", - "legendFormat": "DTFileCapacity-{{instance}}", + "legendFormat": "999%-{{stage}}-{{file_type}} {{$additional_groupby}}", + "queryType": "randomWalk", "refId": "B" }, { "exemplar": true, - "expr": "sum(tiflash_system_current_metric_DTFileCacheUsed{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", - "hide": false, - "interval": "", - "legendFormat": "DTFileUsed-{{instance}}", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(tiflash_system_current_metric_PageCacheCapacity{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", + "expr": "histogram_quantile(0.99, sum(rate(tiflash_storage_remote_cache_bg_download_stage_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (le, stage, file_type, $additional_groupby))", "hide": false, "interval": "", - "legendFormat": "PageCapacity-{{instance}}", - "refId": "C" + "legendFormat": "99%-{{stage}}-{{file_type}} {{$additional_groupby}}", + "queryType": "randomWalk", + "refId": "D" }, { "exemplar": true, - "expr": "sum(tiflash_system_current_metric_PageCacheUsed{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", + "expr": "(sum(rate(\n  tiflash_storage_remote_cache_bg_download_stage_seconds_sum\n  {k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}\n  [$__rate_interval]\n)) by (stage, file_type, $additional_groupby) / sum(rate(\n  tiflash_storage_remote_cache_bg_download_stage_seconds_count\n  {k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}\n  [$__rate_interval]\n)) by (stage, file_type, $additional_groupby) )", "hide": false, "interval": "", - 
"legendFormat": "PageUsed-{{instance}}", - "refId": "D" + "legendFormat": "avg-{{stage}}-{{file_type}} {{$additional_groupby}}", + "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Remote Cache Usage", + "title": "Remote Cache BG Download Duration", "tooltip": { "shared": true, "sort": 2, @@ -20503,8 +20600,7 @@ }, "yaxes": [ { - "decimals": null, - "format": "bytes", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -20512,12 +20608,12 @@ "show": true }, { - "format": "percentunit", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", - "show": false + "min": null, + "show": true } ], "yaxis": { @@ -20531,7 +20627,6 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "Memory Usage of Storage Tasks", "fieldConfig": { "defaults": {}, "overrides": [] @@ -20545,29 +20640,30 @@ "y": 34 }, "hiddenSeries": false, - "id": 233, + "id": 375, "legend": { "alignAsTable": true, "avg": false, "current": false, - "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", + "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.11", - "pointradius": 5, + "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], @@ -20577,7 +20673,676 @@ "targets": [ { "exemplar": true, - "expr": "sum(tiflash_system_current_metric_MemoryTrackingQueryStorageTask{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", + "expr": "histogram_quantile(0.999, sum(rate(tiflash_storage_remote_cache_wait_on_downloading_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", 
instance=~\"$tiflash_role\"}[1m])) by (le, result, file_type, $additional_groupby))", + "hide": true, + "interval": "", + "legendFormat": "999%-{{result}}-{{file_type}} {{$additional_groupby}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tiflash_storage_remote_cache_wait_on_downloading_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (le, result, file_type, $additional_groupby))", + "hide": false, + "interval": "", + "legendFormat": "99%-{{result}}-{{file_type}} {{$additional_groupby}}", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Remote Cache Wait on Downloading Duration", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "hiddenSeries": false, + "id": 371, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + 
"options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tiflash_storage_remote_cache_wait_on_downloading_result{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (result, file_type , $additional_groupby)", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{result}}-{{file_type}} {{additional_groupby}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Remote Cache Wait on Downloading OPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "hiddenSeries": false, + "id": 372, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + 
"links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tiflash_storage_remote_cache_wait_on_downloading_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (result, file_type , $additional_groupby)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{result}}-{{file_type}} {{$additional_groupby}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Remote Cache Wait on Downloading Flow", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "binBps", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "hiddenSeries": false, + "id": 370, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true 
+ }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(tiflash_storage_remote_cache_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (type, instance)", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}-{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Remote Cache Gauge", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 50 + }, + "hiddenSeries": false, + "id": 374, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as 
zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tiflash_storage_remote_cache_reject{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (reason, file_type, $additional_groupby)", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{reason}}-{{file_type}} {{additional_groupby}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Remote Cache Reject Download Type OPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Remote Cache Usage", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 58 + }, + "hiddenSeries": false, + "id": 188, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + 
"linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(tiflash_system_current_metric_DTFileCacheCapacity{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", + "format": "time_series", + "hide": false, + "interval": "", + "legendFormat": "DTFileCapacity-{{instance}}", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(tiflash_system_current_metric_DTFileCacheUsed{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", + "hide": false, + "interval": "", + "legendFormat": "DTFileUsed-{{instance}}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(tiflash_system_current_metric_PageCacheCapacity{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", + "hide": false, + "interval": "", + "legendFormat": "PageCapacity-{{instance}}", + "refId": "C" + }, + { + "exemplar": true, + "expr": "sum(tiflash_system_current_metric_PageCacheUsed{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", + "hide": false, + "interval": "", + "legendFormat": "PageUsed-{{instance}}", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Remote Cache Usage", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + 
"format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Memory Usage of Storage Tasks", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 58 + }, + "hiddenSeries": false, + "id": 233, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(tiflash_system_current_metric_MemoryTrackingQueryStorageTask{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}) by (instance)", "hide": false, "interval": "", "legendFormat": "MemoryTrackingQueryStorageTask-{{instance}}", @@ -20667,7 +21432,7 @@ "h": 7, "w": 12, "x": 0, - "y": 42 + "y": 66 }, "hiddenSeries": false, "id": 236, @@ -20783,7 +21548,7 @@ "h": 7, "w": 12, "x": 12, - "y": 42 + "y": 66 }, "hiddenSeries": false, "id": 356, @@ -20903,7 +21668,7 @@ "h": 7, "w": 12, "x": 0, - "y": 49 + "y": 73 }, "hiddenSeries": false, "id": 353, @@ -21013,7 +21778,7 @@ "h": 7, "w": 12, "x": 12, - "y": 49 + "y": 73 }, "hiddenSeries": false, "id": 358, @@ -22300,7 +23065,7 @@ "h": 8, "w": 12, "x": 0, - "y": 
20 + "y": 162 }, "hiddenSeries": false, "id": 224, @@ -22400,7 +23165,7 @@ "h": 8, "w": 12, "x": 12, - "y": 20 + "y": 162 }, "hiddenSeries": false, "id": 226, @@ -22508,7 +23273,7 @@ "h": 8, "w": 12, "x": 0, - "y": 28 + "y": 170 }, "hiddenSeries": false, "id": 228, @@ -22611,7 +23376,7 @@ "h": 8, "w": 12, "x": 12, - "y": 28 + "y": 170 }, "hiddenSeries": false, "id": 222, @@ -22784,7 +23549,7 @@ "h": 8, "w": 12, "x": 0, - "y": 36 + "y": 178 }, "hiddenSeries": false, "id": 230, @@ -22939,7 +23704,7 @@ "h": 8, "w": 12, "x": 12, - "y": 36 + "y": 178 }, "hiddenSeries": false, "id": 218, @@ -23069,7 +23834,7 @@ "h": 8, "w": 12, "x": 0, - "y": 44 + "y": 186 }, "hiddenSeries": false, "id": 220, @@ -23199,7 +23964,7 @@ "h": 8, "w": 12, "x": 12, - "y": 44 + "y": 186 }, "hiddenSeries": false, "id": 216, @@ -23325,7 +24090,7 @@ "h": 8, "w": 12, "x": 0, - "y": 52 + "y": 194 }, "hiddenSeries": false, "id": 300, @@ -23448,7 +24213,7 @@ "h": 8, "w": 12, "x": 0, - "y": 21 + "y": 203 }, "hiddenSeries": false, "id": 246, @@ -23615,7 +24380,7 @@ "h": 8, "w": 12, "x": 12, - "y": 21 + "y": 203 }, "hiddenSeries": false, "id": 201, @@ -23773,7 +24538,7 @@ "h": 7, "w": 12, "x": 0, - "y": 22 + "y": 204 }, "hiddenSeries": false, "id": 338, @@ -23893,7 +24658,7 @@ "h": 7, "w": 12, "x": 12, - "y": 22 + "y": 204 }, "hiddenSeries": false, "id": 341, @@ -24023,7 +24788,7 @@ "h": 8, "w": 12, "x": 0, - "y": 23 + "y": 205 }, "hiddenSeries": false, "id": 286, @@ -24144,7 +24909,7 @@ "h": 8, "w": 12, "x": 12, - "y": 23 + "y": 205 }, "hiddenSeries": false, "id": 288, @@ -24273,7 +25038,7 @@ "h": 8, "w": 12, "x": 0, - "y": 31 + "y": 213 }, "hiddenSeries": false, "id": 282, @@ -24400,7 +25165,7 @@ "h": 8, "w": 12, "x": 12, - "y": 31 + "y": 213 }, "hiddenSeries": false, "id": 284, @@ -24496,7 +25261,7 @@ "type": "row" } ], - "refresh": "30s", + "refresh": "1m", "schemaVersion": 27, "style": "dark", "tags": [], @@ -24613,7 +25378,7 @@ { "allValue": null, "current": { - "selected": true, + "selected": 
false, "text": "none", "value": "none" }, @@ -24644,7 +25409,7 @@ { "allValue": null, "current": { - "selected": true, + "selected": false, "text": "All", "value": ".*" }, From 9412be4ae7fd9cc22ca559f2ab337c1b7637d290 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sun, 5 Apr 2026 22:08:01 +0800 Subject: [PATCH 27/36] disagg: tighten filecache startup and low-limit reads --- dbms/src/Server/Server.cpp | 4 +++ dbms/src/Storages/S3/FileCache.cpp | 6 ++++ dbms/src/Storages/S3/S3ReadLimiter.cpp | 2 +- dbms/src/Storages/S3/S3ReadLimiter.h | 2 +- .../src/Storages/S3/tests/gtest_filecache.cpp | 32 +++++++++++++++++++ 5 files changed, 44 insertions(+), 2 deletions(-) diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp index 1b4ed5d65d6..857f8d203fa 100644 --- a/dbms/src/Server/Server.cpp +++ b/dbms/src/Server/Server.cpp @@ -776,6 +776,10 @@ try config, server_info.cpu_info.logical_cores, global_context->getIORateLimiter()); + // FileCache::initialize() only constructs the global instance. Push the current settings once + // here so startup-time values like dt_filecache_wait_on_downloading_ms take effect immediately + // instead of waiting for a later config reload. + FileCache::instance()->updateConfig(global_context->getSettingsRef()); } /// Determining PageStorage run mode based on current files on disk and storage config. diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 62d1ee41698..35f3f457e25 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -1278,6 +1278,12 @@ void downloadToLocal( return; } + // Keep each refill within the limiter-suggested chunk size. Otherwise a low byte limit would + // turn every 128 KiB refill into an oversized borrowing request and let downloads run ahead + // of the configured node-level budget. 
+ buffer_size = std::min( + buffer_size, + static_cast(s3_read_limiter->getSuggestedChunkSize(static_cast(buffer_size)))); // The limiter-aware buffer preserves the old copyData/write-buffer path while charging the shared // S3 budget before each refill from the remote body stream. ReadBufferFromIStreamWithLimiter rbuf(istr, buffer_size, s3_read_limiter, S3::S3ReadSource::FileCacheDownload); diff --git a/dbms/src/Storages/S3/S3ReadLimiter.cpp b/dbms/src/Storages/S3/S3ReadLimiter.cpp index 013703cb563..11ec2b5090c 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.cpp +++ b/dbms/src/Storages/S3/S3ReadLimiter.cpp @@ -33,7 +33,7 @@ void recordWaitIfNeeded(bool waited, const Stopwatch & sw, F && observe) } } // namespace -void DB::S3::S3ReadMetricsRecorder::recordBytes(UInt64 bytes, S3ReadSource source) const +void DB::S3::S3ReadMetricsRecorder::recordBytes(UInt64 bytes, S3ReadSource source) { if (bytes == 0) return; diff --git a/dbms/src/Storages/S3/S3ReadLimiter.h b/dbms/src/Storages/S3/S3ReadLimiter.h index bb8db363451..b2e90dd88f9 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.h +++ b/dbms/src/Storages/S3/S3ReadLimiter.h @@ -34,7 +34,7 @@ class S3ReadMetricsRecorder { public: /// Record remote-read bytes regardless of whether byte throttling is enabled. 
- void recordBytes(UInt64 bytes, S3ReadSource source) const; + static void recordBytes(UInt64 bytes, S3ReadSource source); }; class S3ReadLimiter diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index bb4412a088a..e8fb127c411 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -1303,6 +1303,38 @@ TEST_F(FileCacheTest, BgDownloadWorksWithSharedS3ReadLimiter) waitForBgDownload(file_cache); } +TEST_F(FileCacheTest, BgDownloadUsesLimiterSuggestedChunkSize) +{ + auto cache_dir = fmt::format("{}/bg_download_low_limit", tmp_dir); + StorageRemoteCacheConfig cache_config{.dir = cache_dir, .capacity = cache_capacity, .dtfile_level = 100}; + + UInt16 vcores = 2; + IORateLimiter rate_limiter; + FileCache file_cache(capacity_metrics, cache_config, vcores, rate_limiter); + Settings settings; + settings.dt_filecache_downloading_count_scale = 2.0; + settings.dt_filecache_max_downloading_count_scale = 2.0; + file_cache.updateConfig(settings); + + auto object_key = fmt::format("s{}/data/t_{}/dmf_{}/small_merged.merged", nextId(), nextId(), nextId()); + constexpr size_t object_size = 8 * 1024; + writeFile(object_key, '7', object_size, WriteSettings{}); + + auto limiter = std::make_shared(32 * 1024, /*refill_period_ms*/ 100); + s3_client->setS3ReadLimiter(limiter); + SCOPE_EXIT({ s3_client->setS3ReadLimiter(nullptr); }); + + AtomicStopwatch watch; + ASSERT_EQ(file_cache.get(S3FilenameView::fromKey(object_key), object_size), nullptr); + waitForBgDownload(file_cache); + ASSERT_GE(watch.elapsedMilliseconds(), 200); + + auto file_seg = file_cache.get(S3FilenameView::fromKey(object_key), object_size); + ASSERT_NE(file_seg, nullptr); + ASSERT_TRUE(file_seg->isReadyToRead()); + ASSERT_EQ(file_seg->getSize(), object_size); +} + TEST_F(FileCacheTest, GetWaitOnDownloadingSupportsColDataAndOther) { auto cache_dir = fmt::format("{}/wait_on_downloading_non_merged", tmp_dir); 
From 04ee7e81cb318aa4fe910d76197caabd79cb2647 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sun, 5 Apr 2026 23:47:10 +0800 Subject: [PATCH 28/36] disagg: fix failed download cache accounting --- dbms/src/Storages/S3/FileCache.cpp | 4 ++ dbms/src/Storages/S3/FileCache.h | 8 ++++ .../src/Storages/S3/tests/gtest_filecache.cpp | 38 +++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 35f3f457e25..469f8c4d096 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -1325,6 +1325,10 @@ void FileCache::downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, c file_seg->setStatus(FileSegment::Status::Failed); return; } + // finalizeReservedSize() has already adjusted cache_used to the actual object size. Keep the + // segment size in sync before any later throw point so failed downloads release the correct + // reservation instead of the old estimated size. + file_seg->setSize(content_length); const auto & local_fname = file_seg->getLocalFileName(); // download as a temp file then rename to a formal file diff --git a/dbms/src/Storages/S3/FileCache.h b/dbms/src/Storages/S3/FileCache.h index 700d36167fc..5d9c92c4bd5 100644 --- a/dbms/src/Storages/S3/FileCache.h +++ b/dbms/src/Storages/S3/FileCache.h @@ -107,6 +107,14 @@ class FileSegment cv_ready.notify_all(); } + /// Update the reserved size without changing readiness state. This is used after reservation has + /// been rebased to the real object size but before the download finishes. 
+ void setSize(UInt64 size_) + { + std::lock_guard lock(mtx); + size = size_; + } + Status getStatus() const { std::lock_guard lock(mtx); diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index e8fb127c411..12a0856ed3f 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -1461,4 +1461,42 @@ TEST_F(FileCacheTest, GetBeingBlock) waitForBgDownload(file_cache); } +TEST_F(FileCacheTest, FailedDownloadReleasesFinalizedReservedSize) +{ + auto cache_dir = fmt::format("{}/failed_download_reserved_size", tmp_dir); + StorageRemoteCacheConfig cache_config{.dir = cache_dir, .capacity = cache_capacity, .dtfile_level = 100}; + + UInt16 vcores = 2; + IORateLimiter rate_limiter; + FileCache file_cache(capacity_metrics, cache_config, vcores, rate_limiter); + Settings settings; + settings.dt_filecache_downloading_count_scale = 2.0; + settings.dt_filecache_max_downloading_count_scale = 2.0; + file_cache.updateConfig(settings); + + auto objects = genObjects(/*store_count*/ 1, /*table_count*/ 1, /*file_count*/ 1, {"1.merged"}); + auto key = S3FilenameView::fromKey(objects[0].key); + + auto assert_failed_download_releases_space = [&](std::optional requested_size) { + FailPointHelper::enableFailPoint(FailPoints::file_cache_bg_download_fail); + SCOPE_EXIT({ FailPointHelper::disableFailPoint(FailPoints::file_cache_bg_download_fail); }); + + ASSERT_EQ(file_cache.cache_used, 0); + ASSERT_EQ(file_cache.get(key, requested_size), nullptr); + waitForBgDownload(file_cache); + + ASSERT_EQ(file_cache.cache_used, 0); + ASSERT_EQ(file_cache.bg_download_fail_count.load(std::memory_order_relaxed), 1); + ASSERT_EQ(file_cache.bg_download_succ_count.load(std::memory_order_relaxed), 0); + }; + + // std::nullopt uses the file-type estimate and must still release the finalized reservation correctly. 
+ assert_failed_download_releases_space(std::nullopt); + + file_cache.bg_download_fail_count.store(0, std::memory_order_relaxed); + + // A caller-provided wrong size should also not corrupt cache_used after finalizeReservedSize rebases it. + assert_failed_download_releases_space(objects[0].size - 1024); +} + } // namespace DB::tests::S3 From d440f12f2e08aa3ff8430c68256a0d292f55825f Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Sun, 5 Apr 2026 23:48:53 +0800 Subject: [PATCH 29/36] Update grafana panel Signed-off-by: JaySon-Huang --- metrics/grafana/tiflash_summary.json | 79 +++++++++++++--------------- 1 file changed, 37 insertions(+), 42 deletions(-) diff --git a/metrics/grafana/tiflash_summary.json b/metrics/grafana/tiflash_summary.json index 22c00c4a054..2b9804cc295 100644 --- a/metrics/grafana/tiflash_summary.json +++ b/metrics/grafana/tiflash_summary.json @@ -52,7 +52,7 @@ "gnetId": null, "graphTooltip": 1, "id": null, - "iteration": 1775388239670, + "iteration": 1775402037950, "links": [], "panels": [ { @@ -13959,12 +13959,7 @@ "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/pending/", - "yaxis": 2 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, @@ -14015,7 +14010,7 @@ "yaxes": [ { "decimals": 0, - "format": "short", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -18285,7 +18280,7 @@ "h": 8, "w": 12, "x": 0, - "y": 65 + "y": 33 }, "hiddenSeries": false, "id": 187, @@ -18405,7 +18400,7 @@ "h": 8, "w": 12, "x": 12, - "y": 65 + "y": 33 }, "height": "", "hiddenSeries": false, @@ -18524,7 +18519,7 @@ "h": 8, "w": 12, "x": 0, - "y": 73 + "y": 41 }, "height": "", "hiddenSeries": false, @@ -18634,7 +18629,7 @@ "h": 8, "w": 12, "x": 12, - "y": 73 + "y": 41 }, "height": "", "hiddenSeries": false, @@ -18747,7 +18742,7 @@ "h": 8, "w": 12, "x": 0, - "y": 81 + "y": 49 }, "hiddenSeries": false, "id": 176, @@ -18855,7 +18850,7 @@ "h": 8, "w": 12, "x": 12, - "y": 81 + 
"y": 49 }, "hiddenSeries": false, "id": 175, @@ -18982,7 +18977,7 @@ "h": 8, "w": 12, "x": 0, - "y": 89 + "y": 57 }, "hiddenSeries": false, "id": 189, @@ -19084,7 +19079,7 @@ "h": 8, "w": 12, "x": 12, - "y": 89 + "y": 57 }, "hiddenSeries": false, "id": 191, @@ -19189,7 +19184,7 @@ "h": 8, "w": 8, "x": 0, - "y": 97 + "y": 65 }, "hiddenSeries": false, "id": 365, @@ -19295,7 +19290,7 @@ "h": 8, "w": 9, "x": 8, - "y": 97 + "y": 65 }, "hiddenSeries": false, "id": 193, @@ -19422,7 +19417,7 @@ "h": 8, "w": 7, "x": 17, - "y": 97 + "y": 65 }, "hiddenSeries": false, "id": 195, @@ -19533,7 +19528,7 @@ "h": 8, "w": 12, "x": 0, - "y": 105 + "y": 73 }, "hiddenSeries": false, "id": 363, @@ -19639,7 +19634,7 @@ "h": 8, "w": 12, "x": 12, - "y": 105 + "y": 73 }, "hiddenSeries": false, "id": 364, @@ -19742,7 +19737,7 @@ "h": 8, "w": 12, "x": 0, - "y": 113 + "y": 81 }, "hiddenSeries": false, "id": 251, @@ -19849,7 +19844,7 @@ "h": 8, "w": 12, "x": 12, - "y": 113 + "y": 81 }, "hiddenSeries": false, "id": 252, @@ -19956,7 +19951,7 @@ "h": 8, "w": 12, "x": 0, - "y": 121 + "y": 89 }, "hiddenSeries": false, "id": 254, @@ -20063,7 +20058,7 @@ "h": 8, "w": 12, "x": 12, - "y": 121 + "y": 89 }, "hiddenSeries": false, "id": 253, @@ -20184,7 +20179,7 @@ "h": 8, "w": 24, "x": 0, - "y": 18 + "y": 34 }, "hiddenSeries": false, "id": 173, @@ -20285,7 +20280,7 @@ "h": 8, "w": 12, "x": 0, - "y": 26 + "y": 42 }, "hiddenSeries": false, "id": 185, @@ -20415,7 +20410,7 @@ "h": 8, "w": 12, "x": 12, - "y": 26 + "y": 42 }, "hiddenSeries": false, "id": 186, @@ -20519,7 +20514,7 @@ "h": 8, "w": 12, "x": 0, - "y": 34 + "y": 50 }, "hiddenSeries": false, "id": 373, @@ -20564,7 +20559,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tiflash_storage_remote_cache_bg_download_stage_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (le, stage, file_type, $additional_groupby))", + "expr": 
"histogram_quantile(0.99, sum(rate(tiflash_storage_remote_cache_bg_download_stage_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\"}[1m])) by (le, stage, file_type, $additional_groupby))", "hide": false, "interval": "", "legendFormat": "99%-{{stage}}-{{file_type}} {{$additional_groupby}}", @@ -20637,7 +20632,7 @@ "h": 8, "w": 12, "x": 12, - "y": 34 + "y": 50 }, "hiddenSeries": false, "id": 375, @@ -20748,7 +20743,7 @@ "h": 8, "w": 12, "x": 0, - "y": 42 + "y": 58 }, "hiddenSeries": false, "id": 371, @@ -20795,7 +20790,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "{{result}}-{{file_type}} {{additional_groupby}}", + "legendFormat": "{{result}}-{{file_type}} {{$additional_groupby}}", "refId": "A" } ], @@ -20858,7 +20853,7 @@ "h": 8, "w": 12, "x": 12, - "y": 42 + "y": 58 }, "hiddenSeries": false, "id": 372, @@ -20963,7 +20958,7 @@ "h": 8, "w": 12, "x": 0, - "y": 50 + "y": 66 }, "hiddenSeries": false, "id": 370, @@ -21069,7 +21064,7 @@ "h": 8, "w": 12, "x": 12, - "y": 50 + "y": 66 }, "hiddenSeries": false, "id": 374, @@ -21179,7 +21174,7 @@ "h": 8, "w": 12, "x": 0, - "y": 58 + "y": 74 }, "hiddenSeries": false, "id": 188, @@ -21307,7 +21302,7 @@ "h": 8, "w": 12, "x": 12, - "y": 58 + "y": 74 }, "hiddenSeries": false, "id": 233, @@ -21432,7 +21427,7 @@ "h": 7, "w": 12, "x": 0, - "y": 66 + "y": 82 }, "hiddenSeries": false, "id": 236, @@ -21548,7 +21543,7 @@ "h": 7, "w": 12, "x": 12, - "y": 66 + "y": 82 }, "hiddenSeries": false, "id": 356, @@ -21668,7 +21663,7 @@ "h": 7, "w": 12, "x": 0, - "y": 73 + "y": 89 }, "hiddenSeries": false, "id": 353, @@ -21778,7 +21773,7 @@ "h": 7, "w": 12, "x": 12, - "y": 73 + "y": 89 }, "hiddenSeries": false, "id": 358, From 3e9bd94eb92c40e4bb26f60c30fdbbb2854cd1d9 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Mon, 6 Apr 2026 00:22:01 +0800 Subject: [PATCH 30/36] Update grafana panel Signed-off-by: JaySon-Huang --- 
metrics/grafana/tiflash_summary.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/metrics/grafana/tiflash_summary.json b/metrics/grafana/tiflash_summary.json index 2b9804cc295..c89d56ac423 100644 --- a/metrics/grafana/tiflash_summary.json +++ b/metrics/grafana/tiflash_summary.json @@ -9486,7 +9486,7 @@ "hide": true, "interval": "", "intervalFactor": 2, - "legendFormat": "max-{{type}} {{additional_groupby}}", + "legendFormat": "max-{{type}} {{$additional_groupby}}", "refId": "A" }, { @@ -9494,7 +9494,7 @@ "expr": "histogram_quantile(0.9999, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\", type!~\"(delta_merge|seg_merge|seg_split).*\"}[$__rate_interval])) by (le, type, $additional_groupby))", "hide": false, "interval": "", - "legendFormat": "9999-{{type}} {{additional_groupby}}", + "legendFormat": "9999-{{type}} {{$additional_groupby}}", "refId": "B" }, { @@ -9502,7 +9502,7 @@ "expr": "histogram_quantile(0.99, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\", type!~\"(delta_merge|seg_merge|seg_split).*\"}[$__rate_interval])) by (le, type, $additional_groupby))", "hide": false, "interval": "", - "legendFormat": "99-{{type}} {{additional_groupby}}", + "legendFormat": "99-{{type}} {{$additional_groupby}}", "refId": "C" } ], @@ -9709,7 +9709,7 @@ "hide": true, "interval": "", "intervalFactor": 2, - "legendFormat": "max-{{type}} {{additional_groupby}}", + "legendFormat": "max-{{type}} {{$additional_groupby}}", "refId": "A" }, { @@ -9717,7 +9717,7 @@ "expr": "histogram_quantile(0.9999, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\", 
type=~\"(delta_merge|seg_merge|seg_split).*\"}[$__rate_interval])) by (le, type, $additional_groupby))", "hide": false, "interval": "", - "legendFormat": "9999-{{type}} {{additional_groupby}}", + "legendFormat": "9999-{{type}} {{$additional_groupby}}", "refId": "B" }, { @@ -9725,7 +9725,7 @@ "expr": "histogram_quantile(0.99, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", instance=~\"$tiflash_role\", type=~\"(delta_merge|seg_merge|seg_split).*\"}[$__rate_interval])) by (le, type, $additional_groupby))", "hide": false, "interval": "", - "legendFormat": "99-{{type}} {{additional_groupby}}", + "legendFormat": "99-{{type}} {{$additional_groupby}}", "refId": "C" } ], @@ -21111,7 +21111,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "{{reason}}-{{file_type}} {{additional_groupby}}", + "legendFormat": "{{reason}}-{{file_type}} {{$additional_groupby}}", "refId": "A" } ], From 1141c0913dfdda31dac13ebba6a8dca1774382f7 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Mon, 6 Apr 2026 00:38:35 +0800 Subject: [PATCH 31/36] disagg: refine filecache failure cleanup metrics --- dbms/src/Storages/S3/FileCache.cpp | 37 ++++++++++++++----- dbms/src/Storages/S3/FileCache.h | 3 +- .../src/Storages/S3/tests/gtest_filecache.cpp | 7 ++++ 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 469f8c4d096..12251fab335 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -779,7 +779,8 @@ std::pair::iterator> FileCache::removeImpl( LRUFileTable & table, const String & s3_key, FileSegmentPtr & f, - bool force) + bool force, + bool count_as_evict) { // Except current thread and the FileTable, // there are other threads hold this FileSegment object. 
@@ -794,8 +795,11 @@ std::pair::iterator> FileCache::removeImpl( removeDiskFile(temp_fname, /*update_fsize_metrics*/ false); auto release_size = f->getSize(); - GET_METRIC(tiflash_storage_remote_cache, type_dtfile_evict).Increment(); - GET_METRIC(tiflash_storage_remote_cache_bytes, type_dtfile_evict_bytes).Increment(release_size); + if (count_as_evict) + { + GET_METRIC(tiflash_storage_remote_cache, type_dtfile_evict).Increment(); + GET_METRIC(tiflash_storage_remote_cache_bytes, type_dtfile_evict_bytes).Increment(release_size); + } releaseSpaceImpl(release_size); return {release_size, table.remove(s3_key)}; } @@ -1271,9 +1275,11 @@ void downloadToLocal( { ReadBufferFromIStream rbuf(istr, buffer_size); WriteBufferFromWritableFile wbuf(ofile, buffer_size); + SCOPE_EXIT({ + if (s3_read_metrics_recorder != nullptr) + s3_read_metrics_recorder->recordBytes(rbuf.count(), S3::S3ReadSource::FileCacheDownload); + }); copyData(rbuf, wbuf, content_length); - if (s3_read_metrics_recorder != nullptr) - s3_read_metrics_recorder->recordBytes(rbuf.count(), S3::S3ReadSource::FileCacheDownload); wbuf.sync(); return; } @@ -1288,9 +1294,11 @@ void downloadToLocal( // S3 budget before each refill from the remote body stream. ReadBufferFromIStreamWithLimiter rbuf(istr, buffer_size, s3_read_limiter, S3::S3ReadSource::FileCacheDownload); WriteBufferFromWritableFile wbuf(ofile, buffer_size); + SCOPE_EXIT({ + if (s3_read_metrics_recorder != nullptr) + s3_read_metrics_recorder->recordBytes(rbuf.count(), S3::S3ReadSource::FileCacheDownload); + }); copyData(rbuf, wbuf, content_length); - if (s3_read_metrics_recorder != nullptr) - s3_read_metrics_recorder->recordBytes(rbuf.count(), S3::S3ReadSource::FileCacheDownload); wbuf.sync(); } @@ -1399,7 +1407,13 @@ void FileCache::bgDownloadExecutor( file_seg.reset(); // Followers may still hold the failed segment while waking up from bounded wait. 
Force removal so // the failed placeholder does not stay published in the cache table and block later retries. - remove(s3_key, /*force*/ true); + // This is failed-download cleanup rather than cache eviction, so do not count eviction metrics. + auto file_type = getFileType(s3_key); + auto & table = tables[static_cast(file_type)]; + std::unique_lock lock(mtx); + auto f = table.get(s3_key, /*update_lru*/ false); + if (f != nullptr) + std::ignore = removeImpl(table, s3_key, f, /*force*/ true, /*count_as_evict*/ false); } else { @@ -1452,7 +1466,12 @@ void FileCache::fgDownload(const String & s3_key, FileSegmentPtr & file_seg) file_seg->setStatus(FileSegment::Status::Failed); GET_METRIC(tiflash_storage_remote_cache, type_dtfile_download_failed).Increment(); file_seg.reset(); - remove(s3_key, /*force*/ true); + auto file_type = getFileType(s3_key); + auto & table = tables[static_cast(file_type)]; + std::unique_lock lock(mtx); + auto f = table.get(s3_key, /*update_lru*/ false); + if (f != nullptr) + std::ignore = removeImpl(table, s3_key, f, /*force*/ true, /*count_as_evict*/ false); } LOG_DEBUG(log, "foreground downloading => s3_key {} finished", s3_key); diff --git a/dbms/src/Storages/S3/FileCache.h b/dbms/src/Storages/S3/FileCache.h index 5d9c92c4bd5..df6c41225ab 100644 --- a/dbms/src/Storages/S3/FileCache.h +++ b/dbms/src/Storages/S3/FileCache.h @@ -391,7 +391,8 @@ class FileCache LRUFileTable & table, const String & s3_key, FileSegmentPtr & f, - bool force = false); + bool force = false, + bool count_as_evict = true); void removeDiskFile(const String & local_fname, bool update_fsize_metrics) const; // Estimated size is an empirical value. 
diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index 12a0856ed3f..2ca7036de36 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -1301,6 +1301,13 @@ TEST_F(FileCacheTest, BgDownloadWorksWithSharedS3ReadLimiter) sp_download.next(); sp_download.disable(); waitForBgDownload(file_cache); + + auto file_seg1 = file_cache.get(S3FilenameView::fromKey(objects[0].key), objects[0].size); + auto file_seg2 = file_cache.get(S3FilenameView::fromKey(objects[1].key), objects[1].size); + ASSERT_NE(file_seg1, nullptr); + ASSERT_NE(file_seg2, nullptr); + ASSERT_TRUE(file_seg1->isReadyToRead()); + ASSERT_TRUE(file_seg2->isReadyToRead()); } TEST_F(FileCacheTest, BgDownloadUsesLimiterSuggestedChunkSize) From 8e1e3564894329dde8a22cd6ed71cad5a68ef716 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Mon, 6 Apr 2026 01:19:16 +0800 Subject: [PATCH 32/36] disagg: restore S3 fast paths and scope bg download failure --- dbms/src/Storages/S3/FileCache.cpp | 5 +++-- dbms/src/Storages/S3/S3RandomAccessFile.cpp | 2 +- dbms/src/Storages/S3/tests/gtest_filecache.cpp | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 12251fab335..44e0ebbe4a1 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -1271,7 +1271,7 @@ void downloadToLocal( GET_METRIC(tiflash_storage_remote_cache_bytes, type_dtfile_download_bytes).Increment(content_length); constexpr Int64 max_buffer_size = 128 * 1024; // 128 KiB auto buffer_size = std::min(content_length, max_buffer_size); - if (s3_read_limiter == nullptr) + if (s3_read_limiter == nullptr || s3_read_limiter->maxReadBytesPerSec() == 0) { ReadBufferFromIStream rbuf(istr, buffer_size); WriteBufferFromWritableFile wbuf(ofile, buffer_size); @@ -1343,7 +1343,6 @@ void FileCache::downloadImpl(const String & s3_key, 
FileSegmentPtr & file_seg, c prepareParentDir(local_fname); auto temp_fname = toTemporaryFilename(local_fname); SYNC_FOR("before_FileCache::downloadImpl_download_to_local"); - FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::file_cache_bg_download_fail); downloadToLocal( result.GetBody(), temp_fname, @@ -1390,6 +1389,8 @@ void FileCache::bgDownloadExecutor( Stopwatch download_watch; try { + SYNC_FOR("before_FileCache::bgDownloadExecutor_fail_point"); + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::file_cache_bg_download_fail); GET_METRIC(tiflash_storage_remote_cache, type_dtfile_download).Increment(); downloadImpl(s3_key, file_seg, write_limiter); } diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.cpp b/dbms/src/Storages/S3/S3RandomAccessFile.cpp index 30930ab8811..799e4d50bc5 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.cpp +++ b/dbms/src/Storages/S3/S3RandomAccessFile.cpp @@ -130,7 +130,7 @@ ssize_t S3RandomAccessFile::read(char * buf, size_t size) ssize_t S3RandomAccessFile::readImpl(char * buf, size_t size) { - if (read_limiter != nullptr) + if (read_limiter != nullptr && read_limiter->maxReadBytesPerSec() > 0) // Charge the shared node-level budget in small chunks instead of allowing a single large `read()` to burst. 
return readChunked(buf, size); diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index 2ca7036de36..ad8a6c36750 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -1226,13 +1226,13 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingReturnsMissWhenDownloaderFails) file_cache.updateConfig(settings); auto objects = genObjects(/*store_count*/ 1, /*table_count*/ 1, /*file_count*/ 1, {"1.merged"}); - auto sp_download = SyncPointCtl::enableInScope("before_FileCache::downloadImpl_download_to_local"); + auto sp_fail = SyncPointCtl::enableInScope("before_FileCache::bgDownloadExecutor_fail_point"); auto sp_wait = SyncPointCtl::enableInScope("before_FileSegment::waitForNotEmpty_wait"); auto key = S3FilenameView::fromKey(objects[0].key); // First caller creates the `Empty` placeholder and starts the background download. ASSERT_EQ(file_cache.get(key, objects[0].size), nullptr); - sp_download.waitAndPause(); + sp_fail.waitAndPause(); // The follower reaches `get()` while the same key is still being downloaded. Inject a failure right before // the downloader starts copying the body so the follower wakes up with `Status::Failed` and returns miss. 
@@ -1241,11 +1241,11 @@ TEST_F(FileCacheTest, GetWaitOnDownloadingReturnsMissWhenDownloaderFails) SCOPE_EXIT({ FailPointHelper::disableFailPoint(FailPoints::file_cache_bg_download_fail); }); auto wait_failed = std::async(std::launch::async, [&]() { return file_cache.get(key, objects[0].size); }); sp_wait.waitAndPause(); - sp_download.next(); + sp_fail.next(); sp_wait.next(); ASSERT_EQ(wait_failed.get(), nullptr); } - sp_download.disable(); + sp_fail.disable(); waitForBgDownload(file_cache); ASSERT_EQ(file_cache.bg_download_fail_count.load(std::memory_order_relaxed), 1); From 56b7cafa59e03de1e873e505efe9ca554757c494 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Mon, 6 Apr 2026 01:39:18 +0800 Subject: [PATCH 33/36] disagg: restore disabled-limiter seek fast path --- dbms/src/Common/FailPoint.cpp | 1 + dbms/src/Storages/S3/S3RandomAccessFile.cpp | 4 ++- dbms/src/Storages/S3/tests/gtest_s3file.cpp | 32 +++++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/dbms/src/Common/FailPoint.cpp b/dbms/src/Common/FailPoint.cpp index 4146829b0d7..bc88af33b7b 100644 --- a/dbms/src/Common/FailPoint.cpp +++ b/dbms/src/Common/FailPoint.cpp @@ -136,6 +136,7 @@ namespace DB M(force_join_v2_probe_disable_lm) \ M(force_s3_random_access_file_init_fail) \ M(force_s3_random_access_file_read_fail) \ + M(force_s3_random_access_file_seek_chunked) \ M(force_release_snap_meet_null_storage) #define APPLY_FOR_PAUSEABLE_FAILPOINTS_ONCE(M) \ diff --git a/dbms/src/Storages/S3/S3RandomAccessFile.cpp b/dbms/src/Storages/S3/S3RandomAccessFile.cpp index 799e4d50bc5..7cf48dc1537 100644 --- a/dbms/src/Storages/S3/S3RandomAccessFile.cpp +++ b/dbms/src/Storages/S3/S3RandomAccessFile.cpp @@ -54,6 +54,7 @@ namespace DB::FailPoints { extern const char force_s3_random_access_file_init_fail[]; extern const char force_s3_random_access_file_read_fail[]; +extern const char force_s3_random_access_file_seek_chunked[]; } // namespace DB::FailPoints namespace DB::S3 @@ -272,7 +273,7 @@ 
off_t S3RandomAccessFile::seekImpl(off_t offset_, int whence) return cur_offset; } - if (read_limiter != nullptr) + if (read_limiter != nullptr && read_limiter->maxReadBytesPerSec() > 0) return seekChunked(offset_); // Forward seek @@ -288,6 +289,7 @@ off_t S3RandomAccessFile::seekChunked(off_t offset) { Stopwatch sw; ProfileEvents::increment(ProfileEvents::S3IOSeek, 1); + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::force_s3_random_access_file_seek_chunked); auto & istr = read_result.GetBody(); // Use the same chunk heuristic as readChunked() so forward seeks do not turn into one oversized // limiter request when skipping a large remote range. diff --git a/dbms/src/Storages/S3/tests/gtest_s3file.cpp b/dbms/src/Storages/S3/tests/gtest_s3file.cpp index 6ba148d0a99..a1c9849efdb 100644 --- a/dbms/src/Storages/S3/tests/gtest_s3file.cpp +++ b/dbms/src/Storages/S3/tests/gtest_s3file.cpp @@ -68,6 +68,7 @@ extern const char force_set_mocked_s3_object_mtime[]; extern const char force_syncpoint_on_s3_upload[]; extern const char force_s3_random_access_file_init_fail[]; extern const char force_s3_random_access_file_read_fail[]; +extern const char force_s3_random_access_file_seek_chunked[]; } // namespace DB::FailPoints namespace DB::tests @@ -279,6 +280,37 @@ try } CATCH +TEST_P(S3FileTest, SeekSkipsChunkedPathWhenLimiterDisabled) +try +{ + const auto size = 1024 * 1024 * 10; // 10MB + const String key = "/a/b/c/seek_disabled_limiter"; + writeFile(key, size, WriteSettings{}); + + auto prev_limiter = s3_client->getS3ReadLimiter(); + auto disabled_limiter = std::make_shared(0, 1); + s3_client->setS3ReadLimiter(disabled_limiter); + SCOPE_EXIT({ s3_client->setS3ReadLimiter(prev_limiter); }); + + S3RandomAccessFile file(s3_client, key, nullptr); + std::vector tmp_buf(256); + ASSERT_EQ(file.read(tmp_buf.data(), tmp_buf.size()), tmp_buf.size()); + + FailPointHelper::enableFailPoint(FailPoints::force_s3_random_access_file_seek_chunked); + SCOPE_EXIT({ 
FailPointHelper::disableFailPoint(FailPoints::force_s3_random_access_file_seek_chunked); }); + + constexpr off_t target_offset = 1024 * 1024; + off_t seek_offset = -1; + ASSERT_NO_THROW(seek_offset = file.seek(target_offset, SEEK_SET)); + ASSERT_EQ(seek_offset, target_offset); + ASSERT_EQ(file.read(tmp_buf.data(), tmp_buf.size()), tmp_buf.size()); + + std::vector expected(256); + std::iota(expected.begin(), expected.end(), 0); + ASSERT_EQ(tmp_buf, expected); +} +CATCH + TEST_P(S3FileTest, ReadAfterDel1) try { From b767295a23c4f82320122f03a425d30cff70ae23 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Mon, 6 Apr 2026 02:04:13 +0800 Subject: [PATCH 34/36] disagg: clean up failed file cache scheduling --- dbms/src/Common/FailPoint.cpp | 1 + dbms/src/Storages/S3/FileCache.cpp | 63 ++++++++++++------- dbms/src/Storages/S3/FileCache.h | 2 + .../src/Storages/S3/tests/gtest_filecache.cpp | 32 ++++++++++ 4 files changed, 76 insertions(+), 22 deletions(-) diff --git a/dbms/src/Common/FailPoint.cpp b/dbms/src/Common/FailPoint.cpp index bc88af33b7b..f20aac96b35 100644 --- a/dbms/src/Common/FailPoint.cpp +++ b/dbms/src/Common/FailPoint.cpp @@ -116,6 +116,7 @@ namespace DB M(cop_send_failure) \ M(file_cache_fg_download_fail) \ M(file_cache_bg_download_fail) \ + M(file_cache_bg_download_schedule_fail) \ M(force_set_parallel_prehandle_threshold) \ M(force_raise_prehandle_exception) \ M(force_agg_on_partial_block) \ diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index 44e0ebbe4a1..c197f7d4bb7 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -77,6 +77,7 @@ namespace DB::FailPoints { extern const char file_cache_fg_download_fail[]; extern const char file_cache_bg_download_fail[]; +extern const char file_cache_bg_download_schedule_fail[]; } // namespace DB::FailPoints namespace DB @@ -1402,31 +1403,15 @@ void FileCache::bgDownloadExecutor( observeBgDownloadStageMetrics(file_seg->getFileType(), 
BgDownloadStage::Download, download_watch.elapsedSeconds()); if (!file_seg->isReadyToRead()) { - file_seg->setStatus(FileSegment::Status::Failed); GET_METRIC(tiflash_storage_remote_cache, type_dtfile_download_failed).Increment(); bg_download_fail_count.fetch_add(1, std::memory_order_relaxed); - file_seg.reset(); - // Followers may still hold the failed segment while waking up from bounded wait. Force removal so - // the failed placeholder does not stay published in the cache table and block later retries. - // This is failed-download cleanup rather than cache eviction, so do not count eviction metrics. - auto file_type = getFileType(s3_key); - auto & table = tables[static_cast(file_type)]; - std::unique_lock lock(mtx); - auto f = table.get(s3_key, /*update_lru*/ false); - if (f != nullptr) - std::ignore = removeImpl(table, s3_key, f, /*force*/ true, /*count_as_evict*/ false); + cleanupFailedDownload(s3_key, file_seg); } else { bg_download_succ_count.fetch_add(1, std::memory_order_relaxed); } - bg_downloading_count.fetch_sub(1, std::memory_order_relaxed); - updateBgDownloadStatusMetrics(bg_downloading_count.load(std::memory_order_relaxed)); - LOG_DEBUG( - log, - "downloading count {} => s3_key {} finished", - bg_downloading_count.load(std::memory_order_relaxed), - s3_key); + finishBgDownload(s3_key); } void FileCache::bgDownload(const String & s3_key, FileSegmentPtr & file_seg) @@ -1440,10 +1425,44 @@ void FileCache::bgDownload(const String & s3_key, FileSegmentPtr & file_seg) s3_key); auto write_limiter = rate_limiter.getBgWriteLimiter(); auto enqueue_time = std::chrono::steady_clock::now(); - S3FileCachePool::get().scheduleOrThrowOnError( - [this, s3_key = s3_key, file_seg = file_seg, limiter = std::move(write_limiter), enqueue_time]() mutable { - bgDownloadExecutor(s3_key, file_seg, limiter, enqueue_time); - }); + try + { + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::file_cache_bg_download_schedule_fail); + S3FileCachePool::get().scheduleOrThrowOnError( + [this, 
s3_key = s3_key, file_seg = file_seg, limiter = std::move(write_limiter), enqueue_time]() mutable { + bgDownloadExecutor(s3_key, file_seg, limiter, enqueue_time); + }); + } + catch (...) + { + tryLogCurrentWarningException(log, fmt::format("Schedule background download s3_key={} failed", s3_key)); + GET_METRIC(tiflash_storage_remote_cache, type_dtfile_download_failed).Increment(); + bg_download_fail_count.fetch_add(1, std::memory_order_relaxed); + cleanupFailedDownload(s3_key, file_seg); + finishBgDownload(s3_key); + } +} + +void FileCache::finishBgDownload(const String & s3_key) +{ + const auto count_after_finish = bg_downloading_count.fetch_sub(1, std::memory_order_relaxed) - 1; + updateBgDownloadStatusMetrics(count_after_finish); + LOG_DEBUG(log, "downloading count {} => s3_key {} finished", count_after_finish, s3_key); +} + +void FileCache::cleanupFailedDownload(const String & s3_key, FileSegmentPtr & file_seg) +{ + file_seg->setStatus(FileSegment::Status::Failed); + file_seg.reset(); + // Followers may still hold the failed segment while waking up from bounded wait. Force removal so + // the failed placeholder does not stay published in the cache table and block later retries. + // This is failed-download cleanup rather than cache eviction, so do not count eviction metrics. 
+ auto file_type = getFileType(s3_key); + auto & table = tables[static_cast(file_type)]; + std::unique_lock lock(mtx); + auto f = table.get(s3_key, /*update_lru*/ false); + if (f != nullptr) + std::ignore = removeImpl(table, s3_key, f, /*force*/ true, /*count_as_evict*/ false); } void FileCache::fgDownload(const String & s3_key, FileSegmentPtr & file_seg) diff --git a/dbms/src/Storages/S3/FileCache.h b/dbms/src/Storages/S3/FileCache.h index df6c41225ab..f813af2d9d5 100644 --- a/dbms/src/Storages/S3/FileCache.h +++ b/dbms/src/Storages/S3/FileCache.h @@ -371,6 +371,8 @@ class FileCache FileSegmentPtr & file_seg, const WriteLimiterPtr & write_limiter, std::chrono::steady_clock::time_point enqueue_time); + void finishBgDownload(const String & s3_key); + void cleanupFailedDownload(const String & s3_key, FileSegmentPtr & file_seg); void downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, const WriteLimiterPtr & write_limiter); static String toTemporaryFilename(const String & fname); diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index ad8a6c36750..e48973576c8 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -58,6 +58,7 @@ extern const int FILE_DOESNT_EXIST; namespace DB::FailPoints { extern const char file_cache_bg_download_fail[]; +extern const char file_cache_bg_download_schedule_fail[]; } namespace DB::tests::S3 @@ -1506,4 +1507,35 @@ TEST_F(FileCacheTest, FailedDownloadReleasesFinalizedReservedSize) assert_failed_download_releases_space(objects[0].size - 1024); } +TEST_F(FileCacheTest, ScheduleBgDownloadFailureCleansUpPlaceholder) +{ + auto cache_dir = fmt::format("{}/schedule_bg_download_failure", tmp_dir); + StorageRemoteCacheConfig cache_config{.dir = cache_dir, .capacity = cache_capacity, .dtfile_level = 100}; + + UInt16 vcores = 2; + IORateLimiter rate_limiter; + FileCache file_cache(capacity_metrics, cache_config, vcores, 
rate_limiter); + Settings settings; + settings.dt_filecache_wait_on_downloading_ms = 200; + file_cache.updateConfig(settings); + + auto objects = genObjects(/*store_count*/ 1, /*table_count*/ 1, /*file_count*/ 1, {"1.merged"}); + auto key = S3FilenameView::fromKey(objects[0].key); + auto file_type = FileCache::getFileType(objects[0].key); + + FailPointHelper::enableFailPoint(FailPoints::file_cache_bg_download_schedule_fail); + SCOPE_EXIT({ FailPointHelper::disableFailPoint(FailPoints::file_cache_bg_download_schedule_fail); }); + + ASSERT_EQ(file_cache.get(key, objects[0].size), nullptr); + ASSERT_EQ(file_cache.bg_downloading_count.load(std::memory_order_relaxed), 0); + ASSERT_EQ(file_cache.bg_download_fail_count.load(std::memory_order_relaxed), 1); + ASSERT_EQ(file_cache.tables[static_cast(file_type)].get(objects[0].key, /*update_lru*/ false), nullptr); + + FailPointHelper::disableFailPoint(FailPoints::file_cache_bg_download_schedule_fail); + ASSERT_EQ(file_cache.get(key, objects[0].size), nullptr); + waitForBgDownload(file_cache); + ASSERT_EQ(file_cache.bg_download_succ_count.load(std::memory_order_relaxed), 1); + ASSERT_NE(file_cache.tables[static_cast(file_type)].get(objects[0].key, /*update_lru*/ false), nullptr); +} + } // namespace DB::tests::S3 From f7d36f731894e2e1508a8762d9078789a334ae99 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Mon, 6 Apr 2026 03:00:35 +0800 Subject: [PATCH 35/36] disagg: clarify S3 limiter semantics and harden tests --- dbms/src/IO/BaseFile/RateLimiter.cpp | 7 +++++++ dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp | 11 ++++++++--- dbms/src/Storages/S3/S3ReadLimiter.h | 9 ++++----- dbms/src/Storages/S3/tests/gtest_filecache.cpp | 7 +++++-- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/dbms/src/IO/BaseFile/RateLimiter.cpp b/dbms/src/IO/BaseFile/RateLimiter.cpp index ab6b2c9af87..dab45dbc62e 100644 --- a/dbms/src/IO/BaseFile/RateLimiter.cpp +++ b/dbms/src/IO/BaseFile/RateLimiter.cpp @@ -527,6 +527,13 @@ void 
IORateLimiter::updateLimiterByConfig(const IORateLimitConfig & cfg) updateWriteLimiter(cfg.getBgWriteMaxBytesPerSec(), cfg.getFgWriteMaxBytesPerSec()); // updateS3ReadLimiter + // Keep an existing S3 limiter object alive across reloads so readers that already snapped the + // shared_ptr can observe `nonzero -> 0` disable updates via `updateConfig(0)` instead of being + // stuck with a stale throttling state. Today we intentionally accept a narrower semantic on the + // first `0 -> nonzero` transition: if startup published no limiter object, only newly created + // readers will see the limiter after it is first created here. + // TODO: Consider publishing a no-op S3ReadLimiter even when the configured rate is 0, so a later + // `0 -> nonzero` reload can also reach readers that previously snapped a nullptr. if (s3_read_limiter == nullptr) { if (cfg.s3_max_read_bytes_per_sec != 0) diff --git a/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp b/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp index c3dee082cb9..9418fa4703d 100644 --- a/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp +++ b/dbms/src/IO/BaseFile/tests/gtest_rate_limiter.cpp @@ -427,9 +427,14 @@ TEST(S3ReadLimiterTest, LargeRequestDoesNotWaitForever) // The initial burst is only 100 bytes, but callers that request a larger chunk should still make // forward progress instead of waiting forever for a budget that can never accumulate. 
- AtomicStopwatch watch; - limiter.requestBytes(128 * 1024, S3::S3ReadSource::DirectRead); - ASSERT_LT(watch.elapsedMilliseconds(), 200); + auto future = std::async(std::launch::async, [&] { + AtomicStopwatch watch; + limiter.requestBytes(128 * 1024, S3::S3ReadSource::DirectRead); + return watch.elapsedMilliseconds(); + }); + + ASSERT_EQ(future.wait_for(1s), std::future_status::ready); + ASSERT_LT(future.get(), 200); } #ifdef __linux__ diff --git a/dbms/src/Storages/S3/S3ReadLimiter.h b/dbms/src/Storages/S3/S3ReadLimiter.h index b2e90dd88f9..55749cd7ecd 100644 --- a/dbms/src/Storages/S3/S3ReadLimiter.h +++ b/dbms/src/Storages/S3/S3ReadLimiter.h @@ -53,13 +53,12 @@ class S3ReadLimiter /// A lightweight node-level limiter for S3 remote reads. /// - /// It currently limits one dimension: + /// It currently enforces byte-rate limiting only: /// - total remote-read bytes consumed by direct reads and FileCache downloads /// - /// The stream dimension is best-effort protection against too many live response bodies, not a - /// replacement for byte throttling and not a safe cap on reader object count. In TiFlash a - /// `S3RandomAccessFile` may keep its body stream open across scheduling gaps, so a low stream - /// limit can block forward progress even when the node is no longer transferring many bytes. + /// Concurrent/open-stream limiting is not provided here. TiFlash readers may keep response + /// bodies open across scheduling gaps, so treating open streams as a hard cap can block forward + /// progress even when the node is no longer transferring many bytes. 
explicit S3ReadLimiter(UInt64 max_read_bytes_per_sec_ = 0, UInt64 refill_period_ms_ = 100); ~S3ReadLimiter(); diff --git a/dbms/src/Storages/S3/tests/gtest_filecache.cpp b/dbms/src/Storages/S3/tests/gtest_filecache.cpp index e48973576c8..acc8194278b 100644 --- a/dbms/src/Storages/S3/tests/gtest_filecache.cpp +++ b/dbms/src/Storages/S3/tests/gtest_filecache.cpp @@ -59,7 +59,7 @@ namespace DB::FailPoints { extern const char file_cache_bg_download_fail[]; extern const char file_cache_bg_download_schedule_fail[]; -} +} // namespace DB::FailPoints namespace DB::tests::S3 { @@ -1332,10 +1332,13 @@ TEST_F(FileCacheTest, BgDownloadUsesLimiterSuggestedChunkSize) s3_client->setS3ReadLimiter(limiter); SCOPE_EXIT({ s3_client->setS3ReadLimiter(nullptr); }); + constexpr Int64 expected_delay_ms = 200; + constexpr Int64 delay_tolerance_ms = 40; + AtomicStopwatch watch; ASSERT_EQ(file_cache.get(S3FilenameView::fromKey(object_key), object_size), nullptr); waitForBgDownload(file_cache); - ASSERT_GE(watch.elapsedMilliseconds(), 200); + ASSERT_GE(watch.elapsedMilliseconds(), expected_delay_ms - delay_tolerance_ms); auto file_seg = file_cache.get(S3FilenameView::fromKey(object_key), object_size); ASSERT_NE(file_seg, nullptr); From 59bc78c09af64236173010beecf1049ec1d26208 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Mon, 6 Apr 2026 19:23:20 +0800 Subject: [PATCH 36/36] disagg: avoid pool access during file cache shutdown --- dbms/src/Storages/S3/FileCache.cpp | 31 ++++++++++++++++++------------ dbms/src/Storages/S3/FileCache.h | 5 +++-- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/dbms/src/Storages/S3/FileCache.cpp b/dbms/src/Storages/S3/FileCache.cpp index c197f7d4bb7..a29740b9711 100644 --- a/dbms/src/Storages/S3/FileCache.cpp +++ b/dbms/src/Storages/S3/FileCache.cpp @@ -244,10 +244,9 @@ void observeRemoteCacheRejectMetrics(FileType file_type) .Increment(); } -void updateBgDownloadStatusMetrics(Int64 bg_downloading_count) +void 
updateBgDownloadStatusMetrics(Int64 bg_downloading_count, Int64 running_limit) { GET_METRIC(tiflash_storage_remote_cache_status, type_bg_downloading_count).Set(bg_downloading_count); - const auto running_limit = static_cast(S3FileCachePool::get().getMaxThreads()); GET_METRIC(tiflash_storage_remote_cache_status, type_bg_download_queue_count) .Set(std::max(0, bg_downloading_count - running_limit)); } @@ -469,7 +468,7 @@ FileCache::FileCache( , log(Logger::get("FileCache")) { CurrentMetrics::set(CurrentMetrics::DTFileCacheCapacity, cache_capacity); - updateBgDownloadStatusMetrics(0); + updateBgDownloadStatusMetrics(0, /*running_limit*/ 0); prepareDir(cache_dir); restore(); } @@ -1380,7 +1379,8 @@ void FileCache::bgDownloadExecutor( const String & s3_key, FileSegmentPtr & file_seg, const WriteLimiterPtr & write_limiter, - std::chrono::steady_clock::time_point enqueue_time) + std::chrono::steady_clock::time_point enqueue_time, + Int64 running_limit) { observeBgDownloadStageMetrics( file_seg->getFileType(), @@ -1411,13 +1411,17 @@ void FileCache::bgDownloadExecutor( { bg_download_succ_count.fetch_add(1, std::memory_order_relaxed); } - finishBgDownload(s3_key); + finishBgDownload(s3_key, running_limit); } void FileCache::bgDownload(const String & s3_key, FileSegmentPtr & file_seg) { bg_downloading_count.fetch_add(1, std::memory_order_relaxed); - updateBgDownloadStatusMetrics(bg_downloading_count.load(std::memory_order_relaxed)); + // Capture the pool concurrency limit before scheduling. Background workers still update + // queue gauges while finishing, but tests may shut the global S3FileCachePool down at the + // same time. Re-reading the singleton from the worker tail would race with shutdown. 
+ const auto running_limit = static_cast(S3FileCachePool::get().getMaxThreads()); + updateBgDownloadStatusMetrics(bg_downloading_count.load(std::memory_order_relaxed), running_limit); LOG_DEBUG( log, "downloading count {} => s3_key {} start", @@ -1429,9 +1433,12 @@ void FileCache::bgDownload(const String & s3_key, FileSegmentPtr & file_seg) { FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::file_cache_bg_download_schedule_fail); S3FileCachePool::get().scheduleOrThrowOnError( - [this, s3_key = s3_key, file_seg = file_seg, limiter = std::move(write_limiter), enqueue_time]() mutable { - bgDownloadExecutor(s3_key, file_seg, limiter, enqueue_time); - }); + [this, + s3_key = s3_key, + file_seg = file_seg, + limiter = std::move(write_limiter), + enqueue_time, + running_limit]() mutable { bgDownloadExecutor(s3_key, file_seg, limiter, enqueue_time, running_limit); }); } catch (...) { @@ -1439,14 +1446,14 @@ void FileCache::bgDownload(const String & s3_key, FileSegmentPtr & file_seg) GET_METRIC(tiflash_storage_remote_cache, type_dtfile_download_failed).Increment(); bg_download_fail_count.fetch_add(1, std::memory_order_relaxed); cleanupFailedDownload(s3_key, file_seg); - finishBgDownload(s3_key); + finishBgDownload(s3_key, running_limit); } } -void FileCache::finishBgDownload(const String & s3_key) +void FileCache::finishBgDownload(const String & s3_key, Int64 running_limit) { const auto count_after_finish = bg_downloading_count.fetch_sub(1, std::memory_order_relaxed) - 1; - updateBgDownloadStatusMetrics(count_after_finish); + updateBgDownloadStatusMetrics(count_after_finish, running_limit); LOG_DEBUG(log, "downloading count {} => s3_key {} finished", count_after_finish, s3_key); } diff --git a/dbms/src/Storages/S3/FileCache.h b/dbms/src/Storages/S3/FileCache.h index f813af2d9d5..e64ba8f1ff9 100644 --- a/dbms/src/Storages/S3/FileCache.h +++ b/dbms/src/Storages/S3/FileCache.h @@ -370,8 +370,9 @@ class FileCache const String & s3_key, FileSegmentPtr & file_seg, const WriteLimiterPtr & 
write_limiter, - std::chrono::steady_clock::time_point enqueue_time); - void finishBgDownload(const String & s3_key); + std::chrono::steady_clock::time_point enqueue_time, + Int64 running_limit); + void finishBgDownload(const String & s3_key, Int64 running_limit); void cleanupFailedDownload(const String & s3_key, FileSegmentPtr & file_seg); void downloadImpl(const String & s3_key, FileSegmentPtr & file_seg, const WriteLimiterPtr & write_limiter);