Skip to content

Commit 76ff175

Browse files
DongheJinyq33victor
authored and committed
feat: reduce KV cache allocation when enable_schedule_overlap is disabled
1 parent fe16411 commit 76ff175

File tree

3 files changed, with 19 additions and 10 deletions.

xllm/core/scheduler/chunked_prefill_scheduler.cpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ void ChunkedPrefillScheduler::handle_running_queue_requests(
5353
bool& budget_exhausted,
5454
bool& blocks_exhausted) {
5555
while (!running_queue->empty() &&
56-
remaining_token_budget > options_.num_speculative_tokens() * 2 &&
56+
remaining_token_budget > min_speculative_tokens_required_ &&
5757
latency_budget > estimate_latency && remaining_seq_budget > 0) {
5858
std::shared_ptr<Request> request(running_queue->top());
5959
// TODO: check if request is timeout
@@ -96,7 +96,7 @@ void ChunkedPrefillScheduler::handle_running_queue_requests(
9696
size_t num_tokens_to_handle =
9797
sequence->is_prefill_stage()
9898
? std::min(assume_max_tokens, num_tokens - kv_cache_tokens_num)
99-
: 1 + options_.num_speculative_tokens() * 2;
99+
: 1 + min_speculative_tokens_required_;
100100

101101
if (allocated_seqs + 1 > remaining_seq_budget ||
102102
allocated_tokens + num_tokens_to_handle > remaining_token_budget) {
@@ -723,7 +723,7 @@ bool ChunkedPrefillScheduler::allocate_blocks_for(
723723
size_t token_budget,
724724
size_t* current_step_handle_tokens) {
725725
// token budget should be large enough for one speculative decoding step
726-
CHECK_GT(token_budget, options_.num_speculative_tokens() * 2);
726+
CHECK_GT(token_budget, min_speculative_tokens_required_);
727727

728728
allocate_shared_blocks_for(sequence);
729729

@@ -740,7 +740,7 @@ bool ChunkedPrefillScheduler::allocate_blocks_for(
740740
// if in decoding stage
741741
if (options_.num_speculative_tokens() > 0 && !sequence->is_prefill_stage() &&
742742
kv_cache_tokens_num > 0) {
743-
max_handle_num_tokens += options_.num_speculative_tokens() * 2;
743+
max_handle_num_tokens += min_speculative_tokens_required_;
744744
}
745745

746746
// make sure the sequence proceeds forward

xllm/core/scheduler/continuous_scheduler.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,13 @@ ContinuousScheduler::ContinuousScheduler(Engine* engine, const Options& options)
8787
instance_info_.name = options_.instance_name().value_or("");
8888
instance_info_.type = options_.instance_role().value().to_string();
8989
instance_info_.dp_size = options.dp_size();
90+
91+
if (options_.enable_schedule_overlap()) {
92+
min_speculative_tokens_required_ = options_.num_speculative_tokens() * 2;
93+
} else {
94+
min_speculative_tokens_required_ = options_.num_speculative_tokens();
95+
}
96+
9097
}
9198

9299
ContinuousScheduler::~ContinuousScheduler() { running_requests_.clear(); }
@@ -366,7 +373,7 @@ void ContinuousScheduler::handle_decode_requests(
366373
size_t& num_online_decode_preempt_offline_requests,
367374
std::unique_ptr<DecodePriorityQueue>& running_queue) {
368375
while (!running_queue->empty() &&
369-
remaining_token_budget > options_.num_speculative_tokens() * 2 &&
376+
remaining_token_budget > min_speculative_tokens_required_ &&
370377
latency_budget > estimate_latency && remaining_seq_budget > 0) {
371378
std::shared_ptr<Request> request = running_queue->top();
372379
// TODO: check if request is timeout
@@ -402,15 +409,15 @@ void ContinuousScheduler::handle_decode_requests(
402409
break;
403410
}
404411
}
405-
if (allocated_tokens + options_.num_speculative_tokens() * 2 >=
412+
if (allocated_tokens + min_speculative_tokens_required_ >=
406413
remaining_token_budget ||
407414
allocated_seqs >= remaining_seq_budget) {
408415
has_enough_budget = false;
409416
break;
410417
}
411418
// sequence token already appended
412419
size_t updated_num_tokens =
413-
sequence->num_tokens() + options_.num_speculative_tokens() * 2;
420+
sequence->num_tokens() + min_speculative_tokens_required_;
414421
// no blocks left
415422
if (!kv_cache_manager_->allocate(sequence.get(), updated_num_tokens)) {
416423
has_enough_blocks = false;
@@ -422,12 +429,12 @@ void ContinuousScheduler::handle_decode_requests(
422429
}
423430

424431
// update the allocated tokens for the sequence
425-
allocated_tokens += options_.num_speculative_tokens() * 2 + 1;
432+
allocated_tokens += min_speculative_tokens_required_ + 1;
426433
allocated_seqs += 1;
427434
allocated_estimate_latency += seq_estimate_latency;
428435
candidate_sequences.emplace_back(sequence.get());
429-
candidate_token_budgets.emplace_back(
430-
options_.num_speculative_tokens() * 2 + 1);
436+
candidate_token_budgets.emplace_back(min_speculative_tokens_required_ +
437+
1);
431438
}
432439
CHECK(allocated_tokens <= remaining_token_budget);
433440
CHECK(allocated_seqs <= remaining_seq_budget);

xllm/core/scheduler/continuous_scheduler.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,8 @@ class ContinuousScheduler : public Scheduler {
246246

247247
InstanceInfo instance_info_;
248248

249+
int32_t min_speculative_tokens_required_ = 0;
250+
249251
virtual void handle_prefill_requests(
250252
double& latency_budget,
251253
double& estimate_latency,

0 commit comments

Comments
 (0)