
Commit d7d52e0

Merge pull request #284 from LouisDDN/ld/skip-multiturn-decode-only
kvcache: skip multi-turn cache reads in decode-only mode
2 parents: 8323ed0 + f45de66

1 file changed: 11 additions & 11 deletions

kv_cache_benchmark/kv_cache/benchmark.py
@@ -514,19 +514,19 @@ def process_requests(self, stop_event: threading.Event):
                 storage_latency += read_lat
                 request.context_tokens = remaining_tokens
 
-            # 2. For multi-turn conversations, access cache from previous turn.
-            if self.conversation_manager and request.turn_number > 1:
-                prev_turn_key = f"{request.conversation_id}_turn_{request.turn_number - 1}"
-                location, read_latency = self.cache.access_cache(prev_turn_key, InferencePhase.DECODE, 'multi_turn')
-                if location is not None:
-                    storage_latency += read_latency
-                    with self.results_lock: self.results['multi_turn_cache_hits'] += 1
-                else:
-                    with self.results_lock: self.results['multi_turn_cache_misses'] += 1
-
-            # 3. Perform the main PREFILL operation (a cache WRITE).
             # Skip if decode_only mode (disaggregated decode node)
             if not self.decode_only:
+                # 2. For multi-turn conversations, access cache from previous turn.
+                if self.conversation_manager and request.turn_number > 1:
+                    prev_turn_key = f"{request.conversation_id}_turn_{request.turn_number - 1}"
+                    location, read_latency = self.cache.access_cache(prev_turn_key, InferencePhase.DECODE, 'multi_turn')
+                    if location is not None:
+                        storage_latency += read_latency
+                        with self.results_lock: self.results['multi_turn_cache_hits'] += 1
+                    else:
+                        with self.results_lock: self.results['multi_turn_cache_misses'] += 1
+
+                # 3. Perform the main PREFILL operation (a cache WRITE).
                 if request.phase == InferencePhase.PREFILL or request.phase == InferencePhase.PREFILL_DECODE:
                     success, location, write_latency = self.cache.allocate_cache(
                         request.cache_key, request.context_tokens, InferencePhase.PREFILL

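In effect, the merge moves the previous-turn cache read (step 2) under the same `if not self.decode_only:` guard that already covered the prefill write (step 3), so a disaggregated decode-only node no longer probes the multi-turn cache and no longer inflates the hit/miss counters. Below is a minimal, self-contained sketch of the resulting control flow; the names (Worker, Request, process) and the dict-backed cache are illustrative stand-ins, not the benchmark's actual classes:

from dataclasses import dataclass


@dataclass
class Request:
    conversation_id: str
    turn_number: int
    is_prefill: bool


class Worker:
    """Toy stand-in for the benchmark worker; decode_only mirrors the flag in the diff."""

    def __init__(self, decode_only: bool):
        self.decode_only = decode_only
        self.cache = {}  # cache key -> context token count
        self.results = {"multi_turn_cache_hits": 0, "multi_turn_cache_misses": 0}

    def process(self, request: Request, context_tokens: int) -> None:
        # After this change, BOTH steps sit under the decode_only guard.
        if not self.decode_only:
            # 2. Multi-turn: read the previous turn's cache entry.
            if request.turn_number > 1:
                prev_key = f"{request.conversation_id}_turn_{request.turn_number - 1}"
                if prev_key in self.cache:
                    self.results["multi_turn_cache_hits"] += 1
                else:
                    self.results["multi_turn_cache_misses"] += 1
            # 3. Prefill: write this turn's cache entry.
            if request.is_prefill:
                key = f"{request.conversation_id}_turn_{request.turn_number}"
                self.cache[key] = context_tokens


decode_node = Worker(decode_only=True)
decode_node.process(Request("conv0", turn_number=2, is_prefill=True), context_tokens=512)
# A decode-only node now skips the read entirely: no miss is recorded.
assert decode_node.results["multi_turn_cache_misses"] == 0

Before this change, the turn-2 request above would have recorded a spurious multi_turn_cache_miss on the decode node, since the read ran unconditionally while only the write was guarded.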