@@ -514,19 +514,19 @@ def process_requests(self, stop_event: threading.Event):
514514 storage_latency += read_lat
515515 request .context_tokens = remaining_tokens
516516
517- # 2. For multi-turn conversations, access cache from previous turn.
518- if self .conversation_manager and request .turn_number > 1 :
519- prev_turn_key = f"{ request .conversation_id } _turn_{ request .turn_number - 1 } "
520- location , read_latency = self .cache .access_cache (prev_turn_key , InferencePhase .DECODE , 'multi_turn' )
521- if location is not None :
522- storage_latency += read_latency
523- with self .results_lock : self .results ['multi_turn_cache_hits' ] += 1
524- else :
525- with self .results_lock : self .results ['multi_turn_cache_misses' ] += 1
526-
527- # 3. Perform the main PREFILL operation (a cache WRITE).
528517 # Skip if decode_only mode (disaggregated decode node)
529518 if not self .decode_only :
519+ # 2. For multi-turn conversations, access cache from previous turn.
520+ if self .conversation_manager and request .turn_number > 1 :
521+ prev_turn_key = f"{ request .conversation_id } _turn_{ request .turn_number - 1 } "
522+ location , read_latency = self .cache .access_cache (prev_turn_key , InferencePhase .DECODE , 'multi_turn' )
523+ if location is not None :
524+ storage_latency += read_latency
525+ with self .results_lock : self .results ['multi_turn_cache_hits' ] += 1
526+ else :
527+ with self .results_lock : self .results ['multi_turn_cache_misses' ] += 1
528+
529+ # 3. Perform the main PREFILL operation (a cache WRITE).
530530 if request .phase == InferencePhase .PREFILL or request .phase == InferencePhase .PREFILL_DECODE :
531531 success , location , write_latency = self .cache .allocate_cache (
532532 request .cache_key , request .context_tokens , InferencePhase .PREFILL
0 commit comments