diff --git a/olive/evaluator/lmeval_ort.py b/olive/evaluator/lmeval_ort.py
index 50d1f1289..c4a158533 100644
--- a/olive/evaluator/lmeval_ort.py
+++ b/olive/evaluator/lmeval_ort.py
@@ -551,27 +551,25 @@ def model_call(self, input_ids: torch.Tensor, cont_len: int = 0) -> torch.Tensor
         self.params.set_search_options(batch_size=batch_size)
         generator = og.Generator(self.model, self.params)
 
-        if self._returns_full_logits:
-            generator.append_tokens(input_ids.tolist())
-            return torch.from_numpy(generator.get_output("logits")).to(self.device)
-
-        # Model only returns logits for the last appended position.
         if batch_size > 1 and cont_len > 1:
             raise ValueError(
-                "batch_size > 1 is not supported when the model returns single-position logits"
+                "batch_size > 1 is not supported when using incremental get_logits() retrieval"
                 " and continuation length > 1. Right-padding misaligns continuation positions across"
                 " batch elements. Use batch_size=1 instead."
             )
 
-        # Bulk-append context tokens, then step through the last cont_len tokens
-        # one at a time to collect only the logits we actually need.
+        # Use incremental token appending with get_logits() to avoid copying
+        # the full logits tensor from GPU to CPU. get_output("logits") copies
+        # seq_len * vocab_size * 2 bytes (e.g. 472MB for 900 tokens with
+        # 262K vocab), while get_logits() copies only vocab_size * 4 bytes
+        # (~1MB) per position.
         n_logits = max(cont_len, 1)
         prefix_len = seq_len - n_logits
         generator.append_tokens(input_ids[:, : prefix_len + 1].tolist())
-        all_logits = [torch.from_numpy(generator.get_output("logits")).to(self.device)]
+        all_logits = [torch.from_numpy(generator.get_logits()).to(self.device)]
         for i in range(prefix_len + 1, seq_len):
             generator.append_tokens(input_ids[:, i : i + 1].tolist())
-            all_logits.append(torch.from_numpy(generator.get_output("logits")).to(self.device))
+            all_logits.append(torch.from_numpy(generator.get_logits()).to(self.device))
 
         # No need to pad to [batch, seq_len, vocab]. The slicing in _loglikelihood_tokens computes
         # ctx_len = inplen + (logits.shape[0] - padding_len_inp), which adjusts for the shorter
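
For context, here is a minimal standalone sketch of the pattern this diff adopts. It assumes an onnxruntime-genai build recent enough to expose `Generator.get_logits()` (the diff itself relies on it); the model path and token ids are placeholders, and the device transfer from the real code is omitted for brevity.

```python
# Sketch of the incremental get_logits() pattern from the diff above.
# Assumes onnxruntime-genai with Generator.get_logits(); the model path
# and token ids are placeholders, not values from the PR.
import onnxruntime_genai as og
import torch

model = og.Model("path/to/exported/model")  # placeholder path
params = og.GeneratorParams(model)
params.set_search_options(batch_size=1)
generator = og.Generator(model, params)

input_ids = torch.tensor([[2, 318, 4842, 1110, 287, 262]])  # [1, seq_len]
seq_len = input_ids.shape[1]
cont_len = 2  # number of trailing positions whose logits we need

n_logits = max(cont_len, 1)
prefix_len = seq_len - n_logits

# Bulk-append the prefix plus the first scored token, then read back only
# that position's logits (~vocab_size * 4 bytes) instead of the full
# [batch, seq_len, vocab] tensor that get_output("logits") would copy.
generator.append_tokens(input_ids[:, : prefix_len + 1].tolist())
all_logits = [torch.from_numpy(generator.get_logits())]

# Step through the remaining continuation tokens one position at a time;
# each append advances the KV cache by one token.
for i in range(prefix_len + 1, seq_len):
    generator.append_tokens(input_ids[:, i : i + 1].tolist())
    all_logits.append(torch.from_numpy(generator.get_logits()))

# all_logits holds one logits array per scored position; downstream code
# (here, _loglikelihood_tokens) stacks or concatenates them as needed.
```

The savings quoted in the new comment check out: at fp16, 900 positions × 262,144 vocab × 2 bytes ≈ 472 MB for the full logits tensor, versus 262,144 × 4 bytes ≈ 1 MB per `get_logits()` call, so the per-position copies stay cheap even when only a handful of continuation tokens need scoring.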