Skip to content

Commit a1a973e

Browse files
author
Shrey Modi
committed
finalll
1 parent 70f3d0e commit a1a973e

File tree

3 files changed

+51
-53
lines changed

3 files changed

+51
-53
lines changed

eval_protocol/mcp/execution/vllm_policy.py

Lines changed: 39 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,20 @@
11
"""
2-
VLLMPolicy - Policy for TRL's VLLMClient
2+
VLLMPolicy - Policy for TRL's VLLMClient or colocated vLLM LLM.
33
4-
Simple policy that calls TRL's vllm_client directly instead of going through LiteLLM.
5-
Works with `trl vllm-serve` endpoints.
4+
Thin adapter that turns Eval Protocol-style message lists into a single prompt,
5+
then calls either:
6+
7+
- TRL's VLLMClient (server mode), or
8+
- a colocated vLLM LLM instance (SamplingParams mode).
69
"""
710

11+
import logging
812
from typing import Any, Dict, List, Optional
913

1014

15+
logger = logging.getLogger(__name__)
16+
17+
1118
class VLLMPolicy:
1219
"""
1320
Policy that uses TRL's VLLMClient for generation.
@@ -52,7 +59,7 @@ async def _make_llm_call(
5259
tools: Optional[List] = None,
5360
) -> Dict[str, Any]:
5461
"""
55-
Make LLM call using TRL's VLLMClient.
62+
Make LLM call using TRL's VLLMClient or a colocated vLLM LLM.
5663
5764
Args:
5865
messages: List of message dicts with 'role' and 'content'
@@ -70,29 +77,29 @@ async def _make_llm_call(
7077
add_generation_prompt=True,
7178
tokenize=False,
7279
)
73-
print("\n[VLLMPolicy] ===== CHAT TEMPLATE APPLIED =====", flush=True)
74-
print(f"[VLLMPolicy] Input messages ({len(messages)} messages):", flush=True)
75-
for i, msg in enumerate(messages):
76-
content_preview = str(msg.get("content", ""))[:100]
77-
print(f" [{i}] {msg.get('role', '?')}: {content_preview}...", flush=True)
78-
print(f"[VLLMPolicy] Formatted prompt (length={len(prompt_text)}):", flush=True)
79-
print("[VLLMPolicy] Prompt preview (last 500 chars):", flush=True)
80-
print(f"{prompt_text[-500:]}", flush=True)
81-
print("[VLLMPolicy] ===================================", flush=True)
80+
logger.debug(
81+
"[VLLMPolicy] Chat template applied for %d messages (prompt length=%d)",
82+
len(messages),
83+
len(prompt_text),
84+
)
8285
except Exception as e:
83-
print(f"[VLLMPolicy] Warning: Failed to apply chat template: {e}", flush=True)
84-
# Fallback: simple concatenation
85-
prompt_text = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
86+
logger.warning(
87+
"[VLLMPolicy] Failed to apply chat template: %s",
88+
e,
89+
exc_info=True,
90+
)
91+
# Fallback: simple concatenation (defensive .get access)
92+
prompt_text = "\n".join(f"{m.get('role', '?')}: {m.get('content', '')}" for m in messages)
8693
else:
8794
# No tokenizer: simple concatenation
88-
prompt_text = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
95+
prompt_text = "\n".join(f"{m.get('role', '?')}: {m.get('content', '')}" for m in messages)
8996

9097
# Check if vllm_client is VLLMClient (server mode) or LLM (colocate mode)
9198
is_llm_object = hasattr(self.vllm_client, "llm_engine") # LLM has llm_engine
9299

93100
if is_llm_object:
94101
# Colocate mode: use SamplingParams
95-
print("[VLLMPolicy] Using vLLM LLM (colocate mode) with SamplingParams", flush=True)
102+
logger.debug("[VLLMPolicy] Using vLLM LLM (colocate mode) with SamplingParams")
96103
from vllm import SamplingParams
97104

98105
sampling_params = SamplingParams(
@@ -103,7 +110,7 @@ async def _make_llm_call(
103110
n=1,
104111
)
105112

106-
print("[VLLMPolicy] Calling LLM.generate()...", flush=True)
113+
logger.debug("[VLLMPolicy] Calling LLM.generate()")
107114
outputs = self.vllm_client.generate([prompt_text], sampling_params=sampling_params, use_tqdm=False)
108115

109116
# Extract from vLLM output format
@@ -116,7 +123,7 @@ async def _make_llm_call(
116123
}
117124
else:
118125
# Server mode: use VLLMClient with kwargs
119-
print("[VLLMPolicy] Using VLLMClient (server mode)", flush=True)
126+
logger.debug("[VLLMPolicy] Using VLLMClient (server mode)")
120127
vllm_params = {
121128
"temperature": self.temperature,
122129
"max_tokens": self.max_tokens,
@@ -126,7 +133,7 @@ async def _make_llm_call(
126133
}
127134
vllm_params.update(self.kwargs)
128135

129-
print("[VLLMPolicy] Calling vllm_client.generate()...", flush=True)
136+
logger.debug("[VLLMPolicy] Calling vllm_client.generate()")
130137
response = self.vllm_client.generate(
131138
prompts=[prompt_text],
132139
**vllm_params,
@@ -140,16 +147,18 @@ async def _make_llm_call(
140147
if self.tokenizer is not None:
141148
try:
142149
completion_text = self.tokenizer.decode(completion_ids, skip_special_tokens=True)
143-
print("\n[VLLMPolicy] ===== GENERATION RESULT =====", flush=True)
144-
print(f"[VLLMPolicy] Prompt tokens: {len(prompt_ids)}", flush=True)
145-
print(f"[VLLMPolicy] Completion tokens: {len(completion_ids)}", flush=True)
146-
print(f"[VLLMPolicy] FULL decoded completion ({len(completion_text)} chars):", flush=True)
147-
print("───────────────────────────────────────", flush=True)
148-
print(f"{completion_text}", flush=True)
149-
print("───────────────────────────────────────", flush=True)
150-
print("[VLLMPolicy] ==============================", flush=True)
150+
logger.debug(
151+
"[VLLMPolicy] Generation result: prompt_tokens=%d, completion_tokens=%d, completion_chars=%d",
152+
len(prompt_ids),
153+
len(completion_ids),
154+
len(completion_text),
155+
)
151156
except Exception as e:
152-
print(f"[VLLMPolicy] Warning: Failed to decode completion: {e}", flush=True)
157+
logger.warning(
158+
"[VLLMPolicy] Failed to decode completion: %s",
159+
e,
160+
exc_info=True,
161+
)
153162
completion_text = f"<decoded_error:{len(completion_ids)}_tokens>"
154163
else:
155164
# Fallback: just indicate number of tokens

eval_protocol/pytest/integrations/openenv_trl_vllm.py

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -134,34 +134,18 @@ def rollout_func(prompts: List[str], trainer) -> Dict[str, List]:
134134
# 1) Build evaluation rows with rollout_id for logging
135135
import uuid
136136

137-
# Generate unique IDs for this batch
138-
def _gen_id():
139-
import random
140-
141-
words = [
142-
"quick",
143-
"lazy",
144-
"happy",
145-
"bright",
146-
"calm",
147-
"bold",
148-
"wise",
149-
"kind",
150-
]
151-
return f"{random.choice(words)}-{random.choice(words)}-{random.randint(10, 99)}"
152-
153137
evaluation_rows: List[EvaluationRow] = []
154138
for prompt_idx, prompt in enumerate(prompts):
155139
# One evaluation row per incoming prompt. GRPOTrainer will handle
156140
# grouping by `num_generations` at the trainer level; the custom
157141
# rollout_func must return one set of tokens per prompt.
158142
rollout_id = f"openenv_vllm_{uuid.uuid4().hex[:12]}"
159-
row_id = _gen_id()
160143

161144
row = EvaluationRow(
162145
messages=[Message(role="user", content=prompt)],
163146
input_metadata=InputMetadata(
164-
row_id=row_id, # Required for ep logs UI!
147+
# Let Eval Protocol generate a stable row_id from content.
148+
row_id=None,
165149
completion_params={},
166150
),
167151
)

eval_protocol/pytest/openenv_rollout_processor.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import asyncio
1616
import logging
1717
import time
18+
from itertools import count
1819
from typing import List, Any, Dict, Callable, Generic, TypeVar, Optional, Type
1920
import json
2021

@@ -142,7 +143,9 @@ def __init__(
142143
self._viewport_height = viewport_height
143144
self._timeout_ms = timeout_ms
144145
self._num_generations = max(1, int(num_generations)) if num_generations else 1
145-
self._env_create_idx: int = 0
146+
# Counter used for task rotation when creating environments. Uses
147+
# itertools.count to avoid race conditions across concurrent rollouts.
148+
self._env_create_counter = count()
146149

147150
if self._tasks and not self._task_var:
148151
raise ValueError("task_var must be provided when tasks are configured.")
@@ -411,7 +414,9 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
411414
)
412415
row.execution_metadata.duration_seconds = time.perf_counter() - start_time
413416

414-
# Store rewards for TRL reward functions
417+
# Store per-step rewards in a sentinel system message so
418+
# evaluation tests and downstream integrations can reconstruct
419+
# episode rewards.
415420
sentinel = "__ep_step_rewards__:" + json.dumps(step_rewards)
416421
messages.append(Message(role="system", content=sentinel))
417422

@@ -469,7 +474,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
469474
env.close()
470475
logger.debug("[OpenEnvRolloutProcessor] Environment closed successfully")
471476
except Exception as close_err:
472-
print(f"[OpenEnvRolloutProcessor] Warning: Error closing environment: {close_err}", flush=True)
473477
logger.warning(
474478
"[OpenEnvRolloutProcessor] Error closing environment: %s",
475479
close_err,
@@ -534,8 +538,9 @@ def _generic_factory():
534538
# Select task for this env instance (if provided), grouped by num_generations
535539
selected_task: Optional[str] = None
536540
if self._tasks:
537-
idx = self._env_create_idx
538-
self._env_create_idx = idx + 1
541+
# Use a monotonic counter so concurrent environment creation
542+
# does not reuse the same index across rollouts.
543+
idx = next(self._env_create_counter)
539544
group = idx // max(1, self._num_generations)
540545
selected_task = self._tasks[group % len(self._tasks)]
541546
if not self._task_var:

0 commit comments

Comments (0)