eval-protocol · dphuang2 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025
diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py
@@ -15,25 +15,7 @@ class RemoteRolloutProcessor(RolloutProcessor):
     """
     Rollout processor that triggers a remote HTTP server to perform the rollout.
 
-    Expected remote API:
-    - POST {remote_base_url}/init
-      Body: {
-        "rollout_id": str,
-        "model": str,
-        "messages": list[dict],
-        "tools": list[dict] | null,
-        "metadata": {
-          "invocation_id": str,
-          "experiment_id": str,
-          "rollout_id": str,
-          "run_id": str | null,
-          "row_id": str | null
-        },
-      }
-      Returns: {"ok": true}
-
-    - GET {remote_base_url}/status?rollout_id=...
-      Returns: {"terminated": bool, "info": {...}?}
+    See https://evalprotocol.io/tutorial/remote-rollout-processor for the expected remote API.
     """
 
     def __init__(
@@ -126,7 +108,6 @@ async def _process_row(row: EvaluationRow) -> EvaluationRow:
                 raise ValueError("Rollout ID is required in RemoteRolloutProcessor")
 
             init_payload: InitRequest = InitRequest(
-                rollout_id=row.execution_metadata.rollout_id,
                 model=model,
                 messages=clean_messages,
                 tools=row.tools,

diff --git a/eval_protocol/types/remote_rollout_processor.py b/eval_protocol/types/remote_rollout_processor.py
@@ -20,7 +20,6 @@ class RolloutMetadata(BaseModel):
 class InitRequest(BaseModel):
     """Request model for POST /init endpoint."""
 
-    rollout_id: str
     model: str
     messages: List[Message] = Field(min_length=1)
     tools: Optional[List[Dict[str, Any]]] = None

diff --git a/tests/chinook/langgraph/test_langgraph_chinook_tools.py b/tests/chinook/langgraph/test_langgraph_chinook_tools.py
@@ -19,6 +19,7 @@ def build_graph_kwargs(cp: CompletionParams) -> Dict[str, Any]:
 
 
 @pytest.mark.asyncio
+@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Only run this test locally since its not stable")
 @pytest.mark.skipif(os.getenv("FIREWORKS_API_KEY") in (None, ""), reason="FIREWORKS_API_KEY not set")
 @evaluation_test(
     input_messages=[[[Message(role="user", content="Use tools to count total tracks in the database.")]]],

diff --git a/tests/remote_server/remote_server.py b/tests/remote_server/remote_server.py
@@ -23,7 +23,7 @@
 @app.post("/init")
 def init(req: InitRequest):
     # Persist state
-    _STATE[req.rollout_id] = {"terminated": False}
+    _STATE[req.metadata.rollout_id] = {"terminated": False}
 
     # Kick off worker thread that does a single-turn chat via Langfuse OpenAI integration
     def _worker():
@@ -43,10 +43,10 @@ def _worker():
 
         except Exception as e:
             # Best-effort; mark as done even on error to unblock polling
-            print(f"❌ Error in rollout {req.rollout_id}: {e}")
+            print(f"❌ Error in rollout {req.metadata.rollout_id}: {e}")
             pass
         finally:
-            _STATE[req.rollout_id]["terminated"] = True
+            _STATE[req.metadata.rollout_id]["terminated"] = True
 
     t = threading.Thread(target=_worker, daemon=True)
     t.start()

diff --git a/tests/remote_server/typescript-server/server.ts b/tests/remote_server/typescript-server/server.ts
@@ -46,7 +46,8 @@ app.post("/init", async (req: Request, res: Response) => {
   try {
     // Validate request body
     const validatedData = initRequestSchema.parse(req.body);
-    const { rollout_id, model } = validatedData;
+    const { model, metadata } = validatedData;
+    const rollout_id = metadata.rollout_id;
 
     console.log(`Initializing rollout ${rollout_id} with model ${model}`);
 
@@ -137,11 +138,12 @@ app.get("/status", (req: Request, res: Response) => {
 async function simulateRolloutExecution(
   initRequest: InitRequest
 ): Promise<void> {
-  const rolloutState = rolloutStates.get(initRequest.rollout_id);
+  const rollout_id = initRequest.metadata.rollout_id;
+  const rolloutState = rolloutStates.get(rollout_id);
   if (!rolloutState) return;
 
   try {
-    console.log(`Starting rollout execution for ${initRequest.rollout_id}`);
+    console.log(`Starting rollout execution for ${rollout_id}`);
 
     const openai = new OpenAI({
       apiKey: process.env["OPENAI_API_KEY"],
@@ -160,12 +162,9 @@ async function simulateRolloutExecution(
     rolloutState.ended_at = new Date().toISOString();
     rolloutState.completed_turns = 1;
 
-    console.log(`Rollout ${initRequest.rollout_id} completed successfully`);
+    console.log(`Rollout ${rollout_id} completed successfully`);
   } catch (error) {
-    console.error(
-      `Error in rollout execution for ${initRequest.rollout_id}:`,
-      error
-    );
+    console.error(`Error in rollout execution for ${rollout_id}:`, error);
 
     rolloutState.status = "failed";
     rolloutState.ended_at = new Date().toISOString();

diff --git a/typescript/index.ts b/typescript/index.ts
@@ -40,7 +40,6 @@ const metadataSchema = z
   .loose();
 
 export const initRequestSchema = z.object({
-  rollout_id: z.string(),
   model: z.string(),
   messages: z.array(messageSchema).min(1),
   tools: z.array(toolSchema).optional().nullable(),