From 2bd1d2cf43b105ff9bfb5301805a19c439765e1d Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Fri, 26 Sep 2025 13:42:29 -0700
Subject: [PATCH 1/3] Update RemoteRolloutProcessor docstring to reference
 online tutorial for API usage

---
 .../pytest/remote_rollout_processor.py        | 20 +------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py
index 2359b6e1..8e345056 100644
--- a/eval_protocol/pytest/remote_rollout_processor.py
+++ b/eval_protocol/pytest/remote_rollout_processor.py
@@ -15,25 +15,7 @@ class RemoteRolloutProcessor(RolloutProcessor):
     """
     Rollout processor that triggers a remote HTTP server to perform the rollout.
 
-    Expected remote API:
-    - POST {remote_base_url}/init
-      Body: {
-        "rollout_id": str,
-        "model": str,
-        "messages": list[dict],
-        "tools": list[dict] | null,
-        "metadata": {
-          "invocation_id": str,
-          "experiment_id": str,
-          "rollout_id": str,
-          "run_id": str | null,
-          "row_id": str | null
-        },
-      }
-      Returns: {"ok": true}
-
-    - GET {remote_base_url}/status?rollout_id=...
-      Returns: {"terminated": bool, "info": {...}?}
+    See https://evalprotocol.io/tutorial/remote-rollout-processor for documentation.
     """
 
     def __init__(

From a706dd65248b235227ccb9b48dfc1bd872a362c0 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Fri, 26 Sep 2025 13:46:09 -0700
Subject: [PATCH 2/3] remove top-level rollout_id from InitRequest

---
 eval_protocol/pytest/remote_rollout_processor.py |  1 -
 eval_protocol/types/remote_rollout_processor.py  |  1 -
 tests/remote_server/remote_server.py             |  6 +++---
 tests/remote_server/typescript-server/server.ts  | 15 +++++++--------
 typescript/index.ts                              |  1 -
 5 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py
index 8e345056..13ca5422 100644
--- a/eval_protocol/pytest/remote_rollout_processor.py
+++ b/eval_protocol/pytest/remote_rollout_processor.py
@@ -108,7 +108,6 @@ async def _process_row(row: EvaluationRow) -> EvaluationRow:
                 raise ValueError("Rollout ID is required in RemoteRolloutProcessor")
 
             init_payload: InitRequest = InitRequest(
-                rollout_id=row.execution_metadata.rollout_id,
                 model=model,
                 messages=clean_messages,
                 tools=row.tools,
diff --git a/eval_protocol/types/remote_rollout_processor.py b/eval_protocol/types/remote_rollout_processor.py
index 21d93ceb..89967729 100644
--- a/eval_protocol/types/remote_rollout_processor.py
+++ b/eval_protocol/types/remote_rollout_processor.py
@@ -20,7 +20,6 @@ class RolloutMetadata(BaseModel):
 class InitRequest(BaseModel):
     """Request model for POST /init endpoint."""
 
-    rollout_id: str
     model: str
     messages: List[Message] = Field(min_length=1)
     tools: Optional[List[Dict[str, Any]]] = None
diff --git a/tests/remote_server/remote_server.py b/tests/remote_server/remote_server.py
index 575ecd0e..ea831f51 100644
--- a/tests/remote_server/remote_server.py
+++ b/tests/remote_server/remote_server.py
@@ -23,7 +23,7 @@
 @app.post("/init")
 def init(req: InitRequest):
     # Persist state
-    _STATE[req.rollout_id] = {"terminated": False}
+    _STATE[req.metadata.rollout_id] = {"terminated": False}
 
     # Kick off worker thread that does a single-turn chat via Langfuse OpenAI integration
     def _worker():
@@ -43,10 +43,10 @@ def _worker():
 
         except Exception as e:
             # Best-effort; mark as done even on error to unblock polling
-            print(f"❌ Error in rollout {req.rollout_id}: {e}")
+            print(f"❌ Error in rollout {req.metadata.rollout_id}: {e}")
             pass
         finally:
-            _STATE[req.rollout_id]["terminated"] = True
+            _STATE[req.metadata.rollout_id]["terminated"] = True
 
     t = threading.Thread(target=_worker, daemon=True)
     t.start()
diff --git a/tests/remote_server/typescript-server/server.ts b/tests/remote_server/typescript-server/server.ts
index 0551e639..95362cd0 100644
--- a/tests/remote_server/typescript-server/server.ts
+++ b/tests/remote_server/typescript-server/server.ts
@@ -46,7 +46,8 @@ app.post("/init", async (req: Request, res: Response) => {
   try {
     // Validate request body
     const validatedData = initRequestSchema.parse(req.body);
-    const { rollout_id, model } = validatedData;
+    const { model, metadata } = validatedData;
+    const rollout_id = metadata.rollout_id;
 
     console.log(`Initializing rollout ${rollout_id} with model ${model}`);
 
@@ -137,11 +138,12 @@ app.get("/status", (req: Request, res: Response) => {
 async function simulateRolloutExecution(
   initRequest: InitRequest
 ): Promise<void> {
-  const rolloutState = rolloutStates.get(initRequest.rollout_id);
+  const rollout_id = initRequest.metadata.rollout_id;
+  const rolloutState = rolloutStates.get(rollout_id);
   if (!rolloutState) return;
 
   try {
-    console.log(`Starting rollout execution for ${initRequest.rollout_id}`);
+    console.log(`Starting rollout execution for ${rollout_id}`);
 
     const openai = new OpenAI({
       apiKey: process.env["OPENAI_API_KEY"],
@@ -160,12 +162,9 @@ async function simulateRolloutExecution(
     rolloutState.ended_at = new Date().toISOString();
     rolloutState.completed_turns = 1;
 
-    console.log(`Rollout ${initRequest.rollout_id} completed successfully`);
+    console.log(`Rollout ${rollout_id} completed successfully`);
   } catch (error) {
-    console.error(
-      `Error in rollout execution for ${initRequest.rollout_id}:`,
-      error
-    );
+    console.error(`Error in rollout execution for ${rollout_id}:`, error);
 
     rolloutState.status = "failed";
     rolloutState.ended_at = new Date().toISOString();
diff --git a/typescript/index.ts b/typescript/index.ts
index 26a42553..59f7a8af 100644
--- a/typescript/index.ts
+++ b/typescript/index.ts
@@ -40,7 +40,6 @@ const metadataSchema = z
   .loose();
 
 export const initRequestSchema = z.object({
-  rollout_id: z.string(),
   model: z.string(),
   messages: z.array(messageSchema).min(1),
   tools: z.array(toolSchema).optional().nullable(),

From 5f7ac8710243fb920a27c07d87a27951274af16c Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Fri, 26 Sep 2025 13:55:44 -0700
Subject: [PATCH 3/3] Skip unstable LangGraph Chinook tools test in CI

---
 tests/chinook/langgraph/test_langgraph_chinook_tools.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/chinook/langgraph/test_langgraph_chinook_tools.py b/tests/chinook/langgraph/test_langgraph_chinook_tools.py
index e9afabf1..27d4153e 100644
--- a/tests/chinook/langgraph/test_langgraph_chinook_tools.py
+++ b/tests/chinook/langgraph/test_langgraph_chinook_tools.py
@@ -19,6 +19,7 @@ def build_graph_kwargs(cp: CompletionParams) -> Dict[str, Any]:
 
 
 @pytest.mark.asyncio
+@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Only run this test locally since it's not stable")
 @pytest.mark.skipif(os.getenv("FIREWORKS_API_KEY") in (None, ""), reason="FIREWORKS_API_KEY not set")
 @evaluation_test(
     input_messages=[[[Message(role="user", content="Use tools to count total tracks in the database.")]]],