From 2bd1d2cf43b105ff9bfb5301805a19c439765e1d Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Fri, 26 Sep 2025 13:42:29 -0700 Subject: [PATCH 1/3] Update RemoteRolloutProcessor docstring to reference online tutorial for API usage --- .../pytest/remote_rollout_processor.py | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py index 2359b6e1..8e345056 100644 --- a/eval_protocol/pytest/remote_rollout_processor.py +++ b/eval_protocol/pytest/remote_rollout_processor.py @@ -15,25 +15,7 @@ class RemoteRolloutProcessor(RolloutProcessor): """ Rollout processor that triggers a remote HTTP server to perform the rollout. - Expected remote API: - - POST {remote_base_url}/init - Body: { - "rollout_id": str, - "model": str, - "messages": list[dict], - "tools": list[dict] | null, - "metadata": { - "invocation_id": str, - "experiment_id": str, - "rollout_id": str, - "run_id": str | null, - "row_id": str | null - }, - } - Returns: {"ok": true} - - - GET {remote_base_url}/status?rollout_id=... - Returns: {"terminated": bool, "info": {...}?} + See https://evalprotocol.io/tutorial/remote-rollout-processor for documentation. 
""" def __init__( From a706dd65248b235227ccb9b48dfc1bd872a362c0 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Fri, 26 Sep 2025 13:46:09 -0700 Subject: [PATCH 2/3] remove top-level rollout_id from InitRequest --- eval_protocol/pytest/remote_rollout_processor.py | 1 - eval_protocol/types/remote_rollout_processor.py | 1 - tests/remote_server/remote_server.py | 6 +++--- tests/remote_server/typescript-server/server.ts | 15 +++++++-------- typescript/index.ts | 1 - 5 files changed, 10 insertions(+), 14 deletions(-) diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py index 8e345056..13ca5422 100644 --- a/eval_protocol/pytest/remote_rollout_processor.py +++ b/eval_protocol/pytest/remote_rollout_processor.py @@ -108,7 +108,6 @@ async def _process_row(row: EvaluationRow) -> EvaluationRow: raise ValueError("Rollout ID is required in RemoteRolloutProcessor") init_payload: InitRequest = InitRequest( - rollout_id=row.execution_metadata.rollout_id, model=model, messages=clean_messages, tools=row.tools, diff --git a/eval_protocol/types/remote_rollout_processor.py b/eval_protocol/types/remote_rollout_processor.py index 21d93ceb..89967729 100644 --- a/eval_protocol/types/remote_rollout_processor.py +++ b/eval_protocol/types/remote_rollout_processor.py @@ -20,7 +20,6 @@ class RolloutMetadata(BaseModel): class InitRequest(BaseModel): """Request model for POST /init endpoint.""" - rollout_id: str model: str messages: List[Message] = Field(min_length=1) tools: Optional[List[Dict[str, Any]]] = None diff --git a/tests/remote_server/remote_server.py b/tests/remote_server/remote_server.py index 575ecd0e..ea831f51 100644 --- a/tests/remote_server/remote_server.py +++ b/tests/remote_server/remote_server.py @@ -23,7 +23,7 @@ @app.post("/init") def init(req: InitRequest): # Persist state - _STATE[req.rollout_id] = {"terminated": False} + _STATE[req.metadata.rollout_id] = {"terminated": False} # Kick off worker thread that does a 
single-turn chat via Langfuse OpenAI integration def _worker(): @@ -43,10 +43,10 @@ def _worker(): except Exception as e: # Best-effort; mark as done even on error to unblock polling - print(f"❌ Error in rollout {req.rollout_id}: {e}") + print(f"❌ Error in rollout {req.metadata.rollout_id}: {e}") pass finally: - _STATE[req.rollout_id]["terminated"] = True + _STATE[req.metadata.rollout_id]["terminated"] = True t = threading.Thread(target=_worker, daemon=True) t.start() diff --git a/tests/remote_server/typescript-server/server.ts b/tests/remote_server/typescript-server/server.ts index 0551e639..95362cd0 100644 --- a/tests/remote_server/typescript-server/server.ts +++ b/tests/remote_server/typescript-server/server.ts @@ -46,7 +46,8 @@ app.post("/init", async (req: Request, res: Response) => { try { // Validate request body const validatedData = initRequestSchema.parse(req.body); - const { rollout_id, model } = validatedData; + const { model, metadata } = validatedData; + const rollout_id = metadata.rollout_id; console.log(`Initializing rollout ${rollout_id} with model ${model}`); @@ -137,11 +138,12 @@ app.get("/status", (req: Request, res: Response) => { async function simulateRolloutExecution( initRequest: InitRequest ): Promise<void> { - const rolloutState = rolloutStates.get(initRequest.rollout_id); + const rollout_id = initRequest.metadata.rollout_id; + const rolloutState = rolloutStates.get(rollout_id); if (!rolloutState) return; try { - console.log(`Starting rollout execution for ${initRequest.rollout_id}`); + console.log(`Starting rollout execution for ${rollout_id}`); const openai = new OpenAI({ apiKey: process.env["OPENAI_API_KEY"], @@ -160,12 +162,9 @@ async function simulateRolloutExecution( rolloutState.ended_at = new Date().toISOString(); rolloutState.completed_turns = 1; - console.log(`Rollout ${initRequest.rollout_id} completed successfully`); + console.log(`Rollout ${rollout_id} completed successfully`); } catch (error) { - console.error( - `Error in rollout 
execution for ${initRequest.rollout_id}:`, - error - ); + console.error(`Error in rollout execution for ${rollout_id}:`, error); rolloutState.status = "failed"; rolloutState.ended_at = new Date().toISOString(); diff --git a/typescript/index.ts b/typescript/index.ts index 26a42553..59f7a8af 100644 --- a/typescript/index.ts +++ b/typescript/index.ts @@ -40,7 +40,6 @@ const metadataSchema = z .loose(); export const initRequestSchema = z.object({ - rollout_id: z.string(), model: z.string(), messages: z.array(messageSchema).min(1), tools: z.array(toolSchema).optional().nullable(), From 5f7ac8710243fb920a27c07d87a27951274af16c Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Fri, 26 Sep 2025 13:55:44 -0700 Subject: [PATCH 3/3] skip unstable test in CI --- tests/chinook/langgraph/test_langgraph_chinook_tools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/chinook/langgraph/test_langgraph_chinook_tools.py b/tests/chinook/langgraph/test_langgraph_chinook_tools.py index e9afabf1..27d4153e 100644 --- a/tests/chinook/langgraph/test_langgraph_chinook_tools.py +++ b/tests/chinook/langgraph/test_langgraph_chinook_tools.py @@ -19,6 +19,7 @@ def build_graph_kwargs(cp: CompletionParams) -> Dict[str, Any]: @pytest.mark.asyncio +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Only run this test locally since its not stable") @pytest.mark.skipif(os.getenv("FIREWORKS_API_KEY") in (None, ""), reason="FIREWORKS_API_KEY not set") @evaluation_test( input_messages=[[[Message(role="user", content="Use tools to count total tracks in the database.")]]],