Skip to content

Commit 4de4d9e

Browse files
author
Dylan Huang
authored
add typescript simple example (#218)
* add typescript simple example * publish npm package for eval protocol (#219) * publish typescript SDK * add createLangfuseConfigTags function and update version to 0.1.1 * use eval-protocol npm dependency * refactor statusInfoSchema to use a record type and update version to 0.1.2 * add eval_metadata to langfuse_row in RemoteRolloutProcessor * Refactor data generator function name and update eval-protocol version to 0.1.2 * done
1 parent aa8ee84 commit 4de4d9e

File tree

17 files changed

+2719
-0
lines changed

17 files changed

+2719
-0
lines changed

eval_protocol/pytest/remote_rollout_processor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ def _load_data():
167167
elif len(output_rows) == 1: # Return the Langfuse row
168168
langfuse_row = output_rows[0]
169169
langfuse_row.input_metadata.completion_params = row.input_metadata.completion_params
170+
langfuse_row.eval_metadata = row.eval_metadata
170171
return langfuse_row
171172
else:
172173
raise ValueError("RemoteRolloutProcessor's output_data_loader should return exactly one row.")
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import os
2+
from typing import List
3+
import atexit
4+
5+
import pytest
6+
7+
from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
8+
from eval_protocol.models import EvaluationRow, Message
9+
from eval_protocol.pytest import evaluation_test
10+
from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor
11+
from eval_protocol.adapters.langfuse import create_langfuse_adapter
12+
from eval_protocol.quickstart.utils import filter_longest_conversation
13+
14+
# Module-level registry of rollout_ids observed during this test session.
# fetch_langfuse_traces() adds entries; the autouse fixture below clears the
# set before each test and verifies coverage afterwards.
ROLLOUT_IDS = set()


@pytest.fixture(autouse=True)
def check_rollout_coverage():
    """Ensure we processed all expected rollout_ids.

    Clears the registry before the test runs, yields control to the test,
    then asserts that exactly the expected number of rollout_ids was seen.

    Note: no ``global`` statement is needed — ROLLOUT_IDS is only mutated in
    place (``.clear()`` / ``.add()``), never rebound.
    """
    ROLLOUT_IDS.clear()
    yield

    # Verify we've seen the expected number of rollout_ids after test is done.
    # Must match the number of rows produced by rows() below.
    expected_rollout_count = 3
    assert len(ROLLOUT_IDS) == expected_rollout_count, (
        f"Expected to see {expected_rollout_count} rollout_ids, but only saw {len(ROLLOUT_IDS)}: {ROLLOUT_IDS}"
    )
29+
30+
31+
def fetch_langfuse_traces(rollout_id: str) -> List[EvaluationRow]:
    """Fetch evaluation rows from Langfuse tagged with *rollout_id*.

    Side effect: records the rollout_id in the module-level ROLLOUT_IDS set
    so the autouse coverage fixture can verify every rollout was processed.

    Args:
        rollout_id: Identifier of the remote rollout whose traces to fetch.

    Returns:
        All evaluation rows whose Langfuse tag matches ``rollout_id:<id>``.
    """
    # No `global` statement needed: the set is mutated in place, not rebound.
    ROLLOUT_IDS.add(rollout_id)

    adapter = create_langfuse_adapter()
    return adapter.get_evaluation_rows(tags=[f"rollout_id:{rollout_id}"])
37+
38+
39+
def langfuse_output_data_loader(rollout_id: str) -> DynamicDataLoader:
    """Build a DynamicDataLoader that pulls Langfuse traces for one rollout.

    The loader's single generator fetches all traces tagged with the given
    rollout_id; filter_longest_conversation keeps only the longest one.
    """

    def _fetch() -> List[EvaluationRow]:
        # Bound to this rollout_id; invoked lazily by the data loader.
        return fetch_langfuse_traces(rollout_id)

    return DynamicDataLoader(
        generators=[_fetch],
        preprocess_fn=filter_longest_conversation,
    )
43+
44+
45+
def rows() -> List[EvaluationRow]:
    """Produce three independent single-user-turn rows to trigger responses.

    Each row is constructed separately. The previous implementation returned
    the *same* EvaluationRow object three times, so any in-place mutation of
    one rollout's row (metadata, messages) would silently leak into the other
    two. Three distinct instances keep the rollouts isolated.
    """
    # Minimal single-user-turn message to trigger a response.
    return [
        EvaluationRow(messages=[Message(role="user", content="What is the capital of France?")])
        for _ in range(3)
    ]
49+
50+
51+
# Skipped in CI: requires a locally running remote rollout server (the
# TypeScript example) and Langfuse credentials in the environment.
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Only run this test locally (skipped in CI)")
@pytest.mark.parametrize("completion_params", [{"model": "gpt-5"}])
@evaluation_test(
    # Three rows are generated (see rows()), so three rollouts are triggered;
    # the autouse fixture asserts all three rollout_ids were observed.
    data_loaders=DynamicDataLoader(
        generators=[rows],
    ),
    rollout_processor=RemoteRolloutProcessor(
        # Assumes the example TypeScript server is already listening here.
        remote_base_url="http://127.0.0.1:3000",
        timeout_seconds=30,
        # After the remote rollout terminates, the processed rows are
        # re-fetched from Langfuse (tagged by rollout_id) via this loader.
        output_data_loader=langfuse_output_data_loader,
    ),
)
async def test_remote_rollout_and_fetch_langfuse_typescript(row: EvaluationRow) -> EvaluationRow:
    """
    End-to-end test:
    - remote server started at import time
    - trigger remote rollout via RemoteRolloutProcessor (calls init/status)
    - fetch traces from Langfuse filtered by metadata via output_data_loader; FAIL if none found
    """
    # The row delivered here is the Langfuse-fetched conversation; its first
    # message should still be the original user prompt.
    assert row.messages[0].content == "What is the capital of France?", "Row should have correct message content"
    # fetch_langfuse_traces() recorded this rollout_id when the output data
    # loader ran, so it must be present in the module-level registry.
    assert row.execution_metadata.rollout_id in ROLLOUT_IDS, (
        f"Row rollout_id {row.execution_metadata.rollout_id} should be in tracked rollout_ids: {ROLLOUT_IDS}"
    )

    return row
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
!package.json
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# TypeScript Express Server for Remote Rollout Processor
2+
3+
This TypeScript Express server implements the Remote Rollout Processor API contract as specified in the Eval Protocol documentation.
4+
5+
## Features
6+
7+
- **POST /init** - Initialize a rollout with validation using Zod schemas
8+
- **GET /status** - Check the status of a rollout
9+
- **GET /health** - Health check endpoint
10+
- Full TypeScript support with strict type checking
11+
- Request validation using Zod
12+
- Error handling and logging
13+
- CORS and security middleware
14+
15+
## Installation
16+
17+
```bash
18+
pnpm install
19+
```
20+
21+
## Development
22+
23+
```bash
24+
# Run in development mode with hot reload
25+
pnpm run dev
26+
27+
# Build for production
28+
pnpm run build
29+
30+
# Run production build
31+
pnpm run start
32+
```
33+
34+
## API Endpoints
35+
36+
### POST /init
37+
38+
Initialize a new rollout.
39+
40+
**Request Body:**
41+
```json
42+
{
43+
"rollout_id": "rll_ijkl",
44+
"model": "openai/gpt-4o",
45+
"messages": [
46+
{ "role": "user", "content": "Hello" }
47+
],
48+
"tools": null,
49+
"metadata": {
50+
"invocation_id": "ivk_abcd",
51+
"experiment_id": "exp_efgh",
52+
"rollout_id": "rll_ijkl",
53+
"run_id": "run_123",
54+
"row_id": "row_123"
55+
},
56+
"num_turns": 2
57+
}
58+
```
59+
60+
**Response:**
61+
```json
62+
{
63+
"status": "accepted",
64+
"rollout_id": "rll_ijkl",
65+
"message": "Rollout initialized successfully"
66+
}
67+
```
68+
69+
### GET /status
70+
71+
Check the status of a rollout.
72+
73+
**Query Parameters:**
74+
- `rollout_id` (required): The ID of the rollout to check
75+
76+
**Response (Running):**
77+
```json
78+
{
79+
"terminated": false
80+
}
81+
```
82+
83+
**Response (Completed):**
84+
```json
85+
{
86+
"terminated": true,
87+
"info": {
88+
"reason": "completed",
89+
"ended_at": "2025-01-24T12:34:56Z",
90+
"num_turns": 2
91+
}
92+
}
93+
```
94+
95+
### GET /health
96+
97+
Health check endpoint.
98+
99+
**Response:**
100+
```json
101+
{
102+
"status": "healthy",
103+
"timestamp": "2025-01-24T12:34:56Z"
104+
}
105+
```
106+
107+
## Usage with Eval Protocol
108+
109+
This server can be used with the Eval Protocol's `RemoteRolloutProcessor`:
110+
111+
```python
112+
from eval_protocol import (
113+
evaluation_test,
114+
DynamicDataLoader,
115+
RemoteRolloutProcessor,
116+
)
117+
118+
@pytest.mark.parametrize("completion_params", [{"model": "openai/gpt-4o"}])
119+
@evaluation_test(
120+
    data_loaders=DynamicDataLoader(generators=[lambda: [EvaluationRow(messages=[Message(role="user", content="Hello")])]]),
121+
rollout_processor=RemoteRolloutProcessor(
122+
remote_base_url="http://localhost:3000",
123+
output_data_loader=create_output_data_loader,
124+
)
125+
)
126+
def test_remote_http(row: EvaluationRow) -> EvaluationRow:
127+
return row
128+
```
129+
130+
## Configuration
131+
132+
The server runs on port 3000 by default. You can change this by setting the `PORT` environment variable:
133+
134+
```bash
135+
PORT=8080 pnpm run dev
136+
```
137+
138+
## Error Handling
139+
140+
The server includes comprehensive error handling:
141+
- Request validation errors return 400 with detailed error messages
142+
- Missing rollout IDs return 404
143+
- Server errors return 500 with error details
144+
- All errors are logged to the console
145+
146+
## Development Notes
147+
148+
- The server simulates async rollout execution with a 1-second delay per turn
149+
- Rollout states are stored in memory (not persistent across restarts)
150+
- All requests are validated using Zod schemas
151+
- TypeScript strict mode is enabled for better type safety
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import * as dotenv from "dotenv";
2+
3+
// Helper to resolve the root of the repo (for .env loading, etc.)
4+
import path from "path";
5+
import { fileURLToPath } from "url";
6+
import fs from "fs";
7+
8+
// Returns the absolute path to the root of the repo (where .git or .env is found)
9+
function getRepoRoot(): string {
10+
// __dirname is not available in ES modules, so use fileURLToPath
11+
const currentDir = path.dirname(fileURLToPath(import.meta.url));
12+
let dir = currentDir;
13+
while (true) {
14+
if (
15+
fs.existsSync(path.join(dir, ".git")) ||
16+
fs.existsSync(path.join(dir, ".env"))
17+
) {
18+
return dir;
19+
}
20+
const parent = path.dirname(dir);
21+
if (parent === dir) break;
22+
dir = parent;
23+
}
24+
// Fallback to current directory if not found
25+
return currentDir;
26+
}
27+
28+
export const REPO_ROOT = getRepoRoot();
29+
30+
// Load environment variables from .env at the root of the repo
31+
dotenv.config({ path: path.join(REPO_ROOT, ".env") });
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import { NodeSDK } from "@opentelemetry/sdk-node";
2+
import { LangfuseSpanProcessor } from "@langfuse/otel";
3+
import "./env";
4+
5+
// Configure an OpenTelemetry NodeSDK whose spans are exported to Langfuse.
// Importing this module has the side effect of starting tracing.
const sdk = new NodeSDK({
  spanProcessors: [
    new LangfuseSpanProcessor({
      // Non-null assertions: these env vars are assumed to be populated by
      // "./env" (loaded above) — NOTE(review): the process will pass
      // undefined to the processor if any is missing; confirm .env contents.
      publicKey: process.env["LANGFUSE_PUBLIC_KEY"]!,
      secretKey: process.env["LANGFUSE_SECRET_KEY"]!,
      baseUrl: process.env["LANGFUSE_HOST"]!,
    }),
  ],
});

sdk.start();
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
{
2+
"name": "typescript-server",
3+
"version": "1.0.0",
4+
"description": "TypeScript Express server for Remote Rollout Processor",
5+
"main": "dist/server.js",
6+
"type": "module",
7+
"scripts": {
8+
"build": "tsc",
9+
"start": "node dist/server.js",
10+
"dev": "tsx server.ts",
11+
"test": "node test-server.js",
12+
"test:server": "node test-server.js"
13+
},
14+
"keywords": [],
15+
"author": "",
16+
"license": "ISC",
17+
"packageManager": "pnpm@10.6.2",
18+
"dependencies": {
19+
"@langfuse/openai": "^4.2.0",
20+
"@langfuse/otel": "^4.2.0",
21+
"@langfuse/tracing": "^4.2.0",
22+
"@opentelemetry/sdk-node": "^0.205.0",
23+
"cors": "^2.8.5",
24+
"dotenv": "^17.2.2",
25+
"eval-protocol": "^0.1.2",
26+
"express": "^5.1.0",
27+
"helmet": "^7.1.0",
28+
"openai": "^5.23.0"
29+
},
30+
"devDependencies": {
31+
"@types/cors": "^2.8.17",
32+
"@types/express": "^5.0.0",
33+
"@types/node": "^20.10.0",
34+
"tsx": "^4.6.0",
35+
"typescript": "^5.9.2",
36+
"zod": "^3.22.4"
37+
}
38+
}

0 commit comments

Comments
 (0)