eval-protocol
diff --git a/‎.github/workflows/rollout.yml‎
Lines changed: 8 additions & 17 deletions b/‎.github/workflows/rollout.yml‎
Lines changed: 8 additions & 17 deletions
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎eval_protocol/__init__.py‎
Lines changed: 21 additions & 2 deletions b/‎eval_protocol/__init__.py‎
Lines changed: 21 additions & 2 deletions
diff --git a/‎eval_protocol/adapters/fireworks_tracing.py‎
Lines changed: 8 additions & 6 deletions b/‎eval_protocol/adapters/fireworks_tracing.py‎
Lines changed: 8 additions & 6 deletions
diff --git a/‎eval_protocol/mcp/execution/policy.py‎
Lines changed: 18 additions & 6 deletions b/‎eval_protocol/mcp/execution/policy.py‎
Lines changed: 18 additions & 6 deletions
diff --git a/‎eval_protocol/models.py‎
Lines changed: 3 additions & 1 deletion b/‎eval_protocol/models.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎eval_protocol/proxy/.env.example‎
Lines changed: 2 additions & 0 deletions b/‎eval_protocol/proxy/.env.example‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎eval_protocol/proxy/Dockerfile.gateway‎
Lines changed: 23 additions & 0 deletions b/‎eval_protocol/proxy/Dockerfile.gateway‎
Lines changed: 23 additions & 0 deletions
@@ -1,27 +1,26 @@
 name: Eval Protocol Rollout
 
-run-name: rollout:${{ inputs.rollout_id }}
+run-name: rollout:${{ fromJSON(inputs.metadata).rollout_id }}
 
 on:
   workflow_dispatch:
     inputs:
       model:
-        description: 'Model to use for the rollout'
+        description: 'Model to use'
         required: true
         type: string
-      rollout_id:
-        description: 'Rollout ID for tracking'
+      metadata:
+        description: 'JSON serialized metadata object'
         required: true
         type: string
-      prompt:
-        description: 'User prompt for the rollout'
+      model_base_url:
+        description: 'Base URL for the model API'
         required: true
         type: string
 
 jobs:
   rollout:
     runs-on: ubuntu-latest
-    name: rollout-${{ inputs.rollout_id }}
 
     steps:
       - name: Checkout code
@@ -43,13 +42,5 @@ jobs:
         run: |
           python tests/github_actions/rollout_worker.py \
             --model "${{ inputs.model }}" \
-            --rollout-id "${{ inputs.rollout_id }}" \
-            --prompt "${{ inputs.prompt }}"
-
-      - name: Upload rollout trace
-        uses: actions/upload-artifact@v4
-        if: always()  # Upload even if the rollout failed
-        with:
-          name: rollout-trace-${{ inputs.rollout_id }}
-          path: rollout_trace_${{ inputs.rollout_id }}.json
-          retention-days: 7
+            --metadata '${{ inputs.metadata }}' \
+            --model-base-url "${{ inputs.model_base_url }}"
@@ -105,6 +105,9 @@ env.bak/
 venv.bak/
 *.backup
 
+# Secrets
+secrets.yaml
+
 # Spyder project settings
 .spyderproject
 .spyproject
 
@@ -29,8 +29,13 @@
 from .resources import create_llm_resource
 from .reward_function import RewardFunction
 from .typed_interface import reward_function
-from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
-from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor
+from .quickstart.aha_judge import aha_judge
+from .utils.evaluation_row_utils import (
+    multi_turn_assistant_to_ground_truth,
+    assistant_to_ground_truth,
+    filter_longest_conversation,
+)
+from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor, GithubActionRolloutProcessor
 from .pytest.remote_rollout_processor import create_elasticsearch_config_from_env
 from .pytest.parameterize import DefaultParameterIdGenerator
 from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
@@ -74,6 +79,14 @@
 except ImportError:
     WeaveAdapter = None
 
+try:
+    from .proxy import create_app, AuthProvider, AccountInfo
+except ImportError:
+    create_app = None
+    AuthProvider = None
+    AccountInfo = None
+
+
 warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
 
 __all__ = [
@@ -85,6 +98,7 @@
     "DataLoaderConfig",
     "Status",
     "RemoteRolloutProcessor",
+    "GithubActionRolloutProcessor",
     "InputMetadata",
     "EvaluationRow",
     "DefaultParameterIdGenerator",
@@ -93,6 +107,7 @@
     "aha_judge",
     "multi_turn_assistant_to_ground_truth",
     "assistant_to_ground_truth",
+    "filter_longest_conversation",
     "evaluation_test",
     "SingleTurnRolloutProcessor",
     "OpenAIResponsesAdapter",
@@ -137,6 +152,10 @@
     "RolloutMetadata",
     "StatusResponse",
     "create_langfuse_config_tags",
+    # Proxy
+    "create_app",
+    "AuthProvider",
+    "AccountInfo",
 ]
 
 from . import _version
 
@@ -7,9 +7,9 @@
 from __future__ import annotations
 import logging
 import requests
-import time
 from datetime import datetime
 from typing import Any, Dict, List, Optional, Protocol
+import os
 
 from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
 from .base import BaseAdapter
@@ -343,15 +343,17 @@ def get_evaluation_rows(
         # Remove None values
         params = {k: v for k, v in params.items() if v is not None}
 
-        # Make request to proxy
+        # Make request to proxy (using pointwise for efficiency)
         if self.project_id:
-            url = f"{self.base_url}/v1/project_id/{self.project_id}/traces"
+            url = f"{self.base_url}/v1/project_id/{self.project_id}/traces/pointwise"
         else:
-            url = f"{self.base_url}/v1/traces"
+            url = f"{self.base_url}/v1/traces/pointwise"
+
+        headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
 
         result = None
         try:
-            response = requests.get(url, params=params, timeout=self.timeout)
+            response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
             response.raise_for_status()
             result = response.json()
         except requests.exceptions.HTTPError as e:
@@ -365,7 +367,7 @@ def get_evaluation_rows(
                 except Exception:  # In case e.response.json() fails
                     error_msg = f"Proxy error: {e.response.text}"
 
-            logger.error("Failed to fetch traces from proxy: %s", error_msg)
+            logger.error("Failed to fetch traces from proxy (HTTP %s): %s", e.response.status_code, error_msg)
             return eval_rows
         except requests.exceptions.RequestException as e:
             # Non-HTTP errors (network issues, timeouts, etc.)
 
@@ -5,15 +5,14 @@
 Rewritten to use LiteLLM for unified retry logic, caching, and provider support.
 """
 
-import asyncio
-import json
 import logging
 import os
-from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Dict, List, Literal, Optional
 
 import litellm
-from litellm import acompletion, completion
+from litellm import acompletion
+from litellm.types.utils import ModelResponse
+from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
 from litellm.caching.caching import Cache
 from litellm.caching.dual_cache import DualCache
 from litellm.caching.in_memory_cache import InMemoryCache
@@ -194,7 +193,20 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
             request_params["tools"] = tools
 
         try:
-            response = await acompletion(model=self.model_id, **request_params)
+            if request_params.get("stream") is True:
+                chunks = []
+                stream = await acompletion(model=self.model_id, **request_params)
+
+                assert isinstance(stream, CustomStreamWrapper), "Stream should be a CustomStreamWrapper"
+
+                async for chunk in stream:  # pyright: ignore[reportGeneralTypeIssues]
+                    chunks.append(chunk)
+                response = litellm.stream_chunk_builder(chunks, messages)
+            else:
+                response = await acompletion(model=self.model_id, **request_params)
+
+            assert response is not None, "Response is None"
+            assert isinstance(response, ModelResponse), "Response should be ModelResponse"
 
             # Log cache hit/miss for monitoring
             hidden = getattr(response, "_hidden_params", {})
 
@@ -598,7 +598,9 @@ class EvaluationRow(BaseModel):
     model_config = ConfigDict(extra="allow")
 
     # Core OpenAI ChatCompletion compatible conversation data
-    messages: List[Message] = Field(description="List of messages in the conversation. Also known as a trajectory.")
+    messages: List[Message] = Field(
+        default_factory=list, description="List of messages in the conversation. Also known as a trajectory."
+    )
 
     # Tool and function call information
     tools: Optional[List[Dict[str, Any]]] = Field(
 
@@ -0,0 +1,2 @@
+# To configure API keys for other model providers used by the proxy, copy this file to .env and fill in the values below.
+OPENAI_API_KEY=sk-proj-xxx
@@ -0,0 +1,23 @@
+# Metadata Extraction Gateway - Sits in front of LiteLLM
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Prevent Python from buffering stdout/stderr
+ENV PYTHONUNBUFFERED=1
+
+# Copy requirements file
+COPY ./requirements.txt /app/requirements.txt
+
+# Install dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the proxy package
+COPY ./proxy_core /app/proxy_core
+
+# Expose port
+EXPOSE 4000
+
+# Run the gateway as a module
+# LITELLM_URL is expected to be set by the environment (docker-compose or Cloud Run)
+CMD ["python", "-m", "proxy_core.main"]
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# To configure API keys for other model providers used by the proxy, copy this file to .env and fill in the values below.`
	`2`	`+OPENAI_API_KEY=sk-proj-xxx`