|
import asyncio
import base64
import io
import json
import os
import tempfile
import time
import zipfile
from typing import Any, Dict, List, Optional, Tuple

import requests

from eval_protocol.models import EvaluationRow, Message, Status

from .rollout_processor import RolloutProcessor
from .types import RolloutProcessorConfig
| 16 | + |
| 17 | + |
class GithubActionRolloutProcessor(RolloutProcessor):
    """
    Rollout processor that dispatches and monitors a GitHub Actions workflow per evaluation row.

    Expected GitHub Actions workflow:
    - Workflow dispatch with inputs: model, messages_b64, tools_b64, rollout_id, etc.
    - Workflow uploads artifact named "rollout-trace-{rollout_id}" containing trace JSON
    - Trace JSON format: {"status": "success"|"error", "messages": [...], "tools": [...], "error": str?}
    """

    # Message fields forwarded to the workflow; everything else is stripped.
    _ALLOWED_MESSAGE_FIELDS = frozenset({"role", "content", "tool_calls", "tool_call_id", "name"})

    def __init__(
        self,
        *,
        owner: str,
        repo: str,
        workflow_id: str,
        ref: str = "main",
        github_token: Optional[str] = None,
        poll_interval: float = 3.0,
        timeout_seconds: float = 1800.0,
    ):
        """Configure the target repository, workflow, and polling behavior.

        Args:
            owner: GitHub organization or user that owns the repository.
            repo: Repository name.
            workflow_id: Workflow file name or numeric id to dispatch.
            ref: Git ref (branch/tag) to run the workflow on.
            github_token: Token for the GitHub REST API; falls back to the
                GITHUB_TOKEN then GH_TOKEN environment variables.
            poll_interval: Seconds between run-status polls.
            timeout_seconds: Total budget to wait for a run to complete.
        """
        self._owner = owner
        self._repo = repo
        self._workflow_id = workflow_id
        self._ref = ref
        self._poll_interval = poll_interval
        self._timeout_seconds = timeout_seconds
        self._token = github_token or os.getenv("GITHUB_TOKEN") or os.getenv("GH_TOKEN")

    def _headers(self) -> Dict[str, str]:
        """Build GitHub REST API headers; include auth only when a token is configured."""
        headers = {"Accept": "application/vnd.github+json"}
        if self._token:
            headers["Authorization"] = f"Bearer {self._token}"
        return headers

    def _api_base(self) -> str:
        """Base URL for all repo-scoped GitHub REST API calls."""
        return f"https://api.github.com/repos/{self._owner}/{self._repo}"

    @staticmethod
    def _resolve_model(row: EvaluationRow, config: RolloutProcessorConfig) -> str:
        """Pick the model from row metadata, falling back to the processor config.

        Raises:
            ValueError: if neither the row nor the config names a model.
        """
        model: Optional[str] = None
        if row.input_metadata and row.input_metadata.completion_params:
            model = row.input_metadata.completion_params.get("model")
        if model is None and config.completion_params:
            model = config.completion_params.get("model")
        if model is None:
            raise ValueError("Model must be provided")
        return model

    @classmethod
    def _clean_messages(cls, messages: List[Any]) -> List[Dict[str, Any]]:
        """Normalize heterogeneous message objects to plain dicts with only the allowed fields."""
        cleaned: List[Dict[str, Any]] = []
        for message in messages:
            if hasattr(message, "model_dump"):
                raw = message.model_dump()
            elif isinstance(message, dict):
                raw = message
            else:
                # Fallback for plain objects: pull the allowed attributes directly.
                raw = {field: getattr(message, field, None) for field in cls._ALLOWED_MESSAGE_FIELDS}
            cleaned.append(
                {k: v for k, v in raw.items() if k in cls._ALLOWED_MESSAGE_FIELDS and v is not None}
            )
        return cleaned

    def _build_inputs(self, row: EvaluationRow, model: str) -> Dict[str, Any]:
        """Assemble workflow_dispatch inputs (messages/tools base64-encoded JSON).

        NOTE(review): the GitHub API requires dispatch input values to be strings —
        confirm rollout_id is always a string upstream.
        """
        inputs: Dict[str, Any] = {
            "model": model,
            "rollout_id": row.execution_metadata.rollout_id,
            "messages_b64": base64.b64encode(json.dumps(self._clean_messages(row.messages)).encode()).decode(),
        }
        if row.tools:
            inputs["tools_b64"] = base64.b64encode(json.dumps(row.tools).encode()).decode()
        return inputs

    def _dispatch_workflow(self, inputs: Dict[str, Any]) -> None:
        """Blocking: POST a workflow_dispatch event. Raises on HTTP error."""
        url = f"{self._api_base()}/actions/workflows/{self._workflow_id}/dispatches"
        response = requests.post(
            url, json={"ref": self._ref, "inputs": inputs}, headers=self._headers(), timeout=30
        )
        response.raise_for_status()

    def _list_recent_runs(self) -> List[Dict[str, Any]]:
        """Blocking: fetch the 10 most recent workflow_dispatch runs on the configured ref."""
        url = f"{self._api_base()}/actions/workflows/{self._workflow_id}/runs"
        params = {"event": "workflow_dispatch", "branch": self._ref, "per_page": 10}
        response = requests.get(url, params=params, headers=self._headers(), timeout=30)
        response.raise_for_status()
        return response.json().get("workflow_runs", [])

    @staticmethod
    def _pick_candidate_run(runs: List[Dict[str, Any]], rollout_id: Any) -> Optional[Dict[str, Any]]:
        """Find our run by name, falling back to the newest run in the list.

        NOTE(review): the newest-run fallback can attribute another row's run to
        this rollout when several rollouts dispatch concurrently — confirm the
        workflow sets its run name to "rollout-{rollout_id}".
        """
        preferred_name = f"rollout-{rollout_id}"
        for run in runs:
            if run.get("name") == preferred_name:
                return run
        if runs:
            return max(runs, key=lambda run: run.get("id", 0))
        return None

    def _download_trace(self, run_id: int, rollout_id: Any) -> Tuple[Optional[Dict[str, Any]], bool]:
        """Blocking: fetch the run's trace artifact and parse its JSON payload.

        Returns:
            (trace_data, artifact_found). artifact_found is False when no artifact
            named "rollout-trace-{rollout_id}" exists (the caller then leaves the
            conclusion-derived status untouched, matching prior behavior).
            trace_data is None when the artifact exists but the trace file is absent.
        """
        url = f"{self._api_base()}/actions/runs/{run_id}/artifacts"
        response = requests.get(url, headers=self._headers(), timeout=30)
        response.raise_for_status()
        artifacts = response.json().get("artifacts", [])

        wanted_name = f"rollout-trace-{rollout_id}"
        artifact = next((a for a in artifacts if a.get("name") == wanted_name), None)
        if artifact is None:
            return None, False

        download = requests.get(artifact["archive_download_url"], headers=self._headers(), timeout=60)
        download.raise_for_status()

        # Read the zip entirely in memory. This replaces the previous
        # NamedTemporaryFile write-then-reopen-by-name pattern, which fails on
        # Windows (the file cannot be reopened while still open) and wrote to
        # disk needlessly.
        with zipfile.ZipFile(io.BytesIO(download.content)) as zip_file:
            trace_filename = f"rollout_trace_{rollout_id}.json"
            if trace_filename in zip_file.namelist():
                with zip_file.open(trace_filename) as trace_file:
                    return json.loads(trace_file.read().decode("utf-8")), True
        return None, True

    @staticmethod
    def _apply_trace(row: EvaluationRow, trace_data: Optional[Dict[str, Any]]) -> None:
        """Merge a successful trace into the row, or record a rollout error."""
        if trace_data and trace_data.get("status") == "success":
            trace_messages = trace_data.get("messages", [])
            if len(trace_messages) > len(row.messages):
                row.messages = [Message(**msg) if isinstance(msg, dict) else msg for msg in trace_messages]
                if trace_data.get("tools"):
                    row.tools = trace_data["tools"]
            else:
                # No new assistant/tool messages were produced by the workflow.
                row.rollout_status = Status.rollout_error("Rollout finished with same number of messages")
        else:
            error_msg = trace_data.get("error", "Unknown error") if trace_data else "No trace data found"
            row.rollout_status = Status.rollout_error(f"Rollout failed: {error_msg}")

    def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
        """Dispatch one workflow run per row and return a task per row.

        Each task: dispatch -> poll until the run completes (or the timeout
        elapses) -> download the trace artifact -> merge it into the row.
        Concurrency is bounded by config.semaphore.
        """

        async def _process_row(row: EvaluationRow) -> EvaluationRow:
            start_time = time.perf_counter()
            rollout_id = row.execution_metadata.rollout_id

            model = self._resolve_model(row, config)
            inputs = self._build_inputs(row, model)

            # Blocking HTTP calls run in threads so the event loop stays responsive.
            await asyncio.to_thread(self._dispatch_workflow, inputs)

            # Poll until our run completes or we exhaust the time budget.
            deadline = time.time() + self._timeout_seconds
            run_id: Optional[int] = None
            while time.time() < deadline:
                runs = await asyncio.to_thread(self._list_recent_runs)
                candidate_run = self._pick_candidate_run(runs, rollout_id)
                if candidate_run and candidate_run.get("status") == "completed":
                    run_id = candidate_run.get("id")
                    row.rollout_status = self._map_conclusion_to_status(candidate_run.get("conclusion"))
                    break
                await asyncio.sleep(self._poll_interval)
            else:
                row.rollout_status = Status.rollout_error(
                    f"GitHub Actions run timed out after {self._timeout_seconds} seconds"
                )
                row.execution_metadata.duration_seconds = time.perf_counter() - start_time
                return row

            if run_id:
                trace_data, artifact_found = await asyncio.to_thread(self._download_trace, run_id, rollout_id)
                if artifact_found:
                    self._apply_trace(row, trace_data)

            row.execution_metadata.duration_seconds = time.perf_counter() - start_time
            return row

        semaphore = config.semaphore

        async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
            async with semaphore:
                return await _process_row(r)

        return [asyncio.create_task(_sem_wrapper(row)) for row in rows]

    @staticmethod
    def _map_conclusion_to_status(conclusion: Optional[str]) -> Status:
        """Translate a GitHub run conclusion string into a rollout Status."""
        if conclusion == "success":
            return Status.finished("GitHub Actions workflow succeeded")
        if conclusion in {"failure", "timed_out", "cancelled", "stale"}:
            return Status.rollout_error(f"GitHub Actions workflow concluded with '{conclusion}'")
        return Status(code=Status.Code.UNKNOWN, message=f"GitHub Actions workflow concluded with '{conclusion}'")

    def cleanup(self) -> None:
        """No persistent resources are held; nothing to clean up."""
        return None
0 commit comments