Skip to content

Commit 1489b63

Browse files
committed
gsm8k math example
1 parent 680e719 commit 1489b63

File tree

4 files changed

+255
-10
lines changed

4 files changed

+255
-10
lines changed
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .dynamic_data_loader import DynamicDataLoader
22
from .inline_data_loader import InlineDataLoader
3+
from .fireworks_dataset_loader import FireworksDatasetLoader
34

4-
__all__ = ["DynamicDataLoader", "InlineDataLoader"]
5+
__all__ = ["DynamicDataLoader", "InlineDataLoader", "FireworksDatasetLoader"]
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
from __future__ import annotations
2+
3+
import json
4+
import os
5+
import shutil
6+
import subprocess
7+
import tempfile
8+
from dataclasses import dataclass
9+
from pathlib import Path
10+
from typing import Callable, Optional, cast
11+
from urllib.parse import quote
12+
13+
import requests
14+
15+
from eval_protocol.auth import get_fireworks_api_base, get_fireworks_api_key
16+
from eval_protocol.common_utils import load_jsonl
17+
from eval_protocol.data_loader.models import (
18+
DataLoaderResult,
19+
DataLoaderVariant,
20+
EvaluationDataLoader,
21+
)
22+
from eval_protocol.models import EvaluationRow, JSONType
23+
24+
25+
def _default_dataset_adapter(rows: list[dict[str, object]]) -> list[EvaluationRow]:
    """
    Convert Fireworks dataset rows into EvaluationRow.

    Preferred shape:
    - { messages: [...], ground_truth?: any }

    Fallback (legacy demo):
    - { user_query: str, ground_truth_for_eval?: any }

    Args:
        rows: Raw JSONL rows as parsed dicts.

    Returns:
        One EvaluationRow per input row.
    """
    # Defer import to avoid cycles — but hoisted out of the loop so it
    # executes once per call instead of once per row.
    from eval_protocol.models import Message

    converted: list[EvaluationRow] = []
    for row in rows:
        messages = row.get("messages")
        # Prefer the canonical key; fall back to the legacy demo key.
        ground_truth = cast(JSONType, row.get("ground_truth"))
        if ground_truth is None:
            ground_truth = cast(JSONType, row.get("ground_truth_for_eval"))

        if isinstance(messages, list) and messages:
            normalized_messages: list[Message] = []
            for m in messages:
                if isinstance(m, Message):
                    normalized_messages.append(m)
                elif isinstance(m, dict):
                    # Let Message handle content types (str or list)
                    normalized_messages.append(Message.model_validate(m))
            converted.append(EvaluationRow(messages=normalized_messages, ground_truth=ground_truth))
            continue

        # Fallback: single-turn user_query
        user_query = str(row.get("user_query", ""))
        converted.append(EvaluationRow(messages=[Message(role="user", content=user_query)], ground_truth=ground_truth))
    return converted
60+
61+
62+
def _download_fireworks_dataset_jsonl(
    dataset_ref: str,
    *,
    api_key: Optional[str] = None,
    api_base: Optional[str] = None,
) -> Path:
    """
    Download a Fireworks dataset to a temporary file and return its path.

    This mirrors `firectl download dataset <ref>` behavior using HTTP APIs.
    We expect a single JSONL file under dataset/<name>/dataset_with_ground_truth_column_*.jsonl

    Args:
        dataset_ref: Full dataset reference, e.g.
            "accounts/<acct>/datasets/<name>".
        api_key: Optional override; otherwise resolved via
            get_fireworks_api_key() (HTTP path only).
        api_base: Optional override; otherwise resolved via
            get_fireworks_api_base() (HTTP path only).

    Returns:
        Path to a downloaded JSONL file inside a fresh temp directory.

    Raises:
        RuntimeError: if firectl fails, no JSONL file is found, the API key
            is missing, or no signed download URL is returned.
        requests.HTTPError: via raise_for_status() on any failed HTTP call.
    """
    # Prefer firectl if available, as in user's example
    firectl_bin = shutil.which("firectl")
    if firectl_bin:
        # NOTE(review): tmp_root is never removed on failure — confirm
        # whether leaked temp dirs are acceptable here.
        tmp_root = Path(tempfile.mkdtemp(prefix="ep_fw_ds_"))
        # firectl requires an explicit --output-dir
        cmd = [firectl_bin, "download", "dataset", dataset_ref, "--output-dir", str(tmp_root)]
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if proc.returncode != 0:
            raise RuntimeError(f"firectl failed: {proc.stderr or proc.stdout}")
        # Expected structure: <tmp_root>/dataset/<name>/*.jsonl
        name_part = dataset_ref.split("/datasets/")[-1] if "/datasets/" in dataset_ref else None
        candidate_dir = tmp_root / "dataset"
        if name_part:
            candidate_dir = candidate_dir / name_part
        # Fall back to scanning the whole temp root if the expected
        # directory layout is absent.
        jsonl_files = (
            list(candidate_dir.rglob("*.jsonl")) if candidate_dir.exists() else list((tmp_root).rglob("*.jsonl"))
        )
        if not jsonl_files:
            raise RuntimeError("No JSONL files found after firectl download")
        # Prefer ground_truth jsonl
        jsonl_files.sort(key=lambda p: (0 if "ground_truth" in p.name else 1, 0 if p.suffix == ".jsonl" else 1))
        return jsonl_files[0]

    # Fallback to HTTP API
    resolved_key = api_key or get_fireworks_api_key()
    if not resolved_key:
        raise RuntimeError("FIREWORKS_API_KEY is required to download Fireworks datasets")

    base = (api_base or get_fireworks_api_base()).rstrip("/")
    headers = {"Authorization": f"Bearer {resolved_key}"}

    # safe="" percent-encodes the slashes in the dataset ref so the whole
    # ref becomes a single URL path segment.
    encoded_ref = quote(dataset_ref, safe="")
    list_url = f"{base}/v1/datasets/{encoded_ref}/files"
    resp = requests.get(list_url, headers=headers, timeout=60)
    resp.raise_for_status()
    payload = resp.json()
    files = payload.get("files", []) if isinstance(payload, dict) else []
    if not files:
        raise RuntimeError(f"No files found for dataset {dataset_ref}")

    def _score(name: str) -> tuple[int, int]:
        # Sort key: ground-truth files first, then .jsonl files.
        name_lower = name.lower()
        return (
            0 if "ground_truth" in name_lower else 1,
            0 if name_lower.endswith(".jsonl") else 1,
        )

    # Pick the best-ranked file that is actually a JSONL file.
    files_sorted = sorted(files, key=lambda f: _score(str(f.get("name", ""))))
    chosen = None
    for f in files_sorted:
        name = str(f.get("name", ""))
        if name.endswith(".jsonl"):
            chosen = f
            break
    if not chosen:
        raise RuntimeError(f"No JSONL file found for dataset {dataset_ref}")

    # The file identifier key varies; try the known alternatives in order.
    file_id = chosen.get("id") or chosen.get("file_id") or chosen.get("name")
    encoded_file = quote(str(file_id), safe="")
    # The :download endpoint returns a short-lived signed URL rather than
    # the file bytes themselves.
    dl_url = f"{base}/v1/datasets/{encoded_ref}/files/{encoded_file}:download"
    dl_resp = requests.get(dl_url, headers=headers, timeout=60)
    dl_resp.raise_for_status()
    dl_payload = dl_resp.json()
    signed_url = dl_payload.get("url") or dl_payload.get("signed_url")
    if not signed_url:
        raise RuntimeError("Failed to obtain signed URL for dataset file download")

    # Stream the signed URL to disk in 64 KiB chunks to avoid holding the
    # whole dataset in memory.
    tmp_dir = Path(tempfile.mkdtemp(prefix="ep_fw_ds_"))
    out_path = tmp_dir / Path(str(chosen.get("name", "dataset.jsonl"))).name
    with requests.get(str(signed_url), stream=True, timeout=300) as r:
        r.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1 << 16):
                if chunk:
                    f.write(chunk)

    return out_path
151+
152+
153+
@dataclass(kw_only=True)
class FireworksDatasetLoader(EvaluationDataLoader):
    """
    Data loader that downloads a dataset from Fireworks and emits `EvaluationRow`s.

    - dataset_ref: e.g. "accounts/fireworks/datasets/demo-gsm8k-math-dataset-1000"
    - dataset_adapter: function to convert list[dict] -> list[EvaluationRow]. If not provided,
      defaults to an adapter that expects OpenAI-style `messages` rows and falls back to legacy demo shape.
    - max_rows: optional limit on number of rows to emit.
    - api_key/api_base: override resolution from environment if needed.
    """

    dataset_ref: str
    dataset_adapter: Callable[[list[dict[str, object]]], list[EvaluationRow]] | None = None
    max_rows: Optional[int] = None
    api_key: Optional[str] = None
    api_base: Optional[str] = None
    id: str = "fireworks"
    description: Optional[str] = None

    def variants(self) -> list[DataLoaderVariant]:
        """Return a single lazy variant that downloads, adapts, and cleans up."""

        def _load() -> DataLoaderResult:
            # Download happens only when the variant is actually invoked.
            jsonl_path = _download_fireworks_dataset_jsonl(
                self.dataset_ref, api_key=self.api_key, api_base=self.api_base
            )
            try:
                raw_rows = load_jsonl(str(jsonl_path))
                if self.max_rows is not None:
                    raw_rows = raw_rows[: self.max_rows]
                adapter = self.dataset_adapter or _default_dataset_adapter
                rows = adapter(raw_rows)
                return DataLoaderResult(
                    rows=rows,
                    type=self.__class__.__name__,
                    variant_id=self.id,
                    variant_description=self.description or f"Fireworks dataset {self.dataset_ref}",
                )
            finally:
                # Best-effort cleanup of the downloaded file and its temp dir;
                # never let cleanup failures mask the load result.
                # NOTE(review): in the firectl path the file can live under a
                # deeper temp root (ep_fw_ds_*/dataset/<name>/), so removing
                # only the immediate parent leaves that root behind — confirm
                # whether that leak is acceptable.
                try:
                    p = Path(jsonl_path)
                    parent = p.parent
                    # missing_ok=True already covers the not-exists case, so
                    # no separate exists() check is needed.
                    p.unlink(missing_ok=True)
                    try:
                        # Only succeeds if the directory is now empty.
                        parent.rmdir()
                    except OSError:
                        pass
                except Exception:
                    pass

        return [_load]

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,16 @@
2121
class SingleTurnRolloutProcessor(RolloutProcessor):
2222
"""Single turn rollout processor for direct LLM calls."""
2323

24+
    def __init__(self, *, drop_trailing_assistant_messages: bool = True) -> None:
        """
        Configure the single-turn rollout processor.

        Args:
            drop_trailing_assistant_messages: When True (default), strip any trailing
                assistant messages from the input conversation before calling the model.
                This helps when datasets include previous assistant turns and you want
                the model to answer the latest user query.
        """
        # Stored for use by __call__ when building the request payload.
        self.drop_trailing_assistant_messages = drop_trailing_assistant_messages
33+
2434
def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
2535
"""Generate single turn rollout tasks and return them for external handling."""
2636
# Do not modify global LiteLLM cache. Disable caching per-request instead.
@@ -32,7 +42,13 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
3242
if len(row.messages) == 0:
3343
raise ValueError("Messages is empty. Please provide a non-empty dataset")
3444

35-
messages_payload = [message.model_dump() for message in row.messages]
45+
# Optionally drop trailing assistant messages for single-turn prompts
46+
messages_for_request: List[Message] = list(row.messages)
47+
if self.drop_trailing_assistant_messages:
48+
while messages_for_request and messages_for_request[-1].role == "assistant":
49+
messages_for_request.pop()
50+
51+
messages_payload = [message.model_dump() for message in messages_for_request]
3652

3753
request_params = {"messages": messages_payload, **config.completion_params}
3854
# Ensure caching is disabled only for this request (review feedback)
@@ -114,7 +130,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
114130
except Exception:
115131
pass
116132

117-
messages = list(row.messages) + [
133+
messages = list(messages_for_request) + [
118134
Message(
119135
role="assistant",
120136
content=assistant_content,

tests/pytest/test_pytest_math_example.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
22
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
3+
from eval_protocol.data_loader import FireworksDatasetLoader
34
from eval_protocol.rewards.math import math_reward
45
from examples.math_example.main import check_think_answer_format
5-
from tests.pytest.helper.gsm8k_to_evaluation_row import gsm8k_to_evaluation_row
66

77

88
@evaluation_test(
9-
input_dataset=["development/gsm8k_sample.jsonl"],
10-
dataset_adapter=gsm8k_to_evaluation_row,
9+
data_loaders=FireworksDatasetLoader(
10+
dataset_ref="accounts/fireworks/datasets/demo-gsm8k-math-dataset-1000",
11+
id="fw-gsm8k-demo",
12+
description="Fireworks demo GSM8K 1k dataset",
13+
),
1114
completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
1215
max_dataset_rows=5,
1316
passed_threshold=0.0,
@@ -32,15 +35,34 @@ def test_math_dataset(row: EvaluationRow, **kwargs) -> EvaluationRow:
3235
Returns:
3336
EvaluationRow with the evaluation result
3437
"""
35-
# Get the assistant's response
38+
# Get the assistant's response (coerce to text)
3639
assistant_message = row.messages[-1]
3740
if isinstance(assistant_message, dict):
38-
assistant_response = assistant_message.get("content", "")
41+
content = assistant_message.get("content", "")
3942
else:
40-
assistant_response = assistant_message.content or ""
43+
content = assistant_message.content or ""
44+
45+
def _to_text(val):
46+
if isinstance(val, str):
47+
return val
48+
if isinstance(val, list):
49+
parts = []
50+
for part in val:
51+
if isinstance(part, dict):
52+
t = part.get("text") or part.get("content")
53+
if isinstance(t, str):
54+
parts.append(t)
55+
return "".join(parts)
56+
return str(val) if val is not None else ""
57+
58+
assistant_response = _to_text(content)
4159

4260
# Evaluate numerical accuracy using built-in function
43-
accuracy_result = math_reward(messages=row.messages, ground_truth=row.ground_truth, **kwargs["math_reward_kwargs"])
61+
accuracy_result = math_reward(
62+
messages=row.messages,
63+
ground_truth=str(row.ground_truth) if row.ground_truth is not None else "",
64+
**kwargs["math_reward_kwargs"],
65+
)
4466

4567
# Evaluate format compliance (looking for <think>...</think><answer>...</answer> format)
4668
format_correct = check_think_answer_format(assistant_response)

0 commit comments

Comments
 (0)