eval-protocol
diff --git a/‎docs/developer_guide/tracing_integration_guide.mdx‎
Lines changed: 141 additions & 0 deletions b/‎docs/developer_guide/tracing_integration_guide.mdx‎
Lines changed: 141 additions & 0 deletions
diff --git a/‎eval_protocol/adapters/__init__.py‎
Lines changed: 7 additions & 0 deletions b/‎eval_protocol/adapters/__init__.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎eval_protocol/adapters/weave.py‎
Lines changed: 278 additions & 0 deletions b/‎eval_protocol/adapters/weave.py‎
Lines changed: 278 additions & 0 deletions
@@ -87,3 +87,10 @@
     __all__.extend(["LangSmithAdapter"])
 except ImportError:
     pass
+
+try:
+    from .weave import WeaveAdapter, create_weave_adapter
+
+    __all__.extend(["WeaveAdapter", "create_weave_adapter"])
+except ImportError:
+    pass
@@ -0,0 +1,278 @@
+"""Weave adapter for Eval Protocol.
+
+This adapter pulls traces from Weights & Biases Weave Service API and converts
+them to EvaluationRow format for use in evaluation pipelines.
+
+References:
+- Guides: https://weave-docs.wandb.ai/guides/integrations/litellm/
+- Service API: https://weave-docs.wandb.ai/reference/gen_notebooks/weave_via_service_api/
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from typing import Any, Dict, List, Optional, Protocol
+
+import requests
+
+from eval_protocol.models import EvaluationRow, InputMetadata, Message
+from .base import BaseAdapter
+from .utils import extract_messages_from_data
+
+
+logger = logging.getLogger(__name__)
+
+
+class TraceConverter(Protocol):
+    """Callable protocol for user-supplied Weave trace converters.
+
+    An implementation receives one Weave trace dict together with the
+    processing options and produces an EvaluationRow, or None when the trace
+    should be skipped.
+    """
+
+    def __call__(self, trace: Dict[str, Any], include_tool_calls: bool) -> Optional[EvaluationRow]:
+        """Turn a single Weave trace into an EvaluationRow.
+
+        Args:
+            trace: Weave trace object, as returned by the Service API
+            include_tool_calls: Whether tool calling information should be included
+
+        Returns:
+            The converted EvaluationRow, or None to skip this trace
+        """
+        raise NotImplementedError
+
+
+def _extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool = True) -> List[Message]:
+    """Extract messages from Weave trace inputs/outputs.
+
+    Weave Service API typically returns a root call document with fields like:
+      - id, project_id, op_name, inputs, output, summary, ...
+
+    We handle common payload shapes:
+      - inputs: { messages: [...] } | { prompt } | list[dict] | str
+      - output: { messages: [...] } | { content } | { result } | { choices: [{message: {...}}] } | list[dict] | str
+
+    Args:
+        trace: Weave trace dict; ``inputs``/``input`` and ``output``/``outputs`` are read
+        include_tool_calls: Forwarded to ``extract_messages_from_data``
+
+    Returns:
+        Messages taken from the input followed by messages taken from the output.
+        KeyError/TypeError/ValueError raised during extraction are logged as a
+        warning and whatever was collected up to that point is returned.
+    """
+    messages: List[Message] = []
+
+    try:
+        inp = trace.get("inputs") or trace.get("input")
+        out = trace.get("output") or trace.get("outputs")
+
+        if inp is not None:
+            messages.extend(extract_messages_from_data(inp, include_tool_calls))
+
+        if out is not None:
+            # Prefer explicit messages array (preserves tool_calls and tool role messages)
+            if isinstance(out, dict) and isinstance(out.get("messages"), list):
+                messages.extend(extract_messages_from_data({"messages": out["messages"]}, include_tool_calls))
+            # Otherwise, support OpenAI-style choices; only the first choice is used
+            elif isinstance(out, dict) and isinstance(out.get("choices"), list) and out["choices"]:
+                choice0 = out["choices"][0]
+                choice_msg = None
+                if isinstance(choice0, dict):
+                    # Streaming-style payloads carry "delta" instead of "message"
+                    choice_msg = choice0.get("message") or choice0.get("delta")
+                if isinstance(choice_msg, dict):
+                    if "role" not in choice_msg:
+                        # Copy rather than mutate the caller's trace dict
+                        choice_msg = {**choice_msg, "role": "assistant"}
+                    messages.append(Message.model_validate(choice_msg))
+                elif isinstance(choice_msg, str):
+                    # A bare string message is treated as the assistant content.
+                    messages.append(Message(role="assistant", content=choice_msg))
+                # Any other shape (None, list, number, ...) has no usable content;
+                # skip it instead of calling .get() on a non-dict value.
+            else:
+                # Generic extraction
+                messages.extend(extract_messages_from_data(out, include_tool_calls))
+    except (KeyError, TypeError, ValueError) as e:
+        logger.warning("Failed to extract Weave messages: %s", e)
+
+    return messages
+
+
+def convert_trace_to_evaluation_row(trace: Dict[str, Any], include_tool_calls: bool = True) -> Optional[EvaluationRow]:
+    """Convert a Weave trace dict to EvaluationRow format.
+
+    Args:
+        trace: Weave trace object as returned by Service API
+        include_tool_calls: Whether to include tool calling information
+
+    Returns:
+        EvaluationRow, or None if the trace yields no messages or conversion fails.
+        The row's session_data records the trace ``id`` and ``project_id`` as
+        strings (empty string when the field is absent or null).
+    """
+    try:
+        messages = _extract_messages_from_trace(trace, include_tool_calls)
+        if not messages:
+            return None
+
+        tools = None
+        if include_tool_calls:
+            # Prefer tool schema from inputs.tools when present. Accept the same
+            # "inputs"/"input" spellings that message extraction accepts.
+            inputs_obj = trace.get("inputs") or trace.get("input") or {}
+            if isinstance(inputs_obj, dict):
+                tools = inputs_obj.get("tools")
+
+        # A JSON null must not be stringified to the literal "None".
+        raw_project_id = trace.get("project_id")
+        raw_trace_id = trace.get("id")
+        project_id = "" if raw_project_id is None else str(raw_project_id)
+        weave_trace_id = "" if raw_trace_id is None else str(raw_trace_id)
+
+        return EvaluationRow(
+            messages=messages,
+            tools=tools,
+            input_metadata=InputMetadata(
+                session_data={
+                    "weave_trace_id": weave_trace_id,
+                    "weave_project_id": project_id,
+                }
+            ),
+        )
+    except (AttributeError, KeyError, TypeError, ValueError) as e:
+        # AttributeError covers payloads whose nested values are not the expected
+        # dicts, so one malformed trace is reported and skipped rather than raised.
+        trace_id = trace.get("id", "unknown") if isinstance(trace, dict) else "unknown"
+        logger.error("Error converting Weave trace %s: %s", trace_id, e)
+        return None
+
+
+class WeaveAdapter(BaseAdapter):
+    """Adapter to pull Weave traces and convert to EvaluationRow format.
+
+    Configuration is sourced from parameters or environment variables:
+      - team_id: defaults to WEAVE_TEAM_ID, then WANDB_ENTITY
+      - project_id: defaults to WEAVE_PROJECT_ID, then WANDB_PROJECT
+      - api_token: defaults to WANDB_API_KEY
+      - base_url: defaults to WEAVE_TRACE_BASE_URL or 'https://trace.wandb.ai'
+    """
+
+    def __init__(
+        self,
+        *,
+        team_id: Optional[str] = None,
+        project_id: Optional[str] = None,
+        api_token: Optional[str] = None,
+        base_url: Optional[str] = None,
+    ) -> None:
+        """Resolve configuration from arguments, falling back to the environment.
+
+        Raises:
+            ValueError: If no API token, or no team/project, can be resolved.
+        """
+        self.team_id = team_id or os.getenv("WEAVE_TEAM_ID") or os.getenv("WANDB_ENTITY")
+        self.project_id = project_id or os.getenv("WEAVE_PROJECT_ID") or os.getenv("WANDB_PROJECT")
+        self.api_token = api_token or os.getenv("WANDB_API_KEY")
+        # Chain with `or` so an empty WEAVE_TRACE_BASE_URL still falls back to the default.
+        self.base_url = base_url or os.getenv("WEAVE_TRACE_BASE_URL") or "https://trace.wandb.ai"
+
+        if not self.api_token:
+            raise ValueError("WANDB_API_KEY environment variable or api_token parameter required")
+        if not self.team_id or not self.project_id:
+            raise ValueError(
+                "Weave project not configured. Provide team_id/project_id or set WANDB_ENTITY and WANDB_PROJECT"
+            )
+
+    @staticmethod
+    def _decode_traces(resp: requests.Response) -> List[Dict[str, Any]]:
+        """Decode a stream_query response body into a list of trace dicts.
+
+        The body is first parsed as a single JSON document (object or array);
+        if that fails it is parsed as newline-delimited JSON. Undecodable lines
+        and non-dict entries are dropped.
+        """
+        import json as _json
+
+        try:
+            data = resp.json()
+        except ValueError:
+            # Fallback decode for newline-delimited JSON
+            candidates: List[Any] = []
+            for ln in resp.text.strip().split("\n"):
+                if not ln.strip():
+                    continue
+                try:
+                    candidates.append(_json.loads(ln))
+                except _json.JSONDecodeError:
+                    logger.debug("Skipping undecodable line in Weave response")
+        else:
+            if isinstance(data, dict):
+                # Some endpoints may return a single object
+                candidates = [data]
+            else:
+                candidates = list(data) if isinstance(data, list) else []
+
+        # Converters and the warning path below call .get() on each trace,
+        # so anything that is not a dict cannot be processed.
+        return [item for item in candidates if isinstance(item, dict)]
+
+    def get_evaluation_rows(self, *args: Any, **kwargs: Any) -> List[EvaluationRow]:
+        """Query Weave Service API for root traces and convert to EvaluationRow.
+
+        Positional arguments and unrecognised keyword arguments are ignored.
+
+        Args:
+            limit: Max number of results (kwarg, default 100)
+            offset: Offset into result set (kwarg, default 0)
+            include_tool_calls: Whether to include tool calling information (kwarg, default True)
+            query: Server-side expression object (e.g., {"$expr": {...}}) per Weave docs (kwarg)
+            filter_obj: Additional filter options, defaults to {"trace_roots_only": True} (kwarg)
+            sort_by: Sort directives, defaults to started_at desc (kwarg)
+            include_feedback: Whether to include feedback in results (kwarg)
+            converter: Optional custom converter implementing TraceConverter protocol (kwarg)
+
+        Returns:
+            One EvaluationRow per trace that converted to a truthy row; traces whose
+            conversion raises KeyError/TypeError/ValueError are logged and skipped.
+
+        Raises:
+            requests.HTTPError: If the Service API responds with an error status.
+        """
+        limit: int = kwargs.pop("limit", 100)
+        offset: int = kwargs.pop("offset", 0)
+        include_tool_calls: bool = kwargs.pop("include_tool_calls", True)
+        query: Optional[Dict[str, Any]] = kwargs.pop("query", None)
+        filter_obj: Optional[Dict[str, Any]] = kwargs.pop("filter_obj", None)
+        sort_by: Optional[List[Dict[str, Any]]] = kwargs.pop("sort_by", None)
+        include_feedback: bool = kwargs.pop("include_feedback", False)
+        converter: Optional[TraceConverter] = kwargs.pop("converter", None)
+
+        # ignore remaining kwargs to remain forward compatible
+        # Strip a trailing slash so a base_url like "https://host/" does not yield "//calls/...".
+        url_stream_query = f"{self.base_url.rstrip('/')}/calls/stream_query"
+
+        payload: Dict[str, Any] = {
+            "project_id": f"{self.team_id}/{self.project_id}",
+            "filter": filter_obj if filter_obj is not None else {"trace_roots_only": True},
+            "limit": limit,
+            "offset": offset,
+            "sort_by": sort_by or [{"field": "started_at", "direction": "desc"}],
+            "include_feedback": include_feedback,
+        }
+        if query is not None:
+            payload["query"] = query
+
+        headers = {"Content-Type": "application/json"}
+
+        resp = requests.post(
+            url_stream_query, headers=headers, json=payload, auth=("api", self.api_token), timeout=30
+        )
+        resp.raise_for_status()
+
+        # The API may return either a JSON array/object or newline-delimited JSON (stream)
+        traces = self._decode_traces(resp)
+
+        convert = converter or convert_trace_to_evaluation_row
+        rows: List[EvaluationRow] = []
+        for tr in traces:
+            try:
+                eval_row = convert(tr, include_tool_calls)
+                if eval_row:
+                    rows.append(eval_row)
+            except (KeyError, TypeError, ValueError) as e:
+                logger.warning("Failed to convert Weave trace %s: %s", tr.get("id", "unknown"), e)
+
+        return rows
+
+    def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:  # noqa: D401
+        """No-op: Weave Service API does not expose a score feedback endpoint yet.
+
+        If/when Weave exposes an official feedback API for traces, this method can
+        be implemented to push evaluation results back to the provider.
+        """
+        logger.info("Weave upload_scores not implemented: no public feedback API available")
+
+    def upload_score(self, row: EvaluationRow, model_name: str) -> None:  # noqa: D401
+        """No-op: only logs that score upload is not implemented for Weave."""
+        logger.info("Weave upload_score not implemented: no public feedback API available")
+
+
+def create_weave_adapter(
+    *,
+    team_id: Optional[str] = None,
+    project_id: Optional[str] = None,
+    api_token: Optional[str] = None,
+    base_url: Optional[str] = None,
+) -> WeaveAdapter:
+    """Build a WeaveAdapter, forwarding every keyword argument unchanged."""
+    adapter_kwargs = {
+        "team_id": team_id,
+        "project_id": project_id,
+        "api_token": api_token,
+        "base_url": base_url,
+    }
+    return WeaveAdapter(**adapter_kwargs)
+
+
+# Names exported by `from <this module> import *`.
+__all__ = ["WeaveAdapter", "create_weave_adapter", "convert_trace_to_evaluation_row"]
+
+