Skip to content

Commit 5d4daa6

Browse files
committed
Langsmith example
1 parent 908d14a commit 5d4daa6

File tree

9 files changed

+1095
-0
lines changed

9 files changed

+1095
-0
lines changed
Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
"""LangSmith adapter for Eval Protocol.
2+
3+
This adapter pulls runs from LangSmith and converts them to EvaluationRow format,
4+
mirroring the behavior of the Langfuse adapter.
5+
6+
It supports extracting chat messages from inputs/outputs, and optionally includes
7+
tool calls and tool messages where present.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import logging
13+
from typing import Any, Dict, List, Optional
14+
15+
from eval_protocol.models import EvaluationRow, InputMetadata, Message
16+
17+
logger = logging.getLogger(__name__)
18+
19+
try:
20+
from langsmith import Client # type: ignore
21+
22+
LANGSMITH_AVAILABLE = True
23+
except ImportError:
24+
LANGSMITH_AVAILABLE = False
25+
26+
27+
class LangSmithAdapter:
    """Adapter to pull data from LangSmith and convert to EvaluationRow format.

    By default, fetches root runs from a project and maps inputs/outputs into
    `Message` objects. It supports a variety of input/output shapes commonly
    emitted by LangChain/LangGraph integrations, including:
    - inputs: { messages: [...] } | { prompt } | { user_input } | { input } | str | list[dict]
    - outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict]
    """

    def __init__(self, client: Optional[Client] = None) -> None:
        """Initialize the adapter.

        Args:
            client: Pre-configured LangSmith ``Client``. When omitted, a default
                client is constructed (it reads the LANGSMITH_* environment vars).

        Raises:
            ImportError: If the ``langsmith`` package is not installed.
        """
        if not LANGSMITH_AVAILABLE:
            raise ImportError("LangSmith not installed. Install with: pip install langsmith")
        self.client = client or Client()

    @staticmethod
    def _canonical_text(text: Any) -> str:
        """Normalize text for duplicate detection: lowercase, collapsed whitespace."""
        try:
            return " ".join(str(text or "").strip().lower().split())
        except Exception:
            return str(text or "")

    def get_evaluation_rows(
        self,
        *,
        project_name: str,
        limit: int = 50,
        include_tool_calls: bool = True,
    ) -> List[EvaluationRow]:
        """Pull runs from LangSmith and convert to EvaluationRow format.

        Args:
            project_name: LangSmith project to read runs from
            limit: Maximum number of rows to return
            include_tool_calls: Whether to include tool calling information when present

        Returns:
            One ``EvaluationRow`` per convertible run. Runs that yield no
            messages or fail conversion are skipped (a warning is logged).
        """
        rows: List[EvaluationRow] = []

        # Prefer root runs; they usually contain messages in inputs/outputs when tracing app-level flows
        runs = list(
            self.client.list_runs(
                project_name=project_name,
                is_root=True,
                limit=limit,
                select=["id", "inputs", "outputs"],
            )
        )

        for r in runs:
            try:
                inp = getattr(r, "inputs", None)
                out = getattr(r, "outputs", None)

                ep_messages: List[Message] = []
                # Prefer canonical conversation from outputs.messages if present to avoid duplicates
                if isinstance(out, dict) and isinstance(out.get("messages"), list):
                    ep_messages.extend(
                        self._extract_messages_from_payload(
                            {"messages": out["messages"]}, include_tool_calls, is_output=True
                        )
                    )
                else:
                    # Inputs → user messages
                    ep_messages.extend(self._extract_messages_from_payload(inp, include_tool_calls))
                    # Outputs → assistant (and possible tool messages)
                    ep_messages.extend(self._extract_messages_from_payload(out, include_tool_calls, is_output=True))

                # Deduplicate consecutive identical user messages (common echo pattern).
                # The normalizer is a staticmethod so it is not re-created per run.
                deduped: List[Message] = []
                for m in ep_messages:
                    if deduped and m.role == "user" and deduped[-1].role == "user":
                        if self._canonical_text(m.content) == self._canonical_text(deduped[-1].content):
                            continue
                    deduped.append(m)
                ep_messages = deduped

                if not ep_messages:
                    continue

                rows.append(
                    EvaluationRow(
                        messages=ep_messages,
                        input_metadata=InputMetadata(
                            session_data={
                                "langsmith_run_id": str(getattr(r, "id", "")),
                                "langsmith_project": project_name,
                            }
                        ),
                    )
                )
            except Exception as e:
                # Best-effort conversion: one malformed run must not abort the batch.
                logger.warning("Failed to convert run %s: %s", getattr(r, "id", ""), e)
                continue

        return rows

    def _extract_messages_from_payload(
        self, payload: Any, include_tool_calls: bool, *, is_output: bool = False
    ) -> List[Message]:
        """Convert an arbitrary run input/output payload into ``Message`` objects.

        Args:
            payload: Run ``inputs``/``outputs`` — a dict, list, string, or None.
            include_tool_calls: Whether to carry tool-call info onto messages.
            is_output: Whether the payload came from run outputs; controls the
                fallback role ("assistant" for outputs, "user" for inputs).

        Returns:
            Extracted messages; empty for None or unrecognized payload types.
        """
        messages: List[Message] = []
        fallback_role = "assistant" if is_output else "user"

        def _dict_to_message(msg_dict: Dict[str, Any]) -> Message:
            # Role: prefer an explicit "role"; otherwise map LangChain message types.
            role = msg_dict.get("role")
            if role is None:
                msg_type = msg_dict.get("type")
                if msg_type == "human":
                    role = "user"
                elif msg_type == "ai":
                    role = "assistant"
                elif msg_type == "system":
                    # Fix: system messages were previously mislabeled as user/assistant.
                    role = "system"
                elif msg_type == "tool":
                    # Fix: tool messages were previously mislabeled as user/assistant,
                    # contradicting the module's stated tool-message support.
                    role = "tool"
                else:
                    role = fallback_role

            content = msg_dict.get("content")
            # LangChain content parts: join text parts. Plain-string parts are kept
            # as well (previously non-dict parts were silently dropped).
            if isinstance(content, list):
                text = " ".join(
                    part.get("text", "") if isinstance(part, dict) else str(part) for part in content
                )
                content = text or str(content)

            name = msg_dict.get("name")

            tool_calls = None
            tool_call_id = None
            function_call = None
            if include_tool_calls:
                if "tool_calls" in msg_dict and isinstance(msg_dict["tool_calls"], list):
                    try:
                        from openai.types.chat.chat_completion_message_tool_call import (
                            ChatCompletionMessageToolCall,
                            Function as ChatToolFunction,
                        )

                        typed_calls: List[ChatCompletionMessageToolCall] = []
                        for tc in msg_dict["tool_calls"]:
                            # Extract id/function fields from dicts or provider-native objects
                            if isinstance(tc, dict):
                                tc_id = tc.get("id", None)
                                fn = tc.get("function", {}) or {}
                                fn_name = fn.get("name", None)
                                fn_args = fn.get("arguments", None)
                            else:
                                tc_id = getattr(tc, "id", None)
                                f = getattr(tc, "function", None)
                                fn_name = getattr(f, "name", None) if f is not None else None
                                fn_args = getattr(f, "arguments", None) if f is not None else None

                            # Build typed function object (arguments must be a string per OpenAI type)
                            fn_obj = ChatToolFunction(
                                name=str(fn_name) if fn_name is not None else "",
                                arguments=str(fn_args) if fn_args is not None else "",
                            )
                            typed_calls.append(
                                ChatCompletionMessageToolCall(
                                    id=str(tc_id) if tc_id is not None else "",
                                    # OpenAI's type only permits "function" here, so any
                                    # incoming "type" field is intentionally ignored.
                                    type="function",
                                    function=fn_obj,
                                )
                            )
                        tool_calls = typed_calls
                    except Exception:
                        # If OpenAI types unavailable, leave None to satisfy type checker
                        tool_calls = None
                if "tool_call_id" in msg_dict:
                    tool_call_id = msg_dict.get("tool_call_id")
                if "function_call" in msg_dict:
                    function_call = msg_dict.get("function_call")

            return Message(
                role=str(role),
                content=str(content) if content is not None else "",
                name=name,
                tool_call_id=tool_call_id,
                tool_calls=tool_calls,
                function_call=function_call,
            )

        if isinstance(payload, dict):
            # Common patterns, checked in priority order.
            if isinstance(payload.get("messages"), list):
                for m in payload["messages"]:
                    if isinstance(m, dict):
                        messages.append(_dict_to_message(m))
                    else:
                        messages.append(Message(role=fallback_role, content=str(m)))
            elif "prompt" in payload and isinstance(payload["prompt"], str):
                messages.append(Message(role=fallback_role, content=str(payload["prompt"])))
            elif "user_input" in payload and isinstance(payload["user_input"], str):
                messages.append(Message(role=fallback_role, content=str(payload["user_input"])))
            elif "input" in payload and isinstance(payload["input"], str):
                messages.append(Message(role=fallback_role, content=str(payload["input"])))
            elif "content" in payload and isinstance(payload["content"], str):
                messages.append(Message(role="assistant", content=str(payload["content"])))
            elif "result" in payload and isinstance(payload["result"], str):
                messages.append(Message(role="assistant", content=str(payload["result"])))
            elif "answer" in payload and isinstance(payload["answer"], str):
                messages.append(Message(role="assistant", content=str(payload["answer"])))
            elif "output" in payload and isinstance(payload["output"], str):
                messages.append(Message(role="assistant", content=str(payload["output"])))
            else:
                # Fallback: stringify the whole payload rather than dropping it
                messages.append(Message(role=fallback_role, content=str(payload)))
        elif isinstance(payload, list):
            for m in payload:
                if isinstance(m, dict):
                    messages.append(_dict_to_message(m))
                else:
                    messages.append(Message(role=fallback_role, content=str(m)))
        elif isinstance(payload, str):
            messages.append(Message(role=fallback_role, content=payload))
        # None and other payload types intentionally yield no messages.

        return messages
242+
243+
244+
def create_langsmith_adapter() -> LangSmithAdapter:
    """Factory returning a ``LangSmithAdapter`` backed by a default client."""
    adapter = LangSmithAdapter()
    return adapter
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
"""
2+
LLM Judge quickstart that PULLS DATA FROM LANGSMITH and persists results locally via Eval Protocol.
3+
4+
This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses
5+
LangSmith datasets/examples as the source of evaluation rows.
6+
7+
Setup:
8+
pip install -U langsmith
9+
10+
Env vars:
11+
export LANGSMITH_API_KEY=... # required to fetch examples
12+
export LS_DATASET="ep_langsmith_demo_ds" # dataset to pull examples from
13+
14+
Judge model keys:
15+
- Default judge is "gemini-2.5-pro" from utils; requires GEMINI_API_KEY
16+
- Or set judge in the code to "gpt-4.1" and export OPENAI_API_KEY
17+
18+
Run:
19+
pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s
20+
"""
21+
22+
import os
23+
from typing import Any, Dict, List, Optional
24+
25+
import pytest
26+
27+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
28+
from eval_protocol.pytest import evaluation_test
29+
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
30+
from eval_protocol.quickstart.utils import (
31+
split_multi_turn_rows,
32+
JUDGE_CONFIGS,
33+
calculate_bootstrap_scores,
34+
run_judgment,
35+
)
36+
from eval_protocol.adapters.langsmith import LangSmithAdapter
37+
38+
39+
def fetch_langsmith_traces_as_evaluation_rows(
    project_name: Optional[str] = None,
    limit: int = 20,
) -> List[EvaluationRow]:
    """Fetch LangSmith root runs and convert to EvaluationRow, mirroring Langfuse adapter shape.

    - Extract messages from run.inputs and run.outputs
    - Append assistant message from outputs so split_multi_turn_rows can derive ground_truth
    - Store run_id in input_metadata.session_data
    """
    resolved_project = project_name if project_name else os.getenv("LS_PROJECT", "ep-langgraph-examples")
    try:
        rows = LangSmithAdapter().get_evaluation_rows(
            project_name=resolved_project,
            limit=limit,
            include_tool_calls=True,
        )
    except Exception as e:
        # Best-effort: report the failure and let the caller proceed with no rows.
        print(f"❌ LangSmithAdapter failed: {e}")
        rows = []
    return rows
56+
57+
58+
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@pytest.mark.asyncio
@evaluation_test(
    input_rows=[fetch_langsmith_traces_as_evaluation_rows()],
    completion_params=[
        {
            "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
        },
        {
            "max_tokens": 131000,
            "extra_body": {"reasoning_effort": "low"},
            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
        },
    ],
    rollout_processor=SingleTurnRolloutProcessor(),
    preprocess_fn=split_multi_turn_rows,
    mode="all",
)
async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    """LLM Judge evaluation over LangSmith-sourced rows, persisted locally by Eval Protocol.

    Mirrors quickstart/llm_judge.py, using Arena-Hard-Auto style pairwise judgment.
    """
    judge = "gemini-2.5-pro"

    # Guard: nothing to evaluate.
    if not rows:
        print("❌ No evaluation rows provided")
        return rows

    print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging (LangSmith source)...")

    evaluated_model = rows[0].input_metadata.completion_params.get("model", "unknown_model")

    # Collect only judgments where both pairwise games produced a verdict.
    valid_judgments: List[Dict[str, Any]] = [
        verdict
        for verdict in (run_judgment(row, evaluated_model, judge) for row in rows)
        if verdict and verdict["games"][0] and verdict["games"][1]
    ]

    if not valid_judgments:
        print("❌ No valid judgments generated")
        return rows

    print(f"✅ Generated {len(valid_judgments)} valid judgments")

    score_mean, ci_lower, ci_upper = calculate_bootstrap_scores(valid_judgments)
    if score_mean == 0.0:
        print("❌ No valid scores extracted")
        return rows

    print("\n##### LLM Judge Results (90th percentile CI) #####")
    short_name = evaluated_model.split("/")[-1]
    print(f"{short_name}: {score_mean:.1%} (CI: {ci_lower:.1%} - {ci_upper:.1%})")
    print("original: 50.0% (CI: 50.0% - 50.0%)")

    # Write the aggregate score back onto every row so Eval Protocol persists it.
    for row in rows:
        existing = row.evaluation_result
        if existing:
            existing.score = score_mean
            # Derive a standard error from the 90% CI half-width (z = 1.645).
            existing.standard_error = (ci_upper - ci_lower) / (2 * 1.645)
        else:
            row.evaluation_result = EvaluateResult(
                score=score_mean,
                reason="Aggregated LLM judge score",
                metrics={
                    "summary": MetricResult(score=score_mean, reason="Aggregated over judgments"),
                },
            )

    return rows

0 commit comments

Comments
 (0)