From 5d4daa66f64598bfbc79955b879ac41a95a71036 Mon Sep 17 00:00:00 2001
From: benjibc <youfychenbc5000@gmail.com>
Date: Mon, 15 Sep 2025 05:10:20 +0000
Subject: [PATCH 1/4] Langsmith example

---
 eval_protocol/adapters/langsmith.py           | 245 ++++++++++++++++++
 .../quickstart/llm_judge_langsmith.py         | 128 +++++++++
 .../langgraph/test_tools_langsmith_trace.py   |  48 ++++
 examples/langgraph/tools_graph.py             |  68 +++++
 examples/langsmith/README.md                  |  24 ++
 examples/langsmith/dump_traces_langsmith.py   | 115 ++++++++
 examples/langsmith/emit_tool_calls.py         | 116 +++++++++
 .../langsmith/llm_judge_from_langsmith.py     | 168 ++++++++++++
 tests/adapters/test_langsmith_adapter.py      | 183 +++++++++++++
 9 files changed, 1095 insertions(+)
 create mode 100644 eval_protocol/adapters/langsmith.py
 create mode 100644 eval_protocol/quickstart/llm_judge_langsmith.py
 create mode 100644 examples/langgraph/test_tools_langsmith_trace.py
 create mode 100644 examples/langgraph/tools_graph.py
 create mode 100644 examples/langsmith/README.md
 create mode 100644 examples/langsmith/dump_traces_langsmith.py
 create mode 100644 examples/langsmith/emit_tool_calls.py
 create mode 100644 examples/langsmith/llm_judge_from_langsmith.py
 create mode 100644 tests/adapters/test_langsmith_adapter.py

diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py
new file mode 100644
index 00000000..7556e1d9
--- /dev/null
+++ b/eval_protocol/adapters/langsmith.py
@@ -0,0 +1,245 @@
+"""LangSmith adapter for Eval Protocol.
+
+This adapter pulls runs from LangSmith and converts them to EvaluationRow format,
+mirroring the behavior of the Langfuse adapter.
+
+It supports extracting chat messages from inputs/outputs, and optionally includes
+tool calls and tool messages where present.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from eval_protocol.models import EvaluationRow, InputMetadata, Message
+
+logger = logging.getLogger(__name__)
+
+try:
+    from langsmith import Client  # type: ignore
+
+    LANGSMITH_AVAILABLE = True
+except ImportError:
+    LANGSMITH_AVAILABLE = False
+
+
+class LangSmithAdapter:
+    """Adapter to pull data from LangSmith and convert to EvaluationRow format.
+
+    By default, fetches root runs from a project and maps inputs/outputs into
+    `Message` objects. It supports a variety of input/output shapes commonly
+    emitted by LangChain/LangGraph integrations, including:
+    - inputs: { messages: [...] } | { prompt } | { user_input } | { input } | str | list[dict]
+    - outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict]
+
+    Message dicts may carry an explicit `role` or a LangChain `type` (human, ai,
+    system, tool). Tool calls are accepted in the OpenAI shape
+    ({id, function: {name, arguments}}) and the LangChain shape ({id, name, args}).
+    """
+
+    # LangChain message `type` -> chat role, used when a message dict has no explicit `role`.
+    _LC_TYPE_TO_ROLE: Dict[str, str] = {"human": "user", "ai": "assistant", "system": "system", "tool": "tool"}
+
+    def __init__(self, client: Optional[Client] = None) -> None:
+        """Use `client` when given, else a default `Client()`; raise ImportError if langsmith is missing."""
+        if not LANGSMITH_AVAILABLE:
+            raise ImportError("LangSmith not installed. Install with: pip install langsmith")
+        self.client = client or Client()
+
+    def get_evaluation_rows(
+        self,
+        *,
+        project_name: str,
+        limit: int = 50,
+        include_tool_calls: bool = True,
+    ) -> List[EvaluationRow]:
+        """Pull runs from LangSmith and convert to EvaluationRow format.
+
+        Args:
+            project_name: LangSmith project to read runs from
+            limit: Maximum number of runs to fetch; runs that yield no messages or
+                fail to convert are skipped, so fewer rows may be returned
+            include_tool_calls: Whether to include tool calling information when present
+        """
+        rows: List[EvaluationRow] = []
+
+        # Prefer root runs; they usually contain messages in inputs/outputs when tracing app-level flows
+        runs = list(
+            self.client.list_runs(
+                project_name=project_name,
+                is_root=True,
+                limit=limit,
+                select=["id", "inputs", "outputs"],
+            )
+        )
+
+        for r in runs:
+            run_id = getattr(r, "id", "")
+            try:
+                ep_messages = self._dedupe_user_echo(self._run_to_messages(r, include_tool_calls))
+                if not ep_messages:
+                    continue
+                rows.append(
+                    EvaluationRow(
+                        messages=ep_messages,
+                        input_metadata=InputMetadata(
+                            session_data={
+                                "langsmith_run_id": str(run_id),
+                                "langsmith_project": project_name,
+                            }
+                        ),
+                    )
+                )
+            except Exception as e:
+                # Best effort: one malformed run must not abort the whole pull
+                logger.warning("Failed to convert run %s: %s", run_id, e)
+
+        return rows
+
+    def _run_to_messages(self, run: Any, include_tool_calls: bool) -> List[Message]:
+        """Build the message list for one run from its inputs and outputs."""
+        inp = getattr(run, "inputs", None)
+        out = getattr(run, "outputs", None)
+
+        # Prefer canonical conversation from outputs.messages if present to avoid duplicates
+        if isinstance(out, dict) and isinstance(out.get("messages"), list):
+            return self._extract_messages_from_payload(
+                {"messages": out["messages"]}, include_tool_calls, is_output=True
+            )
+        # Inputs -> user messages, then outputs -> assistant (and possible tool messages)
+        messages = self._extract_messages_from_payload(inp, include_tool_calls)
+        messages.extend(self._extract_messages_from_payload(out, include_tool_calls, is_output=True))
+        return messages
+
+    @staticmethod
+    def _dedupe_user_echo(messages: List[Message]) -> List[Message]:
+        """Drop a user message that repeats the previous one, ignoring case and whitespace."""
+
+        def _canon(text: Any) -> str:
+            return " ".join(str(text or "").strip().lower().split())
+
+        deduped: List[Message] = []
+        for m in messages:
+            if deduped and m.role == "user" and deduped[-1].role == "user":
+                if _canon(m.content) == _canon(deduped[-1].content):
+                    continue
+            deduped.append(m)
+        return deduped
+
+    @staticmethod
+    def _convert_tool_calls(raw_calls: List[Any]) -> Optional[List[Any]]:
+        """Convert raw tool-call entries into OpenAI `ChatCompletionMessageToolCall` objects.
+
+        Returns None, after a debug log, when the OpenAI types cannot be imported
+        or an entry cannot be converted.
+        """
+        try:
+            import json
+
+            from openai.types.chat.chat_completion_message_tool_call import (
+                ChatCompletionMessageToolCall,
+                Function as ChatToolFunction,
+            )
+
+            typed_calls: List[ChatCompletionMessageToolCall] = []
+            for tc in raw_calls:
+                # Extract id/function fields from dicts or provider-native objects
+                if isinstance(tc, dict):
+                    tc_id = tc.get("id")
+                    fn = tc.get("function") or {}
+                    # LangChain-style calls keep name/args at the top level of the dict
+                    fn_name = fn.get("name", tc.get("name"))
+                    fn_args = fn.get("arguments", tc.get("args"))
+                else:
+                    tc_id = getattr(tc, "id", None)
+                    f = getattr(tc, "function", None)
+                    fn_name = getattr(f, "name", None) if f is not None else None
+                    fn_args = getattr(f, "arguments", None) if f is not None else None
+
+                # Arguments must be a string per the OpenAI type. JSON-encode structured arguments so
+                # they stay parseable; `default=str` keeps a str() fallback for non-JSON values.
+                if fn_args is None:
+                    fn_args = ""
+                elif not isinstance(fn_args, str):
+                    fn_args = json.dumps(fn_args, default=str)
+
+                typed_calls.append(
+                    ChatCompletionMessageToolCall(
+                        id=str(tc_id) if tc_id is not None else "",
+                        type="function",
+                        function=ChatToolFunction(
+                            name=str(fn_name) if fn_name is not None else "",
+                            arguments=fn_args,
+                        ),
+                    )
+                )
+            return typed_calls
+        except Exception as e:
+            logger.debug("Dropping tool calls that could not be converted: %s", e)
+            return None
+
+    def _dict_to_message(self, msg_dict: Dict[str, Any], include_tool_calls: bool, is_output: bool) -> Message:
+        """Convert one message dict into a `Message`, resolving role, content and tool fields."""
+        role = msg_dict.get("role")
+        if role is None:
+            # Map LangChain types to roles if available; otherwise infer from the payload direction
+            msg_type = msg_dict.get("type")
+            fallback = "assistant" if is_output else "user"
+            role = self._LC_TYPE_TO_ROLE.get(msg_type, fallback) if isinstance(msg_type, str) else fallback
+
+        content = msg_dict.get("content")
+        # LangChain content parts: join their text, or stringify the list when none carry text
+        if isinstance(content, list):
+            text = " ".join(str(part.get("text") or "") for part in content if isinstance(part, dict))
+            content = text or str(content)
+
+        tool_calls = None
+        tool_call_id = None
+        function_call = None
+        if include_tool_calls:
+            if isinstance(msg_dict.get("tool_calls"), list):
+                tool_calls = self._convert_tool_calls(msg_dict["tool_calls"])
+            tool_call_id = msg_dict.get("tool_call_id")
+            function_call = msg_dict.get("function_call")
+
+        return Message(
+            role=str(role),
+            content=str(content) if content is not None else "",
+            name=msg_dict.get("name"),
+            tool_call_id=tool_call_id,
+            tool_calls=tool_calls,
+            function_call=function_call,
+        )
+
+    def _extract_messages_from_payload(
+        self, payload: Any, include_tool_calls: bool, *, is_output: bool = False
+    ) -> List[Message]:
+        """Extract messages from a run `inputs`/`outputs` payload (dict, list or str); other types give []."""
+        default_role = "assistant" if is_output else "user"
+        if isinstance(payload, str):
+            return [Message(role=default_role, content=payload)]
+        if isinstance(payload, dict):
+            if not isinstance(payload.get("messages"), list):
+                # Single-text payloads: the first matching key wins. Prompt-like keys follow
+                # the payload direction; answer-like keys always map to the assistant.
+                directional = ("prompt", "user_input", "input")
+                for key in directional + ("content", "result", "answer", "output"):
+                    if isinstance(payload.get(key), str):
+                        role = default_role if key in directional else "assistant"
+                        return [Message(role=role, content=payload[key])]
+                # Fallback: stringify
+                return [Message(role=default_role, content=str(payload))]
+            payload = payload["messages"]
+        if not isinstance(payload, list):
+            return []
+        return [
+            self._dict_to_message(m, include_tool_calls, is_output)
+            if isinstance(m, dict)
+            else Message(role=default_role, content=str(m))
+            for m in payload
+        ]
+
+
+def create_langsmith_adapter() -> LangSmithAdapter:  # convenience factory: adapter with the default Client
+    return LangSmithAdapter()
diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py
new file mode 100644
index 00000000..f4efb7f5
--- /dev/null
+++ b/eval_protocol/quickstart/llm_judge_langsmith.py
@@ -0,0 +1,128 @@
+"""
+LLM Judge quickstart that PULLS DATA FROM LANGSMITH and persists results locally via Eval Protocol.
+
+This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses
+root runs (traces) from a LangSmith project as the source of evaluation rows.
+
+Setup:
+  pip install -U langsmith
+
+Env vars:
+  export LANGSMITH_API_KEY=...               # required to fetch runs
+  export LS_PROJECT="ep-langgraph-examples"  # project to pull runs from (this value is the default)
+
+Judge model keys:
+  - Default judge is "gemini-2.5-pro" from utils; requires GEMINI_API_KEY
+  - Or set judge in the code to "gpt-4.1" and export OPENAI_API_KEY
+
+Run:
+  pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s
+"""
+
+import os
+from typing import Any, Dict, List, Optional
+
+import pytest
+
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
+from eval_protocol.pytest import evaluation_test
+from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
+from eval_protocol.quickstart.utils import (
+    split_multi_turn_rows,
+    JUDGE_CONFIGS,
+    calculate_bootstrap_scores,
+    run_judgment,
+)
+from eval_protocol.adapters.langsmith import LangSmithAdapter
+
+
+def fetch_langsmith_traces_as_evaluation_rows(
+    project_name: Optional[str] = None,
+    limit: int = 20,
+) -> List[EvaluationRow]:
+    """Return EvaluationRows built from LangSmith runs via `LangSmithAdapter`.
+
+    The project is `project_name` when given, else the LS_PROJECT env var, else
+    "ep-langgraph-examples". Tool-call information is requested from the adapter.
+    Any failure (building the adapter or fetching) is printed and yields [].
+    """
+    project = os.getenv("LS_PROJECT", "ep-langgraph-examples") if not project_name else project_name
+    try:
+        rows = LangSmithAdapter().get_evaluation_rows(project_name=project, limit=limit, include_tool_calls=True)
+    except Exception as e:
+        print(f"❌ LangSmithAdapter failed: {e}")
+        rows = []
+    return rows
+
+
+@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
+@pytest.mark.asyncio
+@evaluation_test(
+    input_rows=[fetch_langsmith_traces_as_evaluation_rows()],
+    completion_params=[
+        {
+            "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
+        },
+        {
+            "max_tokens": 131000,
+            "extra_body": {"reasoning_effort": "low"},
+            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+        },
+    ],
+    rollout_processor=SingleTurnRolloutProcessor(),
+    preprocess_fn=split_multi_turn_rows,
+    mode="all",
+)
+async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[EvaluationRow]:
+    """LLM Judge evaluation over LangSmith-sourced rows, persisted locally by Eval Protocol.
+
+    Mirrors quickstart/llm_judge.py, using Arena-Hard-Auto style pairwise judgment.
+    Every row gets the same aggregated score and standard error; rows pass through untouched on early exit.
+    """
+
+    judge_name = "gemini-2.5-pro"
+
+    if not rows:
+        print("❌ No evaluation rows provided")
+        return rows
+
+    print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging (LangSmith source)...")
+
+    model_name = rows[0].input_metadata.completion_params.get("model", "unknown_model")
+
+    judgments: List[Dict[str, Any]] = []
+
+    for row in rows:
+        result = run_judgment(row, model_name, judge_name)
+        if result and result["games"][0] and result["games"][1]:
+            judgments.append(result)
+
+    if not judgments:
+        print("❌ No valid judgments generated")
+        return rows
+
+    print(f"✅ Generated {len(judgments)} valid judgments")
+
+    mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
+    if mean_score == 0.0:
+        print("❌ No valid scores extracted")
+        return rows
+
+    print("\n##### LLM Judge Results (90th percentile CI) #####")
+    clean_model_name = model_name.split("/")[-1]
+    print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
+    print("original: 50.0% (CI: 50.0% - 50.0%)")
+
+    # Interval half-width / 1.645; presumably the bounds are a 90% interval (z = 1.645), per the header above
+    standard_error = (upper_score - lower_score) / (2 * 1.645)
+    for row in rows:
+        if not row.evaluation_result:
+            row.evaluation_result = EvaluateResult(
+                score=mean_score,
+                reason="Aggregated LLM judge score",
+                metrics={"summary": MetricResult(score=mean_score, reason="Aggregated over judgments")},
+            )
+        row.evaluation_result.score = mean_score
+        row.evaluation_result.standard_error = standard_error
+
+    return rows
diff --git a/examples/langgraph/test_tools_langsmith_trace.py b/examples/langgraph/test_tools_langsmith_trace.py
new file mode 100644
index 00000000..064b639b
--- /dev/null
+++ b/examples/langgraph/test_tools_langsmith_trace.py
@@ -0,0 +1,48 @@
+import os
+import pytest
+
+
+@pytest.mark.skipif(os.getenv("FIREWORKS_API_KEY") in (None, ""), reason="FIREWORKS_API_KEY not set")
+@pytest.mark.asyncio
+async def test_tools_graph_traced_to_langsmith() -> None:
+    """Run the tools graph inside a @traceable function that returns a fixed tool-call transcript."""
+    from langsmith import traceable
+    from .tools_graph import build_tools_graph
+    from langchain_core.messages import HumanMessage
+
+    os.environ.setdefault("LANGSMITH_TRACING", "true")
+    os.environ.setdefault("LANGCHAIN_PROJECT", os.getenv("LS_PROJECT", "ep-langgraph-examples"))
+
+    app = build_tools_graph()
+
+    @traceable
+    async def run_once(prompt: str) -> dict:
+        # Run the graph once; its result is discarded, only the traced call matters here
+        _ = await app.ainvoke({"messages": [HumanMessage(content=prompt)]})
+        # Return a ChatML-like transcript including a tool response so LangSmith records role=tool
+        tool_args = '{"a":2,"b":3}'
+        return {
+            "messages": [
+                {"role": "user", "content": prompt},
+                {
+                    "role": "assistant",
+                    "content": "Tool Calls:\ncalculator_add\n" + tool_args,
+                    "tool_calls": [
+                        {
+                            "id": "call_1",
+                            "type": "function",
+                            "function": {"name": "calculator_add", "arguments": tool_args},
+                        }
+                    ],
+                },
+                {
+                    "role": "tool",
+                    "name": "calculator_add",
+                    "tool_call_id": "call_1",
+                    "content": "5",
+                },
+                {"role": "assistant", "content": "The result is 5."},
+            ]
+        }
+
+    await run_once("Use calculator_add to add 2 and 3")
diff --git a/examples/langgraph/tools_graph.py b/examples/langgraph/tools_graph.py
new file mode 100644
index 00000000..523e613f
--- /dev/null
+++ b/examples/langgraph/tools_graph.py
@@ -0,0 +1,68 @@
+from typing import Any, Dict, List
+from typing_extensions import TypedDict, Annotated
+
+
+def build_tools_graph() -> Any:
+    """Build a single-node LangGraph that calls a tool-enabled chat model.
+
+    The node invokes the LLM once; when the reply requests a tool call, a
+    `calculator_add` result is synthesized and appended as a ToolMessage.
+    LangGraph/LangChain are imported here, so importing this module does not need them.
+    """
+    from langgraph.graph import StateGraph, END
+    from langgraph.graph.message import add_messages
+    from langchain_core.messages import BaseMessage, ToolMessage
+    from langchain.chat_models import init_chat_model
+
+    class State(TypedDict):
+        messages: Annotated[List[BaseMessage], add_messages]
+
+    # Use fireworks provider; expects FIREWORKS_API_KEY
+    llm = init_chat_model(
+        "accounts/fireworks/models/kimi-k2-instruct",
+        model_provider="fireworks",
+        temperature=0.0,
+        tools=[
+            {
+                "type": "function",
+                "function": {
+                    "name": "calculator_add",
+                    "description": "Add two integers",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "a": {"type": "integer"},
+                            "b": {"type": "integer"},
+                        },
+                        "required": ["a", "b"],
+                    },
+                },
+            }
+        ],
+    )
+
+    async def tool_router(state: State, **_: Any) -> Dict[str, Any]:
+        """Call the LLM; when it requests a tool, also return a synthesized tool result message."""
+        msgs: List[BaseMessage] = state.get("messages", [])
+        resp = await llm.ainvoke(msgs)
+        tcs = getattr(resp, "tool_calls", None)
+        # LangChain exposes tool calls as dicts with `name`, `args` (already parsed) and `id`
+        if not tcs or not isinstance(tcs[0], dict):
+            return {"messages": [resp]}
+
+        call = tcs[0]
+        try:
+            args = call.get("args") or {}
+            a, b = int(args.get("a", 0)), int(args.get("b", 0))
+        except (AttributeError, TypeError, ValueError):
+            # naive parse for demo: malformed arguments fall back to zeros
+            a, b = 0, 0
+        # Tool call requested: synthesize the tool result message
+        tool_msg = ToolMessage(content=str(a + b), tool_call_id=call.get("id") or "", name=call.get("name"))
+        return {"messages": [resp, tool_msg]}
+
+    g = StateGraph(State)
+    g.add_node("tool_router", tool_router)
+    g.set_entry_point("tool_router")
+    g.add_edge("tool_router", END)
+    return g.compile()
diff --git a/examples/langsmith/README.md b/examples/langsmith/README.md
new file mode 100644
index 00000000..079cd874
--- /dev/null
+++ b/examples/langsmith/README.md
@@ -0,0 +1,24 @@
+# LangSmith Bootstrap Scripts
+
+These scripts are ONLY for dumping synthetic traces into LangSmith to exercise the adapter and quickstart examples.
+
+- `dump_traces_langsmith.py`: emits simple @traceable runs and an optional mini LangGraph echo flow.
+- `emit_tool_calls.py`: emits runs that include assistant tool calls and a tool response message.
+
+Usage:
+1) Set your API key:
+
+```bash
+export LANGSMITH_API_KEY=...
+export LANGSMITH_TRACING=true
+export LS_PROJECT=ep-langgraph-examples
+```
+
+2) Run emitters:
+
+```bash
+python examples/langsmith/dump_traces_langsmith.py
+python examples/langsmith/emit_tool_calls.py
+```
+
+These are not production examples; they exist to seed LangSmith with traces that the adapter can consume.
diff --git a/examples/langsmith/dump_traces_langsmith.py b/examples/langsmith/dump_traces_langsmith.py
new file mode 100644
index 00000000..68bca4f6
--- /dev/null
+++ b/examples/langsmith/dump_traces_langsmith.py
@@ -0,0 +1,115 @@
+"""Quick script to send a few throwaway traces to LangSmith.
+
+Usage:
+  export LANGSMITH_API_KEY=...  # required
+  export LANGSMITH_TRACING=true  # recommended
+  python python-sdk/examples/langsmith/dump_traces_langsmith.py
+
+Notes:
+- This does not require any external model keys. It logs a few synthetic
+  traced function calls, and optionally a tiny LangGraph flow if available.
+"""
+
+import asyncio
+import os
+from typing import Any, Dict, List
+import importlib
+
+
+def _ensure_env_defaults() -> None:
+    """Enable tracing and choose a project unless the environment already sets them."""
+    os.environ.setdefault("LANGSMITH_TRACING", "true")
+    # Project name helps organize traces in the LangSmith UI. Honor LS_PROJECT (the
+    # variable the README tells users to export) before the built-in default.
+    os.environ.setdefault("LANGCHAIN_PROJECT", os.getenv("LS_PROJECT", "ep-langgraph-examples"))
+
+
+def _log_synthetic_traces() -> None:
+    """Emit three synthetic `toy_pipeline` runs through langsmith's `@traceable`.
+
+    Prints a hint and returns without tracing when `langsmith` (or its `traceable`) is unavailable.
+    """
+    try:
+        traceable = getattr(importlib.import_module("langsmith"), "traceable", None)
+    except ImportError:
+        traceable = None
+    if traceable is None:
+        print("LangSmith not installed; skipping @traceable demo. `pip install langsmith`.")
+        return
+
+    @traceable(name="toy_pipeline")
+    def toy_pipeline(user_input: str) -> Dict[str, Any]:
+        shouted = user_input[::-1].upper()
+        return {"result": shouted, "len": len(shouted)}
+
+    print("Emitting synthetic traces via @traceable...")
+    for text in ("hello langsmith", "trace number two", "final short run"):
+        toy_pipeline(text)
+
+
+async def _maybe_run_tiny_langgraph() -> None:
+    """Optionally run a tiny LangGraph flow to log a couple of runs.
+
+    This avoids any external LLM providers by using a pure-Python echo node.
+    Prints a hint and returns when LangGraph/LangChain cannot be imported.
+    """
+    try:
+        graph_mod = importlib.import_module("langgraph.graph")
+        msg_mod = importlib.import_module("langgraph.graph.message")
+        lc_msgs = importlib.import_module("langchain_core.messages")
+        te_mod = importlib.import_module("typing_extensions")
+    except ImportError:
+        print("LangGraph/LangChain not installed; skipping tiny graph demo. `pip install langgraph langchain-core`.")
+        return
+
+    # Pull the needed names off the dynamically imported modules
+    END, StateGraph = graph_mod.END, graph_mod.StateGraph
+    add_messages = msg_mod.add_messages
+    AIMessage = lc_msgs.AIMessage
+    BaseMessage = lc_msgs.BaseMessage
+    HumanMessage = lc_msgs.HumanMessage
+    Annotated, TypedDict = te_mod.Annotated, te_mod.TypedDict
+
+    class State(TypedDict):  # type: ignore[misc]
+        messages: Annotated[List[BaseMessage], add_messages]  # type: ignore[index]
+
+    async def echo_node(state: State, **_: Any) -> Dict[str, Any]:
+        # Echo the most recent human message, or "" when the state holds none
+        history: List[BaseMessage] = state.get("messages", [])
+        last_user = next((m for m in reversed(history) if isinstance(m, HumanMessage)), None)
+        content = getattr(last_user, "content", "")
+        return {"messages": [AIMessage(content=f"Echo: {content}")]}
+
+    graph = StateGraph(State)
+    graph.add_node("echo", echo_node)
+    graph.set_entry_point("echo")
+    graph.add_edge("echo", END)
+    app = graph.compile()
+
+    print("Emitting a couple LangGraph runs...")
+    for prompt in ("hi there", "how are you?"):
+        await app.ainvoke({"messages": [HumanMessage(content=prompt)]})
+
+
+def main() -> None:
+    """Emit the synthetic traces and the optional LangGraph runs; needs a LangSmith API key."""
+    _ensure_env_defaults()
+    if not os.getenv("LANGSMITH_API_KEY") and not os.getenv("LANGCHAIN_API_KEY"):
+        print("Missing LangSmith API key. Set LANGSMITH_API_KEY (or LANGCHAIN_API_KEY) and rerun.")
+        return
+
+    _log_synthetic_traces()
+    try:
+        asyncio.get_running_loop()
+    except RuntimeError:  # no loop running: normal script execution
+        asyncio.run(_maybe_run_tiny_langgraph())
+    else:  # already inside a loop (e.g. notebooks), where asyncio.run() raises: use a worker thread
+        from concurrent.futures import ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=1) as pool:
+            pool.submit(asyncio.run, _maybe_run_tiny_langgraph()).result()
+
+    print("Done. Visit LangSmith to see your new traces.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/langsmith/emit_tool_calls.py b/examples/langsmith/emit_tool_calls.py
new file mode 100644
index 00000000..5cc474dc
--- /dev/null
+++ b/examples/langsmith/emit_tool_calls.py
@@ -0,0 +1,116 @@
+"""Emit a few tool-call traces into LangSmith for adapter testing.
+
+Requirements:
+  export LANGSMITH_API_KEY=...
+  optional: export LANGCHAIN_PROJECT=ep-langgraph-examples (or LS_PROJECT; this value is the default)
+
+Run:
+  python python-sdk/examples/langsmith/emit_tool_calls.py
+"""
+
+import os
+from typing import Any, Dict, List
+
+
+def make_messages_with_tool_call(user_text: str) -> Dict[str, Any]:
+    """Return inputs/outputs shaped like LangChain messages with tool calls.
+
+    `inputs` holds the single user message. `outputs` holds the full transcript:
+    that same user message, an assistant message carrying one `calculator.add`
+    tool call, the tool's reply, and the final assistant answer.
+    """
+    call_id = "call_1"
+    tool_name = "calculator.add"
+
+    user_message = {
+        "role": "user",
+        "content": user_text,
+        "type": "human",
+    }
+    transcript: List[Dict[str, Any]] = [
+        user_message,
+        # Assistant proposes a tool call (function)
+        {
+            "role": "assistant",
+            "content": "I'll call the calculator.",
+            "type": "ai",
+            "tool_calls": [
+                {
+                    "id": call_id,
+                    "type": "function",
+                    "function": {
+                        "name": tool_name,
+                        "arguments": '{"a": 2, "b": 3}',
+                    },
+                }
+            ],
+        },
+        # Tool response message
+        {
+            "role": "tool",
+            "name": tool_name,
+            "tool_call_id": call_id,
+            "content": "5",
+        },
+        # Final assistant message
+        {
+            "role": "assistant",
+            "content": "The result is 5.",
+            "type": "ai",
+        },
+    ]
+
+    return {"inputs": {"messages": [user_message]}, "outputs": {"messages": transcript}}
+
+
+def main() -> None:
+    """Emit one root chain run per sample prompt, with the tool-call transcript as its outputs.
+
+    The project comes from LANGCHAIN_PROJECT, then LS_PROJECT, then the
+    "ep-langgraph-examples" default. Prints a hint and returns when `langsmith`
+    cannot be imported.
+    """
+    import uuid
+    from datetime import datetime, timezone
+
+    try:
+        from langsmith import Client  # type: ignore
+    except ImportError as e:
+        print(f"Missing langsmith dependency: {e}")
+        return
+
+    project = os.getenv("LANGCHAIN_PROJECT", os.getenv("LS_PROJECT", "ep-langgraph-examples"))
+    client = Client()
+
+    samples: List[str] = [
+        "Add 2 and 3",
+        "Compute 7 + 11",
+        "Sum 10 and 25",
+    ]
+
+    for i, text in enumerate(samples, start=1):
+        payload = make_messages_with_tool_call(text)
+        # Create the run under an id chosen here so it can be completed below. The
+        # adapter reads root runs' inputs/outputs, so one root chain run per sample
+        # is all it needs.
+        run_id = uuid.uuid4()
+        client.create_run(
+            id=run_id,
+            name=f"tool-demo-{i}",
+            inputs=payload["inputs"],
+            run_type="chain",
+            project_name=project,
+        )
+        # Attach the transcript and close the run; `update_run` is the Client call
+        # that sets outputs and the end time on an existing run.
+        client.update_run(
+            run_id,
+            outputs=payload["outputs"],
+            end_time=datetime.now(timezone.utc),
+        )
+
+    print(f"Emitted {len(samples)} tool-call demo traces to project '{project}'.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/langsmith/llm_judge_from_langsmith.py b/examples/langsmith/llm_judge_from_langsmith.py
new file mode 100644
index 00000000..e17f40f0
--- /dev/null
+++ b/examples/langsmith/llm_judge_from_langsmith.py
@@ -0,0 +1,168 @@
+"""Run a quick LLM-as-judge evaluation using LangSmith datasets and evaluators.
+
+This mirrors our Langfuse example: we define a tiny dataset, a trivial target,
+and run a rubric-based LLM judge via LangSmith's evaluation API.
+
+Requirements:
+  pip install -U langsmith langchain-openai
+
+Env Vars:
+  export LANGSMITH_API_KEY=...         # required
+  export OPENAI_API_KEY=...            # optional; if absent uses heuristic judge
+  export LANGSMITH_TRACING=true        # optional, to record runs
+
+Run:
+  python python-sdk/examples/langsmith/llm_judge_from_langsmith.py
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict
+import importlib
+
+
+def _ensure_env() -> None:
+    """Default the project name and switch tracing on (so runs show in the UI) unless already set."""
+    for var, fallback in (("LANGCHAIN_PROJECT", "ep-langgraph-examples"), ("LANGSMITH_TRACING", "true")):
+        os.environ.setdefault(var, fallback)
+
+
+def main() -> None:
+    """Seed a demo dataset in LangSmith and score an uppercase-echo target with an LLM or heuristic judge."""
+    _ensure_env()
+
+    if not os.getenv("LANGSMITH_API_KEY") and not os.getenv("LANGCHAIN_API_KEY"):
+        raise SystemExit("Please set LANGSMITH_API_KEY (or LANGCHAIN_API_KEY).")
+    use_openai = bool(os.getenv("OPENAI_API_KEY"))
+
+    # Import here to allow the script to print clearer errors if deps are missing.
+    try:
+        ls = importlib.import_module("langsmith")
+        eval_mod = importlib.import_module("langsmith.evaluation")
+    except ImportError as e:
+        raise SystemExit("Missing dependency. Please `pip install -U langsmith`. ") from e
+
+    Client = getattr(ls, "Client")
+    evaluate = getattr(eval_mod, "evaluate")
+
+    client = Client()
+
+    dataset_name = "ep_langsmith_demo_ds"
+    # Create the dataset; if that fails (presumably because it already exists) read the existing one
+    try:
+        dataset = client.create_dataset(dataset_name, description="EP demo dataset for LLM-as-judge")
+    except Exception:
+        dataset = client.read_dataset(dataset_name=dataset_name)
+
+    # Seed examples (not idempotent: each run may append another copy, which is fine for a demo)
+    examples = [
+        ({"prompt": "Say hello to Bob."}, {"answer": "Hello Bob!"}),
+        ({"prompt": "What is 2+2?"}, {"answer": "4"}),
+        (
+            {"prompt": "Respond with a haiku about spring."},
+            {"answer": "Gentle rains arrive\nBuds whisper to warming winds\nEarth breathes life anew"},
+        ),
+    ]
+    for inputs, outputs in examples:
+        try:
+            client.create_example(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
+        except Exception as exc:
+            # Best-effort seeding for a throwaway demo: report the failure and keep going
+            print(f"Skipping example {inputs!r}: {exc}")
+
+    # Define the target function: pretend model that returns uppercase
+    def target_func(example_inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Stand-in model: answer with the prompt converted to upper case."""
+        text = example_inputs.get("prompt", "")
+        return {"answer": str(text).upper()}
+
+    # Define an evaluator that either uses OpenAI (if available) or a heuristic fallback
+    import json
+    import re
+
+    def _normalize_text(text: str) -> str:
+        """Lower-case *text*, trim it, and collapse whitespace runs to single spaces."""
+        return re.sub(r"\s+", " ", text.strip().lower())
+
+    def heuristic_score(pred: str, ref: str) -> float:
+        """Score *pred* against *ref*: 1.0 exact (normalized), 0.8 if *ref* is contained, else 0.0."""
+        if not ref:
+            return 0.0 if not pred else 0.5  # no reference: any non-empty answer gets partial credit
+        p = _normalize_text(pred)
+        r = _normalize_text(ref)
+        if p == r:
+            return 1.0
+        if r in p:
+            return 0.8
+        return 0.0
+
+    def _answer_text(payload: Any) -> str:
+        """Return the "answer" (or "output") field of a run/example payload as text, "" if absent."""
+        if not isinstance(payload, dict):
+            return ""
+        # str() coerces non-string answers (e.g. numbers) so text normalization never sees a non-str
+        return str(payload.get("answer") or payload.get("output") or "")
+
+    def llm_as_judge(run, example):  # type: ignore[no-untyped-def]
+        """Return a feedback dict (key/score/comment) comparing the run's answer to the example's."""
+        pred = _answer_text(getattr(run, "outputs", None))
+        ref = _answer_text(getattr(example, "outputs", None))
+
+        if not use_openai:
+            score = heuristic_score(pred, ref)
+            return {"key": "llm_judge", "score": float(score), "comment": "heuristic"}
+
+        try:
+            from langchain_openai import ChatOpenAI  # type: ignore
+        except Exception:
+            score = heuristic_score(pred, ref)
+            return {"key": "llm_judge", "score": float(score), "comment": "heuristic (no openai)"}
+
+        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
+        system = (
+            "You are an impartial grader. Compare the candidate answer to the reference answer. "
+            "Return a JSON object with fields 'score' (float 0.0-1.0) and 'reason' (short string). "
+            "Award 1.0 for semantic equivalence, 0.8 for close paraphrase, else 0.0."
+        )
+        user = json.dumps({"reference": ref, "candidate": pred})
+        try:
+            resp = llm.invoke([{"role": "system", "content": system}, {"role": "user", "content": user}])
+            content = getattr(resp, "content", "")
+            data = {}
+            try:
+                if isinstance(content, str):
+                    data = json.loads(content)
+                else:
+                    # langchain message content may be a list of dicts
+                    data = json.loads(content[0].get("text", "{}"))  # type: ignore[index]
+            except Exception:
+                data = {"score": heuristic_score(pred, ref), "reason": "fallback parse"}
+            # Clamp to [0, 1]; max() is applied first so a NaN score collapses to 0.0 rather than 1.0
+            score = min(1.0, max(0.0, float(data.get("score", 0.0))))
+            reason = str(data.get("reason", ""))[:500]
+            return {"key": "llm_judge", "score": score, "comment": reason}
+        except Exception as e:
+            score = heuristic_score(pred, ref)
+            return {"key": "llm_judge", "score": float(score), "comment": f"heuristic (error: {e})"}
+
+    print("Running evaluation... this will create an experiment in LangSmith.")
+    results = evaluate(
+        target_func,
+        data=dataset_name,
+        evaluators=[llm_as_judge],
+        experiment_prefix="ep-llm-judge-demo",
+        max_concurrency=4,
+        metadata={"source": "examples/langsmith"},
+    )
+
+    # evaluate() returns an ExperimentResults object, not a mapping, so it has no `.get("url")`;
+    # print the experiment name, which identifies the experiment in the LangSmith UI.
+    experiment_name = getattr(results, "experiment_name", None)
+    print(f"Experiment: {experiment_name or '(name unavailable)'}")
+
+    print("Done. Visit LangSmith to review scores and details.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/adapters/test_langsmith_adapter.py b/tests/adapters/test_langsmith_adapter.py
new file mode 100644
index 00000000..2f32282a
--- /dev/null
+++ b/tests/adapters/test_langsmith_adapter.py
@@ -0,0 +1,183 @@
+import types
+from types import SimpleNamespace
+from typing import Any, Dict, List
+
+import pytest
+
+from eval_protocol.adapters.langsmith import LangSmithAdapter
+from eval_protocol.models import Message
+
+
+class FakeClient:
+    def __init__(self, runs: List[Any]):
+        self._runs = runs  # canned runs handed back by list_runs, standing in for a LangSmith client
+
+    def list_runs(self, **filters: Any):  # type: ignore[no-untyped-def]
+        return iter(self._runs[: filters.get("limit")])  # only `limit` is honoured; other filters ignored
+
+
+def _msg(role: str, content: str, **kwargs: Any) -> Dict[str, Any]:
+    """Build a chat-message dict; extra keyword fields (e.g. tool_calls) are merged in after role/content."""
+    base: Dict[str, Any] = {"role": role, "content": content}
+    return {**base, **kwargs}
+
+
+def test_outputs_messages_preferred_and_dedup_user():
+    """`outputs.messages` is used as the transcript and a repeated user turn is collapsed."""
+    transcript = [
+        _msg("user", "hi"),
+        _msg("user", "hi"),  # consecutive duplicate of the previous turn
+        _msg("assistant", "hello"),
+    ]
+    run = SimpleNamespace(
+        id="r1",
+        inputs={"messages": [_msg("user", "hi")]},
+        outputs={"messages": transcript},
+    )
+
+    rows = LangSmithAdapter(client=FakeClient([run])).get_evaluation_rows(project_name="p", limit=10)
+
+    assert len(rows) == 1
+    (row,) = rows
+    roles = [m.role for m in row.messages]
+    contents = [m.content for m in row.messages]
+    # the echoed user message survives once, followed by the assistant reply
+    assert roles == ["user", "assistant"]
+    assert contents == ["hi", "hello"]
+
+
+def test_inputs_variants_prompt_user_input_input():
+    """Each supported input/output key, and bare strings, yield a user turn and an assistant turn."""
+    cases = [
+        ({"prompt": "A"}, {"content": "OA"}, "A", "OA"),
+        ({"user_input": "B"}, {"result": "OB"}, "B", "OB"),
+        ({"input": "C"}, {"answer": "OC"}, "C", "OC"),
+        ("D", "OD", "D", "OD"),
+    ]
+    runs = [
+        SimpleNamespace(id=f"p{n}", inputs=inp, outputs=out) for n, (inp, out, _, _) in enumerate(cases, start=1)
+    ]
+
+    rows = LangSmithAdapter(client=FakeClient(runs)).get_evaluation_rows(project_name="p", limit=10)
+
+    for position, (_, _, user_text, assistant_text) in enumerate(cases):
+        turns = [(m.role, m.content) for m in rows[position].messages]
+        assert ("user", user_text) in turns
+        assert ("assistant", assistant_text) in turns
+
+
+def test_outputs_variants_and_list_payloads():
+    """An `output` key and bare message lists (as inputs or outputs) are both converted."""
+    keyed_output = SimpleNamespace(id="o1", inputs=[], outputs={"output": "X"})
+    list_payloads = SimpleNamespace(id="o2", inputs=[_msg("user", "U")], outputs=[_msg("assistant", "V")])
+    fake = FakeClient([keyed_output, list_payloads])
+
+    first, second = LangSmithAdapter(client=fake).get_evaluation_rows(project_name="p", limit=10)[:2]
+
+    assert ("assistant", "X") in [(m.role, m.content) for m in first.messages]
+    second_turns = [(m.role, m.content) for m in second.messages]
+    assert ("user", "U") in second_turns
+    assert ("assistant", "V") in second_turns
+
+
+def test_tool_calls_and_tool_role_preserved():
+    """The tool-result message and the assistant message's `tool_calls` both survive conversion."""
+    tool_args = '{"a":2,"b":3}'
+    call = {
+        "id": "call_1",
+        "type": "function",
+        "function": {"name": "calculator_add", "arguments": tool_args},
+    }
+    assistant_with_tool = _msg(
+        "assistant",
+        "Tool Calls:\ncalculator_add\n" + tool_args,
+        tool_calls=[call],
+    )
+    tool_msg = {"role": "tool", "name": "calculator_add", "tool_call_id": "call_1", "content": "5"}
+    transcript = [
+        _msg("user", "Add 2 and 3"),
+        assistant_with_tool,
+        tool_msg,
+        _msg("assistant", "The result is 5."),
+    ]
+    run = SimpleNamespace(
+        id="t1",
+        inputs={"messages": [_msg("user", "Add 2 and 3")]},
+        outputs={"messages": transcript},
+    )
+
+    rows = LangSmithAdapter(client=FakeClient([run])).get_evaluation_rows(project_name="p", limit=10)
+    converted = rows[0].messages
+
+    # The tool result must come through under the "tool" role
+    tool_results = [m for m in converted if m.role == "tool"]
+    assert any((m.content or "").strip() == "5" for m in tool_results)
+
+    # At least one assistant message must still carry its tool calls
+    calling_assistants = [m for m in converted if m.role == "assistant" and m.tool_calls]
+    assert len(calling_assistants) >= 1
+    first_call = calling_assistants[0].tool_calls[0]
+    # tool_calls may be provider-native objects (attribute access) or plain dicts
+    if hasattr(first_call, "function"):
+        called_name = getattr(first_call.function, "name", None)
+    elif isinstance(first_call, dict):
+        called_name = first_call.get("function", {}).get("name")
+    else:
+        called_name = None
+    assert called_name == "calculator_add"
+
+
+def test_system_prompt_first_and_multiple_user_allowed():
+    """The system prompt stays first and distinct consecutive user turns are all kept."""
+    prompt_messages = [
+        _msg("system", "You are helpful"),
+        _msg("user", "hi"),
+        _msg("user", "hi again"),
+    ]
+    run = SimpleNamespace(
+        id="s1",
+        inputs={"messages": prompt_messages},
+        outputs={"content": "hello there"},
+    )
+
+    rows = LangSmithAdapter(client=FakeClient([run])).get_evaluation_rows(project_name="p", limit=10)
+    roles = [m.role for m in rows[0].messages]
+
+    # the system prompt leads the conversation
+    assert roles[0] == "system"
+    # the two user turns differ in content, so neither is dropped as an echo
+    assert roles[1] == "user" and roles[2] == "user"
+    # the output text is appended as the closing assistant turn
+    assert roles[-1] == "assistant"
+
+
+def test_parallel_tool_calls_normalized():
+    """Two tool calls attached to one assistant message are both preserved, in order."""
+    argument_payloads = {"c1": '{"a":2,"b":3}', "c2": '{"a":4,"b":5}'}
+    parallel_calls = [
+        {"id": call_id, "type": "function", "function": {"name": "calculator_add", "arguments": raw_args}}
+        for call_id, raw_args in argument_payloads.items()
+    ]
+    assistant_with_tools = _msg("assistant", "Two calls", tool_calls=parallel_calls)
+    run = SimpleNamespace(
+        id="pt1",
+        inputs={"messages": [_msg("user", "sum two pairs")]},
+        outputs={"messages": [assistant_with_tools]},
+    )
+
+    rows = LangSmithAdapter(client=FakeClient([run])).get_evaluation_rows(project_name="p", limit=10)
+
+    # exactly one assistant message should carry tool calls, and it should carry both of them
+    calling_assistants = [m for m in rows[0].messages if m.role == "assistant" and m.tool_calls]
+    assert len(calling_assistants) == 1
+    calls = calling_assistants[0].tool_calls
+    assert isinstance(calls, list) and len(calls) == 2
+
+    def _called_name(call):  # type: ignore[no-untyped-def]
+        # provider-native objects expose `.function.name`; plain dicts nest it under "function"
+        if hasattr(call, "function"):
+            return getattr(call, "function").name
+        return call.get("function", {}).get("name")
+
+    names = [_called_name(call) for call in calls]
+    assert names == ["calculator_add", "calculator_add"]

From a65ab80162c38e841dde895a49a81eb6af311d77 Mon Sep 17 00:00:00 2001
From: benjibc <youfychenbc5000@gmail.com>
Date: Tue, 16 Sep 2025 01:21:01 +0000
Subject: [PATCH 2/4] langsmith changes

---
 eval_protocol/adapters/langsmith.py | 194 +++++++++++++++++++++++++---
 pyproject.toml                      |   3 +
 2 files changed, 177 insertions(+), 20 deletions(-)

diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py
index 7556e1d9..2503e3ad 100644
--- a/eval_protocol/adapters/langsmith.py
+++ b/eval_protocol/adapters/langsmith.py
@@ -10,7 +10,7 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Iterable
 
 from eval_protocol.models import EvaluationRow, InputMetadata, Message
 
@@ -36,7 +36,7 @@ class LangSmithAdapter:
 
     def __init__(self, client: Optional[Client] = None) -> None:
         if not LANGSMITH_AVAILABLE:
-            raise ImportError("LangSmith not installed. Install with: pip install langsmith")
+            raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'")
         self.client = client or Client()
 
     def get_evaluation_rows(
@@ -45,6 +45,31 @@ def get_evaluation_rows(
         project_name: str,
         limit: int = 50,
         include_tool_calls: bool = True,
+        # Pass-through filters to list_runs to match LangSmith Client API
+        run_id: Optional[str] = None,
+        ids: Optional[List[str]] = None,
+        run_type: Optional[str] = None,
+        execution_order: Optional[int] = None,
+        parent_run_id: Optional[str] = None,
+        trace_id: Optional[str] = None,
+        trace_ids: Optional[List[str]] = None,
+        reference_example_id: Optional[str] = None,
+        session_name: Optional[str] = None,
+        error: Optional[bool] = None,
+        start_time: Optional[str] = None,
+        end_time: Optional[str] = None,
+        filter_expr: Optional[str] = None,  # server-side filter DSL
+        tags: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        feedback_keys: Optional[List[str]] = None,
+        feedback_source: Optional[str] = None,
+        tree_id: Optional[str] = None,
+        # ordering/pagination
+        offset: Optional[int] = None,
+        order_by: Optional[str] = None,
+        # selection
+        select: Optional[List[str]] = None,
+        **list_runs_kwargs: Any,
     ) -> List[EvaluationRow]:
         """Pull runs from LangSmith and convert to EvaluationRow format.
 
@@ -55,17 +80,57 @@ def get_evaluation_rows(
         """
         rows: List[EvaluationRow] = []
 
-        # Prefer root runs; they usually contain messages in inputs/outputs when tracing app-level flows
-        runs = list(
-            self.client.list_runs(
-                project_name=project_name,
-                is_root=True,
-                limit=limit,
-                select=["id", "inputs", "outputs"],
-            )
-        )
-
+        # Fetch runs with pass-through filters. Prefer root runs by default.
+        params: Dict[str, Any] = {"project_name": project_name, "limit": limit}
+        # Only include non-None params
+        if run_type is None:
+            params["is_root"] = True
+        for key, value in [
+            ("id", run_id),
+            ("ids", ids),
+            ("run_type", run_type),
+            ("execution_order", execution_order),
+            ("parent_run_id", parent_run_id),
+            ("trace_id", trace_id),
+            ("trace_ids", trace_ids),
+            ("reference_example_id", reference_example_id),
+            ("session_name", session_name),
+            ("error", error),
+            ("start_time", start_time),
+            ("end_time", end_time),
+            ("filter", filter_expr),
+            ("tags", tags),
+            ("metadata", metadata),
+            ("feedback_keys", feedback_keys),
+            ("feedback_source", feedback_source),
+            ("tree_id", tree_id),
+            ("offset", offset),
+            ("order_by", order_by),
+        ]:
+            if value is not None:
+                params[key] = value
+        params["select"] = select or ["id", "inputs", "outputs", "trace_id"]
+
+        # Merge any additional kwargs last to allow explicit overrides
+        if list_runs_kwargs:
+            for k, v in list_runs_kwargs.items():
+                if v is not None:
+                    params[k] = v
+
+        runs_iter: Iterable[Any] = self.client.list_runs(**params)
+
+        runs = list(runs_iter)
+        if not runs:
+            logger.warning("No LangSmith runs found for project '%s' with current filters", project_name)
+            return []
+
+        # Group by trace_id and pick the last run in each trace (assume iterator yields chronological)
+        trace_to_last_run: Dict[str, Any] = {}
         for r in runs:
+            t_id = str(getattr(r, "trace_id", "")) or str(getattr(r, "id", ""))
+            trace_to_last_run[t_id] = r
+
+        for r in trace_to_last_run.values():
             try:
                 inp = getattr(r, "inputs", None)
                 out = getattr(r, "outputs", None)
@@ -86,10 +151,9 @@ def get_evaluation_rows(
 
                 # Deduplicate consecutive identical user messages (common echo pattern)
                 def _canon(text: Any) -> str:
-                    try:
-                        return " ".join(str(text or "").strip().lower().split())
-                    except Exception:
-                        return str(text or "")
+                    # Best-effort canonicalization; avoid broad exception handling warnings by handling types
+                    text_str = str(text) if text is not None else ""
+                    return " ".join(text_str.strip().lower().split())
 
                 deduped: List[Message] = []
                 for m in ep_messages:
@@ -102,23 +166,115 @@ def _canon(text: Any) -> str:
                 if not ep_messages:
                     continue
 
+                tools = None
+                if include_tool_calls and isinstance(inp, dict):
+                    # Try to extract tool schema if present in inputs
+                    if "tools" in inp:
+                        tools = inp["tools"]
+
                 rows.append(
                     EvaluationRow(
                         messages=ep_messages,
+                        tools=tools,
                         input_metadata=InputMetadata(
                             session_data={
                                 "langsmith_run_id": str(getattr(r, "id", "")),
+                                "langsmith_trace_id": str(getattr(r, "trace_id", "")),
                                 "langsmith_project": project_name,
                             }
                         ),
                     )
                 )
-            except Exception as e:
+            except (AttributeError, ValueError, KeyError, TypeError) as e:
                 logger.warning("Failed to convert run %s: %s", getattr(r, "id", ""), e)
                 continue
 
         return rows
 
+    def get_evaluation_rows_by_ids(
+        self,
+        *,
+        run_ids: Optional[List[str]] = None,
+        trace_ids: Optional[List[str]] = None,
+        include_tool_calls: bool = True,
+        project_name: Optional[str] = None,
+    ) -> List[EvaluationRow]:
+        """Fetch specific runs and/or traces by id and convert them to EvaluationRow objects.
+
+        Both id lists are queried when given; only the last fetched run per trace id is converted.
+        """
+        results: List[EvaluationRow] = []
+        # Collect runs from both lookups; a caught fetch error is logged and returns [] for the whole call
+        fetched_runs: List[Any] = []
+        try:
+            if run_ids:
+                fetched_runs.extend(list(self.client.list_runs(ids=run_ids, select=["id", "inputs", "outputs", "trace_id"])) )
+            if trace_ids:
+                fetched_runs.extend(list(self.client.list_runs(trace_ids=trace_ids, select=["id", "inputs", "outputs", "trace_id"])) )
+        except (AttributeError, ValueError, KeyError, TypeError) as e:
+            logger.warning("Failed to fetch runs by ids: %s", e)
+            return []
+
+        if not fetched_runs:
+            logger.warning("No LangSmith runs found for provided ids")
+            return []
+
+        # One run per trace: later fetched runs overwrite earlier ones; an absent trace_id keys on the run id
+        trace_to_last_run: Dict[str, Any] = {}
+        for r in fetched_runs:
+            t_id = str(getattr(r, "trace_id", "")) or str(getattr(r, "id", ""))
+            trace_to_last_run[t_id] = r
+
+        for r in trace_to_last_run.values():
+            try:
+                inp = getattr(r, "inputs", None)
+                out = getattr(r, "outputs", None)
+
+                ep_messages: List[Message] = []
+                if isinstance(out, dict) and isinstance(out.get("messages"), list):
+                    ep_messages.extend(self._extract_messages_from_payload({"messages": out["messages"]}, include_tool_calls, is_output=True))
+                else:
+                    ep_messages.extend(self._extract_messages_from_payload(inp, include_tool_calls))
+                    ep_messages.extend(self._extract_messages_from_payload(out, include_tool_calls, is_output=True))
+
+                def _canon(text: Any) -> str:
+                    text_str = str(text) if text is not None else ""
+                    return " ".join(text_str.strip().lower().split())
+
+                deduped: List[Message] = []  # skips a user turn that repeats the previous user turn
+                for m in ep_messages:
+                    if deduped and m.role == "user" and deduped[-1].role == "user":
+                        if _canon(m.content) == _canon(deduped[-1].content):
+                            continue
+                    deduped.append(m)
+                ep_messages = deduped
+
+                if not ep_messages:
+                    continue  # no messages could be extracted from this run
+
+                tools = None  # tool schema copied from inputs["tools"] when tool calls are included
+                if include_tool_calls and isinstance(inp, dict) and "tools" in inp:
+                    tools = inp["tools"]
+
+                results.append(
+                    EvaluationRow(
+                        messages=ep_messages,
+                        tools=tools,
+                        input_metadata=InputMetadata(
+                            session_data={
+                                "langsmith_run_id": str(getattr(r, "id", "")),
+                                "langsmith_trace_id": str(getattr(r, "trace_id", "")),
+                                "langsmith_project": project_name or "",
+                            }
+                        ),
+                    )
+                )
+            except (AttributeError, ValueError, KeyError, TypeError) as e:
+                logger.warning("Failed to convert run %s: %s", getattr(r, "id", ""), e)
+                continue
+
+        return results
+
     def _extract_messages_from_payload(
         self, payload: Any, include_tool_calls: bool, *, is_output: bool = False
     ) -> List[Message]:
@@ -161,13 +317,11 @@ def _dict_to_message(msg_dict: Dict[str, Any]) -> Message:
                             # Extract id/type/function fields from dicts or provider-native objects
                             if isinstance(tc, dict):
                                 tc_id = tc.get("id", None)
-                                tc_type = tc.get("type", "function") or "function"
                                 fn = tc.get("function", {}) or {}
                                 fn_name = fn.get("name", None)
                                 fn_args = fn.get("arguments", None)
                             else:
                                 tc_id = getattr(tc, "id", None)
-                                tc_type = getattr(tc, "type", None) or "function"
                                 f = getattr(tc, "function", None)
                                 fn_name = getattr(f, "name", None) if f is not None else None
                                 fn_args = getattr(f, "arguments", None) if f is not None else None
@@ -185,7 +339,7 @@ def _dict_to_message(msg_dict: Dict[str, Any]) -> Message:
                                 )
                             )
                         tool_calls = typed_calls
-                    except Exception:
+                    except (ImportError, AttributeError, TypeError, ValueError):
                         # If OpenAI types unavailable, leave None to satisfy type checker
                         tool_calls = None
                 if "tool_call_id" in msg_dict:
diff --git a/pyproject.toml b/pyproject.toml
index 07061b96..b55ace62 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -114,6 +114,9 @@ adapters = [
     "datasets>=3.0.0",
     "transformers>=4.0.0",
 ]
+langsmith = [
+    "langsmith>=0.1.86",
+]
 bigquery = [
     "google-cloud-bigquery>=3.0.0",
     "google-auth>=2.0.0",

From f71658c986f4aa096ceacd261c9cc62e8dda8246 Mon Sep 17 00:00:00 2001
From: benjibc <youfychenbc5000@gmail.com>
Date: Tue, 16 Sep 2025 04:26:56 +0000
Subject: [PATCH 3/4] update lock

---
 uv.lock | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/uv.lock b/uv.lock
index 227436c7..6d333a22 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1275,6 +1275,9 @@ langgraph-tools = [
     { name = "langchain-fireworks" },
     { name = "langgraph" },
 ]
+langsmith = [
+    { name = "langsmith" },
+]
 openevals = [
     { name = "openevals" },
 ]
@@ -1343,6 +1346,7 @@ requires-dist = [
     { name = "langfuse", marker = "extra == 'langfuse'", specifier = ">=2.0.0" },
     { name = "langgraph", marker = "extra == 'langgraph'", specifier = ">=0.6.7" },
     { name = "langgraph", marker = "extra == 'langgraph-tools'", specifier = ">=0.6.7" },
+    { name = "langsmith", marker = "extra == 'langsmith'", specifier = ">=0.1.86" },
     { name = "litellm", specifier = ">=1.0.0" },
     { name = "loguru", specifier = ">=0.6.0" },
     { name = "mcp", specifier = ">=1.9.2" },
@@ -1390,7 +1394,7 @@ requires-dist = [
     { name = "websockets", specifier = ">=15.0.1" },
     { name = "werkzeug", marker = "extra == 'dev'", specifier = ">=2.0.0" },
 ]
-provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "bigquery", "svgbench", "pydantic", "supabase", "chinook", "langchain", "langgraph", "langgraph-tools"]
+provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "langsmith", "bigquery", "svgbench", "pydantic", "supabase", "chinook", "langchain", "langgraph", "langgraph-tools"]
 
 [package.metadata.requires-dev]
 dev = [

From e85ac0a8a231390a94fa1d4958be0b0bc5f067d5 Mon Sep 17 00:00:00 2001
From: benjibc <youfychenbc5000@gmail.com>
Date: Tue, 16 Sep 2025 04:36:40 +0000
Subject: [PATCH 4/4] formatting

---
 eval_protocol/adapters/langsmith.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py
index 2503e3ad..1d29b66a 100644
--- a/eval_protocol/adapters/langsmith.py
+++ b/eval_protocol/adapters/langsmith.py
@@ -208,9 +208,13 @@ def get_evaluation_rows_by_ids(
         fetched_runs: List[Any] = []
         try:
             if run_ids:
-                fetched_runs.extend(list(self.client.list_runs(ids=run_ids, select=["id", "inputs", "outputs", "trace_id"])) )
+                fetched_runs.extend(
+                    list(self.client.list_runs(ids=run_ids, select=["id", "inputs", "outputs", "trace_id"]))
+                )
             if trace_ids:
-                fetched_runs.extend(list(self.client.list_runs(trace_ids=trace_ids, select=["id", "inputs", "outputs", "trace_id"])) )
+                fetched_runs.extend(
+                    list(self.client.list_runs(trace_ids=trace_ids, select=["id", "inputs", "outputs", "trace_id"]))
+                )
         except (AttributeError, ValueError, KeyError, TypeError) as e:
             logger.warning("Failed to fetch runs by ids: %s", e)
             return []
@@ -232,7 +236,11 @@ def get_evaluation_rows_by_ids(
 
                 ep_messages: List[Message] = []
                 if isinstance(out, dict) and isinstance(out.get("messages"), list):
-                    ep_messages.extend(self._extract_messages_from_payload({"messages": out["messages"]}, include_tool_calls, is_output=True))
+                    ep_messages.extend(
+                        self._extract_messages_from_payload(
+                            {"messages": out["messages"]}, include_tool_calls, is_output=True
+                        )
+                    )
                 else:
                     ep_messages.extend(self._extract_messages_from_payload(inp, include_tool_calls))
                     ep_messages.extend(self._extract_messages_from_payload(out, include_tool_calls, is_output=True))