Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
407 changes: 407 additions & 0 deletions eval_protocol/adapters/langsmith.py

Large diffs are not rendered by default.

128 changes: 128 additions & 0 deletions eval_protocol/quickstart/llm_judge_langsmith.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""
LLM Judge quickstart that PULLS DATA FROM LANGSMITH and persists results locally via Eval Protocol.

This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses
LangSmith datasets/examples as the source of evaluation rows.

Setup:
pip install -U langsmith

Env vars:
export LANGSMITH_API_KEY=... # required to fetch examples
export LS_DATASET="ep_langsmith_demo_ds" # dataset to pull examples from

Judge model keys:
- Default judge is "gemini-2.5-pro" from utils; requires GEMINI_API_KEY
- Or set judge in the code to "gpt-4.1" and export OPENAI_API_KEY

Run:
pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s
"""

import os
from typing import Any, Dict, List, Optional

import pytest

from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
from eval_protocol.pytest import evaluation_test
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
from eval_protocol.quickstart.utils import (
split_multi_turn_rows,
JUDGE_CONFIGS,
calculate_bootstrap_scores,
    run_judgment,
)
from eval_protocol.adapters.langsmith import LangSmithAdapter


def fetch_langsmith_traces_as_evaluation_rows(
    project_name: Optional[str] = None,
    limit: int = 20,
) -> List[EvaluationRow]:
    """Pull root runs from a LangSmith project and convert them to EvaluationRows.

    Mirrors the Langfuse adapter shape:
    - messages are extracted from run.inputs and run.outputs
    - the assistant message from outputs is appended so split_multi_turn_rows
      can derive ground_truth
    - the run id is stored in input_metadata.session_data

    Falls back to the LS_PROJECT env var (default "ep-langgraph-examples")
    when no project name is given, and returns an empty list after logging
    if the adapter fails for any reason.
    """
    resolved_project = project_name or os.getenv("LS_PROJECT", "ep-langgraph-examples")
    try:
        rows = LangSmithAdapter().get_evaluation_rows(
            project_name=resolved_project,
            limit=limit,
            include_tool_calls=True,
        )
    except Exception as exc:  # best-effort: degrade to no rows instead of crashing
        print(f"❌ LangSmithAdapter failed: {exc}")
        return []
    return rows


@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@pytest.mark.asyncio
@evaluation_test(
    input_rows=[fetch_langsmith_traces_as_evaluation_rows()],
    completion_params=[
        {
            "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
        },
        {
            "max_tokens": 131000,
            "extra_body": {"reasoning_effort": "low"},
            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
        },
    ],
    rollout_processor=SingleTurnRolloutProcessor(),
    preprocess_fn=split_multi_turn_rows,
    mode="all",
)
async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    """LLM Judge evaluation over LangSmith-sourced rows, persisted locally by Eval Protocol.

    Mirrors quickstart/llm_judge.py, using Arena-Hard-Auto style pairwise judgment.

    Args:
        rows: Evaluation rows produced by the rollout processor.

    Returns:
        The same rows, each annotated in place with the aggregated judge score.
    """
    judge_name = "gemini-2.5-pro"

    if not rows:
        print("❌ No evaluation rows provided")
        return rows

    print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging (LangSmith source)...")

    model_name = rows[0].input_metadata.completion_params.get("model", "unknown_model")

    judgments: List[Dict[str, Any]] = []

    for row in rows:
        result = run_judgment(row, model_name, judge_name)
        # Keep only judgments where both pairwise games produced a verdict.
        if result and result["games"][0] and result["games"][1]:
            judgments.append(result)

    if not judgments:
        print("❌ No valid judgments generated")
        return rows

    print(f"✅ Generated {len(judgments)} valid judgments")

    mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)

    if mean_score == 0.0:
        print("❌ No valid scores extracted")
        return rows

    print("\n##### LLM Judge Results (90th percentile CI) #####")
    clean_model_name = model_name.split("/")[-1]
    print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
    print("original: 50.0% (CI: 50.0% - 50.0%)")

    for row in rows:
        if row.evaluation_result:
            row.evaluation_result.score = mean_score
            # Back out a standard error from the 90% CI half-width (z = 1.645).
            row.evaluation_result.standard_error = (upper_score - lower_score) / (2 * 1.645)
        else:
            row.evaluation_result = EvaluateResult(
                score=mean_score,
                reason="Aggregated LLM judge score",
                metrics={
                    "summary": MetricResult(score=mean_score, reason="Aggregated over judgments"),
                },
            )

    return rows
48 changes: 48 additions & 0 deletions examples/langgraph/test_tools_langsmith_trace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os
import pytest


@pytest.mark.skipif(os.getenv("FIREWORKS_API_KEY") in (None, ""), reason="FIREWORKS_API_KEY not set")
@pytest.mark.asyncio
async def test_tools_graph_traced_to_langsmith() -> None:
    """Run the tools graph once under @traceable so LangSmith records a
    transcript that contains an assistant tool call and a role=tool response.
    """
    # `Client` was imported but never used; only `traceable` is needed here.
    from langsmith import traceable
    from .tools_graph import build_tools_graph
    from langchain_core.messages import HumanMessage

    os.environ.setdefault("LANGSMITH_TRACING", "true")
    os.environ.setdefault("LANGCHAIN_PROJECT", os.getenv("LS_PROJECT", "ep-langgraph-examples"))

    app = build_tools_graph()

    @traceable
    async def run_once(prompt: str) -> dict:
        # Run the graph once
        _ = await app.ainvoke({"messages": [HumanMessage(content=prompt)]})
        # Return a ChatML-like transcript including a tool response so LangSmith records role=tool
        tool_args = '{"a":2,"b":3}'
        return {
            "messages": [
                {"role": "user", "content": prompt},
                {
                    "role": "assistant",
                    "content": "Tool Calls:\ncalculator_add\n" + tool_args,
                    "tool_calls": [
                        {
                            "id": "call_1",
                            "type": "function",
                            "function": {"name": "calculator_add", "arguments": tool_args},
                        }
                    ],
                },
                {
                    "role": "tool",
                    "name": "calculator_add",
                    "tool_call_id": "call_1",
                    "content": "5",
                },
                {"role": "assistant", "content": "The result is 5."},
            ]
        }

    await run_once("Use calculator_add to add 2 and 3")
68 changes: 68 additions & 0 deletions examples/langgraph/tools_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from typing import Any, Dict, List
from typing_extensions import TypedDict, Annotated


def build_tools_graph() -> Any:
    """Build and compile a one-node LangGraph app that calls a Fireworks-hosted
    LLM with a single `calculator_add` tool and synthesizes the tool's reply.

    Returns:
        The compiled LangGraph application (invoke via `ainvoke`).
    """
    from langgraph.graph import StateGraph, END
    from langgraph.graph.message import add_messages
    from langchain_core.messages import BaseMessage
    from langchain.chat_models import init_chat_model

    class State(TypedDict):
        # Conversation history; add_messages merges each node's output list.
        messages: Annotated[List[BaseMessage], add_messages]

    # Use fireworks provider; expects FIREWORKS_API_KEY
    # NOTE(review): not all langchain versions accept a `tools=` kwarg on
    # init_chat_model — the documented path is `.bind_tools(...)`; confirm
    # against the pinned langchain release.
    llm = init_chat_model(
        "accounts/fireworks/models/kimi-k2-instruct",
        model_provider="fireworks",
        temperature=0.0,
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "calculator_add",
                    "description": "Add two integers",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "a": {"type": "integer"},
                            "b": {"type": "integer"},
                        },
                        "required": ["a", "b"],
                    },
                },
            }
        ],
    )

    async def tool_router(state: State, **_: Any) -> Dict[str, Any]:
        # Single graph node: call the model, then fake-execute any tool call.
        msgs: List[BaseMessage] = state.get("messages", [])
        resp = await llm.ainvoke(msgs)
        # If tool call requested, synthesize tool result message
        try:
            tcs = getattr(resp, "tool_calls", None)
            if tcs:
                # naive parse for demo
                a, b = 0, 0
                try:
                    import json

                    # NOTE(review): langchain_core AIMessage.tool_calls are
                    # usually dicts ({"name", "args", "id"}), not objects with
                    # a `.function` attribute; if so this raises and the
                    # except below leaves a = b = 0 — confirm the version.
                    args = json.loads(tcs[0].function.arguments)
                    a = int(args.get("a", 0))
                    b = int(args.get("b", 0))
                except Exception:
                    pass
                result = a + b
                from langchain_core.messages import ToolMessage

                # Same attribute-style access as above — see the NOTE; a
                # failure here is swallowed and only `resp` is returned.
                tool_msg = ToolMessage(content=str(result), tool_call_id=tcs[0].id, name=tcs[0].function.name)
                return {"messages": [resp, tool_msg]}
        except Exception:
            pass
        return {"messages": [resp]}

    # Wire the single node as both entry point and terminal node.
    g = StateGraph(State)
    g.add_node("tool_router", tool_router)
    g.set_entry_point("tool_router")
    g.add_edge("tool_router", END)
    return g.compile()
24 changes: 24 additions & 0 deletions examples/langsmith/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# LangSmith Bootstrap Scripts

These scripts are ONLY for dumping synthetic traces into LangSmith to exercise the adapter and quickstart examples.

- `dump_traces_langsmith.py`: emits simple @traceable runs and an optional mini LangGraph echo flow.
- `emit_tool_calls.py`: emits runs that include assistant tool calls and a tool response message.

Usage:
1) Set your API key:

```bash
export LANGSMITH_API_KEY=...
export LANGSMITH_TRACING=true
export LS_PROJECT=ep-langgraph-examples
```

2) Run emitters:

```bash
python examples/langsmith/dump_traces_langsmith.py
python examples/langsmith/emit_tool_calls.py
```

These are not production examples; they exist to seed LangSmith with traces that the adapter can consume.
115 changes: 115 additions & 0 deletions examples/langsmith/dump_traces_langsmith.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Quick script to send a few throwaway traces to LangSmith.

Usage:
export LANGSMITH_API_KEY=... # required
export LANGSMITH_TRACING=true # recommended
python python-sdk/examples/langsmith/dump_traces_langsmith.py

Notes:
- This does not require any external model keys. It logs a few synthetic
traced function calls, and optionally a tiny LangGraph flow if available.
"""

import asyncio
import os
from typing import Any, Dict, List
import importlib


def _ensure_env_defaults() -> None:
# Prefer modern env vars; fall back maintained for compatibility.
if os.environ.get("LANGSMITH_TRACING") is None:
os.environ["LANGSMITH_TRACING"] = "true"
# Project name helps organize traces in the LangSmith UI
os.environ.setdefault("LANGCHAIN_PROJECT", "ep-langgraph-examples")


def _log_synthetic_traces() -> None:
traceable = None
try:
mod = importlib.import_module("langsmith")
traceable = getattr(mod, "traceable", None)
except ImportError:
pass
if traceable is None:
print("LangSmith not installed; skipping @traceable demo. `pip install langsmith`.")
return

@traceable(name="toy_pipeline")
def toy_pipeline(user_input: str) -> Dict[str, Any]:
reversed_text = user_input[::-1]
upper_text = reversed_text.upper()
return {"result": upper_text, "len": len(upper_text)}

print("Emitting synthetic traces via @traceable...")
toy_pipeline("hello langsmith")
toy_pipeline("trace number two")
toy_pipeline("final short run")


async def _maybe_run_tiny_langgraph() -> None:
"""Optionally run a tiny LangGraph flow to log a couple of runs.

This avoids any external LLM providers by using a pure-Python node.
"""
try:
graph_mod = importlib.import_module("langgraph.graph")
msg_mod = importlib.import_module("langgraph.graph.message")
lc_msgs = importlib.import_module("langchain_core.messages")
te_mod = importlib.import_module("typing_extensions")
except ImportError:
print("LangGraph/LangChain not installed; skipping tiny graph demo. `pip install langgraph langchain-core`.")
return

END = getattr(graph_mod, "END")
StateGraph = getattr(graph_mod, "StateGraph")
add_messages = getattr(msg_mod, "add_messages")
AIMessage = getattr(lc_msgs, "AIMessage")
BaseMessage = getattr(lc_msgs, "BaseMessage")
HumanMessage = getattr(lc_msgs, "HumanMessage")
Annotated = getattr(te_mod, "Annotated")
TypedDict = getattr(te_mod, "TypedDict")

class State(TypedDict): # type: ignore[misc]
messages: Annotated[List[BaseMessage], add_messages] # type: ignore[index]

async def echo_node(state: State, **_: Any) -> Dict[str, Any]:
messages: List[BaseMessage] = state.get("messages", [])
last_user = next((m for m in reversed(messages) if isinstance(m, HumanMessage)), None)
content = getattr(last_user, "content", "")
reply = AIMessage(content=f"Echo: {content}")
return {"messages": [reply]}

graph = StateGraph(State)
graph.add_node("echo", echo_node)
graph.set_entry_point("echo")
graph.add_edge("echo", END)
app = graph.compile()

print("Emitting a couple LangGraph runs...")
await app.ainvoke({"messages": [HumanMessage(content="hi there")]})
await app.ainvoke({"messages": [HumanMessage(content="how are you?")]})


def main() -> None:
    """Entry point: set env defaults, verify a LangSmith key, emit traces."""
    _ensure_env_defaults()

    # Accept either the modern or the legacy LangChain env var for the key.
    if not os.getenv("LANGSMITH_API_KEY") and not os.getenv("LANGCHAIN_API_KEY"):
        print("Missing LangSmith API key. Set LANGSMITH_API_KEY (or LANGCHAIN_API_KEY) and rerun.")
        return

    _log_synthetic_traces()

    try:
        asyncio.run(_maybe_run_tiny_langgraph())
    except RuntimeError:
        # Fallback for event loop already running (e.g. in notebooks)
        # NOTE(review): run_until_complete on an already-running loop also
        # raises RuntimeError, so this branch likely fails in the notebook
        # case it targets, and the 0.1s sleep does not guarantee the task
        # finishes — confirm; nest_asyncio or awaiting the task is safer.
        loop = asyncio.get_event_loop()
        loop.create_task(_maybe_run_tiny_langgraph())
        loop.run_until_complete(asyncio.sleep(0.1))

    print("Done. Visit LangSmith to see your new traces.")


if __name__ == "__main__":
    main()  # script entry point: emit synthetic LangSmith traces
Loading
Loading