Skip to content

Commit 5d4daa6

Browse files
committed
Langsmith example
1 parent 908d14a commit 5d4daa6

File tree

9 files changed

+1095
-0
lines changed

9 files changed

+1095
-0
lines changed
Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
"""LangSmith adapter for Eval Protocol.
2+
3+
This adapter pulls runs from LangSmith and converts them to EvaluationRow format,
4+
mirroring the behavior of the Langfuse adapter.
5+
6+
It supports extracting chat messages from inputs/outputs, and optionally includes
7+
tool calls and tool messages where present.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import logging
13+
from typing import Any, Dict, List, Optional
14+
15+
from eval_protocol.models import EvaluationRow, InputMetadata, Message
16+
17+
logger = logging.getLogger(__name__)
18+
19+
try:
20+
from langsmith import Client # type: ignore
21+
22+
LANGSMITH_AVAILABLE = True
23+
except ImportError:
24+
LANGSMITH_AVAILABLE = False
25+
26+
27+
class LangSmithAdapter:
    """Adapter to pull data from LangSmith and convert to EvaluationRow format.

    By default, fetches root runs from a project and maps inputs/outputs into
    `Message` objects. It supports a variety of input/output shapes commonly
    emitted by LangChain/LangGraph integrations, including:
    - inputs: { messages: [...] } | { prompt } | { user_input } | { input } | str | list[dict]
    - outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict]
    """

    def __init__(self, client: Optional[Client] = None) -> None:
        """Initialize the adapter.

        Args:
            client: Pre-configured LangSmith ``Client``. When omitted, a default
                client is constructed (it reads the LANGSMITH_* environment vars).

        Raises:
            ImportError: If the ``langsmith`` package is not installed.
        """
        if not LANGSMITH_AVAILABLE:
            raise ImportError("LangSmith not installed. Install with: pip install langsmith")
        self.client = client or Client()

    @staticmethod
    def _canonical_text(text: Any) -> str:
        """Normalize text for duplicate detection: lowercase, collapsed whitespace."""
        try:
            return " ".join(str(text or "").strip().lower().split())
        except Exception:
            return str(text or "")

    def get_evaluation_rows(
        self,
        *,
        project_name: str,
        limit: int = 50,
        include_tool_calls: bool = True,
    ) -> List[EvaluationRow]:
        """Pull runs from LangSmith and convert to EvaluationRow format.

        Args:
            project_name: LangSmith project to read runs from
            limit: Maximum number of rows to return
            include_tool_calls: Whether to include tool calling information when present

        Returns:
            One ``EvaluationRow`` per convertible run. Runs that yield no
            messages or fail conversion are skipped (a warning is logged).
        """
        rows: List[EvaluationRow] = []

        # Prefer root runs; they usually contain messages in inputs/outputs when tracing app-level flows
        runs = list(
            self.client.list_runs(
                project_name=project_name,
                is_root=True,
                limit=limit,
                select=["id", "inputs", "outputs"],
            )
        )

        for r in runs:
            try:
                inp = getattr(r, "inputs", None)
                out = getattr(r, "outputs", None)

                ep_messages: List[Message] = []
                # Prefer canonical conversation from outputs.messages if present to avoid duplicates
                if isinstance(out, dict) and isinstance(out.get("messages"), list):
                    ep_messages.extend(
                        self._extract_messages_from_payload(
                            {"messages": out["messages"]}, include_tool_calls, is_output=True
                        )
                    )
                else:
                    # Inputs → user messages
                    ep_messages.extend(self._extract_messages_from_payload(inp, include_tool_calls))
                    # Outputs → assistant (and possible tool messages)
                    ep_messages.extend(self._extract_messages_from_payload(out, include_tool_calls, is_output=True))

                # Deduplicate consecutive identical user messages (common echo pattern).
                # The normalizer is a staticmethod so it is not re-created per run.
                deduped: List[Message] = []
                for m in ep_messages:
                    if deduped and m.role == "user" and deduped[-1].role == "user":
                        if self._canonical_text(m.content) == self._canonical_text(deduped[-1].content):
                            continue
                    deduped.append(m)
                ep_messages = deduped

                if not ep_messages:
                    continue

                rows.append(
                    EvaluationRow(
                        messages=ep_messages,
                        input_metadata=InputMetadata(
                            session_data={
                                "langsmith_run_id": str(getattr(r, "id", "")),
                                "langsmith_project": project_name,
                            }
                        ),
                    )
                )
            except Exception as e:
                # Best-effort conversion: one malformed run must not abort the batch.
                logger.warning("Failed to convert run %s: %s", getattr(r, "id", ""), e)
                continue

        return rows

    def _extract_messages_from_payload(
        self, payload: Any, include_tool_calls: bool, *, is_output: bool = False
    ) -> List[Message]:
        """Convert an arbitrary run input/output payload into ``Message`` objects.

        Args:
            payload: Run ``inputs``/``outputs`` — a dict, list, string, or None.
            include_tool_calls: Whether to carry tool-call info onto messages.
            is_output: Whether the payload came from run outputs; controls the
                fallback role ("assistant" for outputs, "user" for inputs).

        Returns:
            Extracted messages; empty for None or unrecognized payload types.
        """
        messages: List[Message] = []
        fallback_role = "assistant" if is_output else "user"

        def _dict_to_message(msg_dict: Dict[str, Any]) -> Message:
            # Role: prefer an explicit "role"; otherwise map LangChain message types.
            role = msg_dict.get("role")
            if role is None:
                msg_type = msg_dict.get("type")
                if msg_type == "human":
                    role = "user"
                elif msg_type == "ai":
                    role = "assistant"
                elif msg_type == "system":
                    # Fix: system messages were previously mislabeled as user/assistant.
                    role = "system"
                elif msg_type == "tool":
                    # Fix: tool messages were previously mislabeled as user/assistant,
                    # contradicting the module's stated tool-message support.
                    role = "tool"
                else:
                    role = fallback_role

            content = msg_dict.get("content")
            # LangChain content parts: join text parts. Plain-string parts are kept
            # as well (previously non-dict parts were silently dropped).
            if isinstance(content, list):
                text = " ".join(
                    part.get("text", "") if isinstance(part, dict) else str(part) for part in content
                )
                content = text or str(content)

            name = msg_dict.get("name")

            tool_calls = None
            tool_call_id = None
            function_call = None
            if include_tool_calls:
                if "tool_calls" in msg_dict and isinstance(msg_dict["tool_calls"], list):
                    try:
                        from openai.types.chat.chat_completion_message_tool_call import (
                            ChatCompletionMessageToolCall,
                            Function as ChatToolFunction,
                        )

                        typed_calls: List[ChatCompletionMessageToolCall] = []
                        for tc in msg_dict["tool_calls"]:
                            # Extract id/function fields from dicts or provider-native objects
                            if isinstance(tc, dict):
                                tc_id = tc.get("id", None)
                                fn = tc.get("function", {}) or {}
                                fn_name = fn.get("name", None)
                                fn_args = fn.get("arguments", None)
                            else:
                                tc_id = getattr(tc, "id", None)
                                f = getattr(tc, "function", None)
                                fn_name = getattr(f, "name", None) if f is not None else None
                                fn_args = getattr(f, "arguments", None) if f is not None else None

                            # Build typed function object (arguments must be a string per OpenAI type)
                            fn_obj = ChatToolFunction(
                                name=str(fn_name) if fn_name is not None else "",
                                arguments=str(fn_args) if fn_args is not None else "",
                            )
                            typed_calls.append(
                                ChatCompletionMessageToolCall(
                                    id=str(tc_id) if tc_id is not None else "",
                                    # OpenAI's type only permits "function" here, so any
                                    # incoming "type" field is intentionally ignored.
                                    type="function",
                                    function=fn_obj,
                                )
                            )
                        tool_calls = typed_calls
                    except Exception:
                        # If OpenAI types unavailable, leave None to satisfy type checker
                        tool_calls = None
                if "tool_call_id" in msg_dict:
                    tool_call_id = msg_dict.get("tool_call_id")
                if "function_call" in msg_dict:
                    function_call = msg_dict.get("function_call")

            return Message(
                role=str(role),
                content=str(content) if content is not None else "",
                name=name,
                tool_call_id=tool_call_id,
                tool_calls=tool_calls,
                function_call=function_call,
            )

        if isinstance(payload, dict):
            # Common patterns, checked in priority order.
            if isinstance(payload.get("messages"), list):
                for m in payload["messages"]:
                    if isinstance(m, dict):
                        messages.append(_dict_to_message(m))
                    else:
                        messages.append(Message(role=fallback_role, content=str(m)))
            elif "prompt" in payload and isinstance(payload["prompt"], str):
                messages.append(Message(role=fallback_role, content=str(payload["prompt"])))
            elif "user_input" in payload and isinstance(payload["user_input"], str):
                messages.append(Message(role=fallback_role, content=str(payload["user_input"])))
            elif "input" in payload and isinstance(payload["input"], str):
                messages.append(Message(role=fallback_role, content=str(payload["input"])))
            elif "content" in payload and isinstance(payload["content"], str):
                messages.append(Message(role="assistant", content=str(payload["content"])))
            elif "result" in payload and isinstance(payload["result"], str):
                messages.append(Message(role="assistant", content=str(payload["result"])))
            elif "answer" in payload and isinstance(payload["answer"], str):
                messages.append(Message(role="assistant", content=str(payload["answer"])))
            elif "output" in payload and isinstance(payload["output"], str):
                messages.append(Message(role="assistant", content=str(payload["output"])))
            else:
                # Fallback: stringify the whole payload rather than dropping it
                messages.append(Message(role=fallback_role, content=str(payload)))
        elif isinstance(payload, list):
            for m in payload:
                if isinstance(m, dict):
                    messages.append(_dict_to_message(m))
                else:
                    messages.append(Message(role=fallback_role, content=str(m)))
        elif isinstance(payload, str):
            messages.append(Message(role=fallback_role, content=payload))
        # None and other payload types intentionally yield no messages.

        return messages
242+
243+
244+
def create_langsmith_adapter() -> LangSmithAdapter:
    """Factory returning a ``LangSmithAdapter`` backed by a default client."""
    adapter = LangSmithAdapter()
    return adapter
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
"""
2+
LLM Judge quickstart that PULLS DATA FROM LANGSMITH and persists results locally via Eval Protocol.
3+
4+
This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses
5+
LangSmith datasets/examples as the source of evaluation rows.
6+
7+
Setup:
8+
pip install -U langsmith
9+
10+
Env vars:
11+
export LANGSMITH_API_KEY=... # required to fetch examples
12+
export LS_DATASET="ep_langsmith_demo_ds" # dataset to pull examples from
13+
14+
Judge model keys:
15+
- Default judge is "gemini-2.5-pro" from utils; requires GEMINI_API_KEY
16+
- Or set judge in the code to "gpt-4.1" and export OPENAI_API_KEY
17+
18+
Run:
19+
pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s
20+
"""
21+
22+
import os
23+
from typing import Any, Dict, List, Optional
24+
25+
import pytest
26+
27+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
28+
from eval_protocol.pytest import evaluation_test
29+
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
30+
from eval_protocol.quickstart.utils import (
31+
split_multi_turn_rows,
32+
JUDGE_CONFIGS,
33+
calculate_bootstrap_scores,
34+
run_judgment,
35+
)
36+
from eval_protocol.adapters.langsmith import LangSmithAdapter
37+
38+
39+
def fetch_langsmith_traces_as_evaluation_rows(
    project_name: Optional[str] = None,
    limit: int = 20,
) -> List[EvaluationRow]:
    """Fetch LangSmith root runs and convert to EvaluationRow, mirroring Langfuse adapter shape.

    - Extract messages from run.inputs and run.outputs
    - Append assistant message from outputs so split_multi_turn_rows can derive ground_truth
    - Store run_id in input_metadata.session_data
    """
    resolved_project = project_name if project_name else os.getenv("LS_PROJECT", "ep-langgraph-examples")
    try:
        rows = LangSmithAdapter().get_evaluation_rows(
            project_name=resolved_project,
            limit=limit,
            include_tool_calls=True,
        )
    except Exception as e:
        # Best-effort: report the failure and let the caller proceed with no rows.
        print(f"❌ LangSmithAdapter failed: {e}")
        rows = []
    return rows
56+
57+
58+
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@pytest.mark.asyncio
@evaluation_test(
    input_rows=[fetch_langsmith_traces_as_evaluation_rows()],
    completion_params=[
        {
            "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
        },
        {
            "max_tokens": 131000,
            "extra_body": {"reasoning_effort": "low"},
            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
        },
    ],
    rollout_processor=SingleTurnRolloutProcessor(),
    preprocess_fn=split_multi_turn_rows,
    mode="all",
)
async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    """LLM Judge evaluation over LangSmith-sourced rows, persisted locally by Eval Protocol.

    Mirrors quickstart/llm_judge.py, using Arena-Hard-Auto style pairwise judgment.
    """
    judge = "gemini-2.5-pro"

    # Guard: nothing to evaluate.
    if not rows:
        print("❌ No evaluation rows provided")
        return rows

    print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging (LangSmith source)...")

    evaluated_model = rows[0].input_metadata.completion_params.get("model", "unknown_model")

    # Collect only judgments where both pairwise games produced a verdict.
    valid_judgments: List[Dict[str, Any]] = [
        verdict
        for verdict in (run_judgment(row, evaluated_model, judge) for row in rows)
        if verdict and verdict["games"][0] and verdict["games"][1]
    ]

    if not valid_judgments:
        print("❌ No valid judgments generated")
        return rows

    print(f"✅ Generated {len(valid_judgments)} valid judgments")

    score_mean, ci_lower, ci_upper = calculate_bootstrap_scores(valid_judgments)
    if score_mean == 0.0:
        print("❌ No valid scores extracted")
        return rows

    print("\n##### LLM Judge Results (90th percentile CI) #####")
    short_name = evaluated_model.split("/")[-1]
    print(f"{short_name}: {score_mean:.1%} (CI: {ci_lower:.1%} - {ci_upper:.1%})")
    print("original: 50.0% (CI: 50.0% - 50.0%)")

    # Write the aggregate score back onto every row so Eval Protocol persists it.
    for row in rows:
        existing = row.evaluation_result
        if existing:
            existing.score = score_mean
            # Derive a standard error from the 90% CI half-width (z = 1.645).
            existing.standard_error = (ci_upper - ci_lower) / (2 * 1.645)
        else:
            row.evaluation_result = EvaluateResult(
                score=score_mean,
                reason="Aggregated LLM judge score",
                metrics={
                    "summary": MetricResult(score=score_mean, reason="Aggregated over judgments"),
                },
            )

    return rows

0 commit comments

Comments
 (0)