Skip to content

Commit c40dc2e

Browse files
authored
Langsmith example (#176)
* Langsmith example * langsmith changes * update lock * formatting
1 parent 33c94b8 commit c40dc2e

File tree

11 files changed

+1265
-1
lines changed

11 files changed

+1265
-1
lines changed

eval_protocol/adapters/langsmith.py

Lines changed: 407 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
"""
2+
LLM Judge quickstart that PULLS DATA FROM LANGSMITH and persists results locally via Eval Protocol.
3+
4+
This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses
5+
LangSmith datasets/examples as the source of evaluation rows.
6+
7+
Setup:
8+
pip install -U langsmith
9+
10+
Env vars:
11+
export LANGSMITH_API_KEY=... # required to fetch examples
12+
export LS_DATASET="ep_langsmith_demo_ds" # dataset to pull examples from
13+
14+
Judge model keys:
15+
- Default judge is "gemini-2.5-pro" from utils; requires GEMINI_API_KEY
16+
- Or set judge in the code to "gpt-4.1" and export OPENAI_API_KEY
17+
18+
Run:
19+
pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s
20+
"""
21+
22+
import os
23+
from typing import Any, Dict, List, Optional
24+
25+
import pytest
26+
27+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
28+
from eval_protocol.pytest import evaluation_test
29+
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
30+
from eval_protocol.quickstart.utils import (
31+
split_multi_turn_rows,
32+
JUDGE_CONFIGS,
33+
calculate_bootstrap_scores,
34+
run_judgment,
35+
)
36+
from eval_protocol.adapters.langsmith import LangSmithAdapter
37+
38+
39+
def fetch_langsmith_traces_as_evaluation_rows(
    project_name: Optional[str] = None,
    limit: int = 20,
) -> List[EvaluationRow]:
    """Pull root runs from LangSmith and convert them into EvaluationRow objects.

    Mirrors the Langfuse adapter's row shape:
    - messages are extracted from run.inputs / run.outputs
    - the assistant message from outputs is appended so split_multi_turn_rows
      can derive ground_truth
    - run_id is stored in input_metadata.session_data

    Falls back to an empty list (after printing a notice) if the adapter fails
    for any reason, so the calling test degrades gracefully.
    """
    resolved_project = project_name if project_name else os.getenv("LS_PROJECT", "ep-langgraph-examples")
    try:
        return LangSmithAdapter().get_evaluation_rows(
            project_name=resolved_project,
            limit=limit,
            include_tool_calls=True,
        )
    except Exception as exc:
        print(f"❌ LangSmithAdapter failed: {exc}")
        return []
57+
58+
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@pytest.mark.asyncio
@evaluation_test(
    # NOTE: input_rows is evaluated at import time, so LangSmith is queried
    # when this module is collected (empty list on failure — see the fetcher).
    input_rows=[fetch_langsmith_traces_as_evaluation_rows()],
    completion_params=[
        {
            "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
        },
        {
            "max_tokens": 131000,
            "extra_body": {"reasoning_effort": "low"},
            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
        },
    ],
    rollout_processor=SingleTurnRolloutProcessor(),
    preprocess_fn=split_multi_turn_rows,
    mode="all",
)
async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    """LLM Judge evaluation over LangSmith-sourced rows, persisted locally by Eval Protocol.

    Mirrors quickstart/llm_judge.py, using Arena-Hard-Auto style pairwise judgment.
    """

    # Per the module docstring, this judge requires GEMINI_API_KEY.
    judge_name = "gemini-2.5-pro"

    if not rows:
        print("❌ No evaluation rows provided")
        return rows

    print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging (LangSmith source)...")

    # All rows in one invocation share the same completion params, so the
    # first row's model name labels the whole batch.
    model_name = rows[0].input_metadata.completion_params.get("model", "unknown_model")

    judgments: List[Dict[str, Any]] = []

    for row in rows:
        result = run_judgment(row, model_name, judge_name)
        # Keep only judgments where both pairwise games produced a verdict.
        if result and result["games"][0] and result["games"][1]:
            judgments.append(result)

    if not judgments:
        print("❌ No valid judgments generated")
        return rows

    print(f"✅ Generated {len(judgments)} valid judgments")

    mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
    if mean_score == 0.0:
        print("❌ No valid scores extracted")
        return rows

    print("\n##### LLM Judge Results (90th percentile CI) #####")
    clean_model_name = model_name.split("/")[-1]
    print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
    # Baseline: the reference model judged against itself is 50% by construction.
    print("original: 50.0% (CI: 50.0% - 50.0%)")

    # Write the aggregate score onto every row so Eval Protocol persists it.
    for row in rows:
        if row.evaluation_result:
            row.evaluation_result.score = mean_score
            # Back out a standard error from the 90% CI half-width
            # (z = 1.645 for a two-sided 90% interval).
            row.evaluation_result.standard_error = (upper_score - lower_score) / (2 * 1.645)
        else:
            row.evaluation_result = EvaluateResult(
                score=mean_score,
                reason="Aggregated LLM judge score",
                metrics={
                    "summary": MetricResult(score=mean_score, reason="Aggregated over judgments"),
                },
            )

    return rows
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import os
2+
import pytest
3+
4+
5+
@pytest.mark.skipif(os.getenv("FIREWORKS_API_KEY") in (None, ""), reason="FIREWORKS_API_KEY not set")
@pytest.mark.asyncio
async def test_tools_graph_traced_to_langsmith() -> None:
    """Run the tools graph once under LangSmith tracing.

    Logs a ChatML-like transcript that includes an assistant tool call and a
    role=tool response, so the LangSmith adapter has tool-call data to consume.
    Requires FIREWORKS_API_KEY for the model call; a LangSmith API key must be
    set for the trace to actually be recorded.
    """
    # Removed unused `from langsmith import Client` (the client is never used here).
    from langsmith import traceable
    from langchain_core.messages import HumanMessage

    from .tools_graph import build_tools_graph

    # Enable tracing and choose a project, without clobbering caller-set values.
    os.environ.setdefault("LANGSMITH_TRACING", "true")
    os.environ.setdefault("LANGCHAIN_PROJECT", os.getenv("LS_PROJECT", "ep-langgraph-examples"))

    app = build_tools_graph()

    @traceable
    async def run_once(prompt: str) -> dict:
        # Run the graph once; its output is ignored — the canned transcript
        # below is what gets recorded as this traced run's output.
        _ = await app.ainvoke({"messages": [HumanMessage(content=prompt)]})
        # Return a ChatML-like transcript including a tool response so LangSmith records role=tool
        tool_args = '{"a":2,"b":3}'
        return {
            "messages": [
                {"role": "user", "content": prompt},
                {
                    "role": "assistant",
                    "content": "Tool Calls:\ncalculator_add\n" + tool_args,
                    "tool_calls": [
                        {
                            "id": "call_1",
                            "type": "function",
                            "function": {"name": "calculator_add", "arguments": tool_args},
                        }
                    ],
                },
                {
                    "role": "tool",
                    "name": "calculator_add",
                    "tool_call_id": "call_1",
                    "content": "5",
                },
                {"role": "assistant", "content": "The result is 5."},
            ]
        }

    await run_once("Use calculator_add to add 2 and 3")

examples/langgraph/tools_graph.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
from typing import Any, Dict, List
2+
from typing_extensions import TypedDict, Annotated
3+
4+
5+
def build_tools_graph() -> Any:
6+
from langgraph.graph import StateGraph, END
7+
from langgraph.graph.message import add_messages
8+
from langchain_core.messages import BaseMessage
9+
from langchain.chat_models import init_chat_model
10+
11+
class State(TypedDict):
12+
messages: Annotated[List[BaseMessage], add_messages]
13+
14+
# Use fireworks provider; expects FIREWORKS_API_KEY
15+
llm = init_chat_model(
16+
"accounts/fireworks/models/kimi-k2-instruct",
17+
model_provider="fireworks",
18+
temperature=0.0,
19+
tools=[
20+
{
21+
"type": "function",
22+
"function": {
23+
"name": "calculator_add",
24+
"description": "Add two integers",
25+
"parameters": {
26+
"type": "object",
27+
"properties": {
28+
"a": {"type": "integer"},
29+
"b": {"type": "integer"},
30+
},
31+
"required": ["a", "b"],
32+
},
33+
},
34+
}
35+
],
36+
)
37+
38+
async def tool_router(state: State, **_: Any) -> Dict[str, Any]:
39+
msgs: List[BaseMessage] = state.get("messages", [])
40+
resp = await llm.ainvoke(msgs)
41+
# If tool call requested, synthesize tool result message
42+
try:
43+
tcs = getattr(resp, "tool_calls", None)
44+
if tcs:
45+
# naive parse for demo
46+
a, b = 0, 0
47+
try:
48+
import json
49+
50+
args = json.loads(tcs[0].function.arguments)
51+
a = int(args.get("a", 0))
52+
b = int(args.get("b", 0))
53+
except Exception:
54+
pass
55+
result = a + b
56+
from langchain_core.messages import ToolMessage
57+
58+
tool_msg = ToolMessage(content=str(result), tool_call_id=tcs[0].id, name=tcs[0].function.name)
59+
return {"messages": [resp, tool_msg]}
60+
except Exception:
61+
pass
62+
return {"messages": [resp]}
63+
64+
g = StateGraph(State)
65+
g.add_node("tool_router", tool_router)
66+
g.set_entry_point("tool_router")
67+
g.add_edge("tool_router", END)
68+
return g.compile()

examples/langsmith/README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# LangSmith Bootstrap Scripts

These scripts are ONLY for dumping synthetic traces into LangSmith to exercise the adapter and quickstart examples.

- `dump_traces_langsmith.py`: emits simple `@traceable` runs and an optional mini LangGraph echo flow.
- `emit_tool_calls.py`: emits runs that include assistant tool calls and a tool response message.

Usage:

1) Set your API key:

```bash
export LANGSMITH_API_KEY=...
export LANGSMITH_TRACING=true
export LS_PROJECT=ep-langgraph-examples
```

2) Run emitters:

```bash
python examples/langsmith/dump_traces_langsmith.py
python examples/langsmith/emit_tool_calls.py
```

These are not production examples; they exist to seed LangSmith with traces that the adapter can consume.
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
"""Quick script to send a few throwaway traces to LangSmith.
2+
3+
Usage:
4+
export LANGSMITH_API_KEY=... # required
5+
export LANGSMITH_TRACING=true # recommended
6+
python python-sdk/examples/langsmith/dump_traces_langsmith.py
7+
8+
Notes:
9+
- This does not require any external model keys. It logs a few synthetic
10+
traced function calls, and optionally a tiny LangGraph flow if available.
11+
"""
12+
13+
import asyncio
14+
import os
15+
from typing import Any, Dict, List
16+
import importlib
17+
18+
19+
def _ensure_env_defaults() -> None:
20+
# Prefer modern env vars; fall back maintained for compatibility.
21+
if os.environ.get("LANGSMITH_TRACING") is None:
22+
os.environ["LANGSMITH_TRACING"] = "true"
23+
# Project name helps organize traces in the LangSmith UI
24+
os.environ.setdefault("LANGCHAIN_PROJECT", "ep-langgraph-examples")
25+
26+
27+
def _log_synthetic_traces() -> None:
28+
traceable = None
29+
try:
30+
mod = importlib.import_module("langsmith")
31+
traceable = getattr(mod, "traceable", None)
32+
except ImportError:
33+
pass
34+
if traceable is None:
35+
print("LangSmith not installed; skipping @traceable demo. `pip install langsmith`.")
36+
return
37+
38+
@traceable(name="toy_pipeline")
39+
def toy_pipeline(user_input: str) -> Dict[str, Any]:
40+
reversed_text = user_input[::-1]
41+
upper_text = reversed_text.upper()
42+
return {"result": upper_text, "len": len(upper_text)}
43+
44+
print("Emitting synthetic traces via @traceable...")
45+
toy_pipeline("hello langsmith")
46+
toy_pipeline("trace number two")
47+
toy_pipeline("final short run")
48+
49+
50+
async def _maybe_run_tiny_langgraph() -> None:
51+
"""Optionally run a tiny LangGraph flow to log a couple of runs.
52+
53+
This avoids any external LLM providers by using a pure-Python node.
54+
"""
55+
try:
56+
graph_mod = importlib.import_module("langgraph.graph")
57+
msg_mod = importlib.import_module("langgraph.graph.message")
58+
lc_msgs = importlib.import_module("langchain_core.messages")
59+
te_mod = importlib.import_module("typing_extensions")
60+
except ImportError:
61+
print("LangGraph/LangChain not installed; skipping tiny graph demo. `pip install langgraph langchain-core`.")
62+
return
63+
64+
END = getattr(graph_mod, "END")
65+
StateGraph = getattr(graph_mod, "StateGraph")
66+
add_messages = getattr(msg_mod, "add_messages")
67+
AIMessage = getattr(lc_msgs, "AIMessage")
68+
BaseMessage = getattr(lc_msgs, "BaseMessage")
69+
HumanMessage = getattr(lc_msgs, "HumanMessage")
70+
Annotated = getattr(te_mod, "Annotated")
71+
TypedDict = getattr(te_mod, "TypedDict")
72+
73+
class State(TypedDict): # type: ignore[misc]
74+
messages: Annotated[List[BaseMessage], add_messages] # type: ignore[index]
75+
76+
async def echo_node(state: State, **_: Any) -> Dict[str, Any]:
77+
messages: List[BaseMessage] = state.get("messages", [])
78+
last_user = next((m for m in reversed(messages) if isinstance(m, HumanMessage)), None)
79+
content = getattr(last_user, "content", "")
80+
reply = AIMessage(content=f"Echo: {content}")
81+
return {"messages": [reply]}
82+
83+
graph = StateGraph(State)
84+
graph.add_node("echo", echo_node)
85+
graph.set_entry_point("echo")
86+
graph.add_edge("echo", END)
87+
app = graph.compile()
88+
89+
print("Emitting a couple LangGraph runs...")
90+
await app.ainvoke({"messages": [HumanMessage(content="hi there")]})
91+
await app.ainvoke({"messages": [HumanMessage(content="how are you?")]})
92+
93+
94+
def main() -> None:
    """Entry point: verify a LangSmith key is set, emit synthetic traces, then
    optionally run the tiny LangGraph demo flow."""
    _ensure_env_defaults()

    if not os.getenv("LANGSMITH_API_KEY") and not os.getenv("LANGCHAIN_API_KEY"):
        print("Missing LangSmith API key. Set LANGSMITH_API_KEY (or LANGCHAIN_API_KEY) and rerun.")
        return

    _log_synthetic_traces()

    try:
        asyncio.run(_maybe_run_tiny_langgraph())
    except RuntimeError:
        # asyncio.run() refuses to start when an event loop is already running
        # (e.g. inside a notebook). The previous fallback called
        # run_until_complete on that same running loop, which raises again and
        # never awaited the scheduled task. Instead, run the coroutine to
        # completion on a fresh loop in a worker thread.
        import threading

        worker = threading.Thread(target=lambda: asyncio.run(_maybe_run_tiny_langgraph()))
        worker.start()
        worker.join()

    print("Done. Visit LangSmith to see your new traces.")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)