Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
407 changes: 407 additions & 0 deletions eval_protocol/adapters/langsmith.py

Large diffs are not rendered by default.

128 changes: 128 additions & 0 deletions eval_protocol/quickstart/llm_judge_langsmith.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""
LLM Judge quickstart that PULLS DATA FROM LANGSMITH and persists results locally via Eval Protocol.

This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses
LangSmith datasets/examples as the source of evaluation rows.

Setup:
pip install -U langsmith

Env vars:
export LANGSMITH_API_KEY=... # required to fetch examples
export LS_DATASET="ep_langsmith_demo_ds" # dataset to pull examples from

Judge model keys:
- Default judge is "gemini-2.5-pro" from utils; requires GEMINI_API_KEY
- Or set judge in the code to "gpt-4.1" and export OPENAI_API_KEY

Run:
pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s
"""

import os
from typing import Any, Dict, List, Optional

import pytest

from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
from eval_protocol.pytest import evaluation_test
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
from eval_protocol.quickstart.utils import (
split_multi_turn_rows,
JUDGE_CONFIGS,
calculate_bootstrap_scores,
    run_judgment,
)
from eval_protocol.adapters.langsmith import LangSmithAdapter


def fetch_langsmith_traces_as_evaluation_rows(
    project_name: Optional[str] = None,
    limit: int = 20,
) -> List[EvaluationRow]:
    """Pull root runs from a LangSmith project and convert them to EvaluationRows.

    Mirrors the Langfuse adapter shape:
    - messages are extracted from run.inputs and run.outputs
    - the assistant message from outputs is appended so split_multi_turn_rows
      can derive ground_truth
    - the run id is stored in input_metadata.session_data

    Falls back to the LS_PROJECT env var (default "ep-langgraph-examples")
    when no project name is given, and returns an empty list after logging
    if the adapter fails for any reason.
    """
    resolved_project = project_name or os.getenv("LS_PROJECT", "ep-langgraph-examples")
    try:
        rows = LangSmithAdapter().get_evaluation_rows(
            project_name=resolved_project,
            limit=limit,
            include_tool_calls=True,
        )
    except Exception as exc:  # best-effort: degrade to no rows instead of crashing
        print(f"❌ LangSmithAdapter failed: {exc}")
        return []
    return rows


@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@pytest.mark.asyncio
@evaluation_test(
    input_rows=[fetch_langsmith_traces_as_evaluation_rows()],
    completion_params=[
        {
            "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
        },
        {
            "max_tokens": 131000,
            "extra_body": {"reasoning_effort": "low"},
            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
        },
    ],
    rollout_processor=SingleTurnRolloutProcessor(),
    preprocess_fn=split_multi_turn_rows,
    mode="all",
)
async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    """LLM Judge evaluation over LangSmith-sourced rows, persisted locally by Eval Protocol.

    Mirrors quickstart/llm_judge.py, using Arena-Hard-Auto style pairwise judgment.

    Args:
        rows: Evaluation rows produced by the rollout processor.

    Returns:
        The same rows, each annotated in place with the aggregated judge score.
    """
    judge_name = "gemini-2.5-pro"

    if not rows:
        print("❌ No evaluation rows provided")
        return rows

    print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging (LangSmith source)...")

    model_name = rows[0].input_metadata.completion_params.get("model", "unknown_model")

    judgments: List[Dict[str, Any]] = []

    for row in rows:
        result = run_judgment(row, model_name, judge_name)
        # Keep only judgments where both pairwise games produced a verdict.
        if result and result["games"][0] and result["games"][1]:
            judgments.append(result)

    if not judgments:
        print("❌ No valid judgments generated")
        return rows

    print(f"✅ Generated {len(judgments)} valid judgments")

    mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)

    if mean_score == 0.0:
        print("❌ No valid scores extracted")
        return rows

    print("\n##### LLM Judge Results (90th percentile CI) #####")
    clean_model_name = model_name.split("/")[-1]
    print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
    print("original: 50.0% (CI: 50.0% - 50.0%)")

    for row in rows:
        if row.evaluation_result:
            row.evaluation_result.score = mean_score
            # Back out a standard error from the 90% CI half-width (z = 1.645).
            row.evaluation_result.standard_error = (upper_score - lower_score) / (2 * 1.645)
        else:
            row.evaluation_result = EvaluateResult(
                score=mean_score,
                reason="Aggregated LLM judge score",
                metrics={
                    "summary": MetricResult(score=mean_score, reason="Aggregated over judgments"),
                },
            )

    return rows
48 changes: 48 additions & 0 deletions examples/langgraph/test_tools_langsmith_trace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os
import pytest


@pytest.mark.skipif(os.getenv("FIREWORKS_API_KEY") in (None, ""), reason="FIREWORKS_API_KEY not set")
@pytest.mark.asyncio
async def test_tools_graph_traced_to_langsmith() -> None:
    """Run the tools graph once under @traceable so LangSmith records a
    transcript that contains an assistant tool call and a role=tool response.
    """
    # `Client` was imported but never used; only `traceable` is needed here.
    from langsmith import traceable
    from .tools_graph import build_tools_graph
    from langchain_core.messages import HumanMessage

    os.environ.setdefault("LANGSMITH_TRACING", "true")
    os.environ.setdefault("LANGCHAIN_PROJECT", os.getenv("LS_PROJECT", "ep-langgraph-examples"))

    app = build_tools_graph()

    @traceable
    async def run_once(prompt: str) -> dict:
        # Run the graph once
        _ = await app.ainvoke({"messages": [HumanMessage(content=prompt)]})
        # Return a ChatML-like transcript including a tool response so LangSmith records role=tool
        tool_args = '{"a":2,"b":3}'
        return {
            "messages": [
                {"role": "user", "content": prompt},
                {
                    "role": "assistant",
                    "content": "Tool Calls:\ncalculator_add\n" + tool_args,
                    "tool_calls": [
                        {
                            "id": "call_1",
                            "type": "function",
                            "function": {"name": "calculator_add", "arguments": tool_args},
                        }
                    ],
                },
                {
                    "role": "tool",
                    "name": "calculator_add",
                    "tool_call_id": "call_1",
                    "content": "5",
                },
                {"role": "assistant", "content": "The result is 5."},
            ]
        }

    await run_once("Use calculator_add to add 2 and 3")
68 changes: 68 additions & 0 deletions examples/langgraph/tools_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from typing import Any, Dict, List
from typing_extensions import TypedDict, Annotated


def build_tools_graph() -> Any:
    """Build and compile a one-node LangGraph app that calls a Fireworks-hosted
    LLM with a single `calculator_add` tool and synthesizes the tool's reply.

    Returns:
        The compiled LangGraph application (invoke via `ainvoke`).
    """
    from langgraph.graph import StateGraph, END
    from langgraph.graph.message import add_messages
    from langchain_core.messages import BaseMessage
    from langchain.chat_models import init_chat_model

    class State(TypedDict):
        # Conversation history; add_messages merges each node's output list.
        messages: Annotated[List[BaseMessage], add_messages]

    # Use fireworks provider; expects FIREWORKS_API_KEY
    # NOTE(review): not all langchain versions accept a `tools=` kwarg on
    # init_chat_model — the documented path is `.bind_tools(...)`; confirm
    # against the pinned langchain release.
    llm = init_chat_model(
        "accounts/fireworks/models/kimi-k2-instruct",
        model_provider="fireworks",
        temperature=0.0,
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "calculator_add",
                    "description": "Add two integers",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "a": {"type": "integer"},
                            "b": {"type": "integer"},
                        },
                        "required": ["a", "b"],
                    },
                },
            }
        ],
    )

    async def tool_router(state: State, **_: Any) -> Dict[str, Any]:
        # Single graph node: call the model, then fake-execute any tool call.
        msgs: List[BaseMessage] = state.get("messages", [])
        resp = await llm.ainvoke(msgs)
        # If tool call requested, synthesize tool result message
        try:
            tcs = getattr(resp, "tool_calls", None)
            if tcs:
                # naive parse for demo
                a, b = 0, 0
                try:
                    import json

                    # NOTE(review): langchain_core AIMessage.tool_calls are
                    # usually dicts ({"name", "args", "id"}), not objects with
                    # a `.function` attribute; if so this raises and the
                    # except below leaves a = b = 0 — confirm the version.
                    args = json.loads(tcs[0].function.arguments)
                    a = int(args.get("a", 0))
                    b = int(args.get("b", 0))
                except Exception:
                    pass
                result = a + b
                from langchain_core.messages import ToolMessage

                # Same attribute-style access as above — see the NOTE; a
                # failure here is swallowed and only `resp` is returned.
                tool_msg = ToolMessage(content=str(result), tool_call_id=tcs[0].id, name=tcs[0].function.name)
                return {"messages": [resp, tool_msg]}
        except Exception:
            pass
        return {"messages": [resp]}

    # Wire the single node as both entry point and terminal node.
    g = StateGraph(State)
    g.add_node("tool_router", tool_router)
    g.set_entry_point("tool_router")
    g.add_edge("tool_router", END)
    return g.compile()
24 changes: 24 additions & 0 deletions examples/langsmith/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# LangSmith Bootstrap Scripts

These scripts are ONLY for dumping synthetic traces into LangSmith to exercise the adapter and quickstart examples.

- `dump_traces_langsmith.py`: emits simple @traceable runs and an optional mini LangGraph echo flow.
- `emit_tool_calls.py`: emits runs that include assistant tool calls and a tool response message.

Usage:
1) Set your API key:

```bash
export LANGSMITH_API_KEY=...
export LANGSMITH_TRACING=true
export LS_PROJECT=ep-langgraph-examples
```

2) Run emitters:

```bash
python examples/langsmith/dump_traces_langsmith.py
python examples/langsmith/emit_tool_calls.py
```

These are not production examples; they exist to seed LangSmith with traces that the adapter can consume.
115 changes: 115 additions & 0 deletions examples/langsmith/dump_traces_langsmith.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Quick script to send a few throwaway traces to LangSmith.

Usage:
export LANGSMITH_API_KEY=... # required
export LANGSMITH_TRACING=true # recommended
python python-sdk/examples/langsmith/dump_traces_langsmith.py

Notes:
- This does not require any external model keys. It logs a few synthetic
traced function calls, and optionally a tiny LangGraph flow if available.
"""

import asyncio
import os
from typing import Any, Dict, List
import importlib


def _ensure_env_defaults() -> None:
# Prefer modern env vars; fall back maintained for compatibility.
if os.environ.get("LANGSMITH_TRACING") is None:
os.environ["LANGSMITH_TRACING"] = "true"
# Project name helps organize traces in the LangSmith UI
os.environ.setdefault("LANGCHAIN_PROJECT", "ep-langgraph-examples")


def _log_synthetic_traces() -> None:
traceable = None
try:
mod = importlib.import_module("langsmith")
traceable = getattr(mod, "traceable", None)
except ImportError:
pass
if traceable is None:
print("LangSmith not installed; skipping @traceable demo. `pip install langsmith`.")
return

@traceable(name="toy_pipeline")
def toy_pipeline(user_input: str) -> Dict[str, Any]:
reversed_text = user_input[::-1]
upper_text = reversed_text.upper()
return {"result": upper_text, "len": len(upper_text)}

print("Emitting synthetic traces via @traceable...")
toy_pipeline("hello langsmith")
toy_pipeline("trace number two")
toy_pipeline("final short run")


async def _maybe_run_tiny_langgraph() -> None:
"""Optionally run a tiny LangGraph flow to log a couple of runs.

This avoids any external LLM providers by using a pure-Python node.
"""
try:
graph_mod = importlib.import_module("langgraph.graph")
msg_mod = importlib.import_module("langgraph.graph.message")
lc_msgs = importlib.import_module("langchain_core.messages")
te_mod = importlib.import_module("typing_extensions")
except ImportError:
print("LangGraph/LangChain not installed; skipping tiny graph demo. `pip install langgraph langchain-core`.")
return

END = getattr(graph_mod, "END")
StateGraph = getattr(graph_mod, "StateGraph")
add_messages = getattr(msg_mod, "add_messages")
AIMessage = getattr(lc_msgs, "AIMessage")
BaseMessage = getattr(lc_msgs, "BaseMessage")
HumanMessage = getattr(lc_msgs, "HumanMessage")
Annotated = getattr(te_mod, "Annotated")
TypedDict = getattr(te_mod, "TypedDict")

class State(TypedDict): # type: ignore[misc]
messages: Annotated[List[BaseMessage], add_messages] # type: ignore[index]

async def echo_node(state: State, **_: Any) -> Dict[str, Any]:
messages: List[BaseMessage] = state.get("messages", [])
last_user = next((m for m in reversed(messages) if isinstance(m, HumanMessage)), None)
content = getattr(last_user, "content", "")
reply = AIMessage(content=f"Echo: {content}")
return {"messages": [reply]}

graph = StateGraph(State)
graph.add_node("echo", echo_node)
graph.set_entry_point("echo")
graph.add_edge("echo", END)
app = graph.compile()

print("Emitting a couple LangGraph runs...")
await app.ainvoke({"messages": [HumanMessage(content="hi there")]})
await app.ainvoke({"messages": [HumanMessage(content="how are you?")]})


def main() -> None:
    """Entry point: set env defaults, verify a LangSmith key, emit traces."""
    _ensure_env_defaults()

    # Accept either the modern or the legacy LangChain env var for the key.
    if not os.getenv("LANGSMITH_API_KEY") and not os.getenv("LANGCHAIN_API_KEY"):
        print("Missing LangSmith API key. Set LANGSMITH_API_KEY (or LANGCHAIN_API_KEY) and rerun.")
        return

    _log_synthetic_traces()

    try:
        asyncio.run(_maybe_run_tiny_langgraph())
    except RuntimeError:
        # Fallback for event loop already running (e.g. in notebooks)
        # NOTE(review): run_until_complete on an already-running loop also
        # raises RuntimeError, so this branch likely fails in the notebook
        # case it targets, and the 0.1s sleep does not guarantee the task
        # finishes — confirm; nest_asyncio or awaiting the task is safer.
        loop = asyncio.get_event_loop()
        loop.create_task(_maybe_run_tiny_langgraph())
        loop.run_until_complete(asyncio.sleep(0.1))

    print("Done. Visit LangSmith to see your new traces.")


if __name__ == "__main__":
    main()  # script entry point: emit synthetic LangSmith traces
Loading
Loading