-
Notifications
You must be signed in to change notification settings - Fork 16
Langsmith example #176
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Langsmith example #176
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,128 @@ | ||
| """ | ||
| LLM Judge quickstart that PULLS DATA FROM LANGSMITH and persists results locally via Eval Protocol. | ||
|
|
||
| This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses | ||
| LangSmith datasets/examples as the source of evaluation rows. | ||
|
|
||
| Setup: | ||
| pip install -U langsmith | ||
|
|
||
| Env vars: | ||
| export LANGSMITH_API_KEY=... # required to fetch examples | ||
| export LS_DATASET="ep_langsmith_demo_ds" # dataset to pull examples from | ||
|
|
||
| Judge model keys: | ||
| - Default judge is "gemini-2.5-pro" from utils; requires GEMINI_API_KEY | ||
| - Or set judge in the code to "gpt-4.1" and export OPENAI_API_KEY | ||
|
|
||
| Run: | ||
| pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s | ||
| """ | ||
|
|
||
| import os | ||
| from typing import Any, Dict, List, Optional | ||
|
|
||
| import pytest | ||
|
|
||
| from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult | ||
| from eval_protocol.pytest import evaluation_test | ||
| from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor | ||
| from eval_protocol.quickstart.utils import ( | ||
| split_multi_turn_rows, | ||
| JUDGE_CONFIGS, | ||
| calculate_bootstrap_scores, | ||
| run_judgment, | ||
| ) | ||
| from eval_protocol.adapters.langsmith import LangSmithAdapter | ||
|
|
||
|
|
||
def fetch_langsmith_traces_as_evaluation_rows(
    project_name: Optional[str] = None,
    limit: int = 20,
) -> List[EvaluationRow]:
    """Pull root runs from LangSmith and convert them to EvaluationRow objects.

    The conversion mirrors the Langfuse adapter shape:
    - messages are extracted from run.inputs / run.outputs,
    - the assistant output is appended so split_multi_turn_rows can derive ground_truth,
    - the run_id is stored in input_metadata.session_data.

    Args:
        project_name: LangSmith project to read; falls back to the LS_PROJECT
            env var, then to "ep-langgraph-examples".
        limit: Maximum number of runs to fetch.

    Returns:
        The converted rows, or an empty list when the adapter fails
        (best-effort quickstart behavior).
    """
    resolved_project = project_name if project_name else os.getenv("LS_PROJECT", "ep-langgraph-examples")
    try:
        rows = LangSmithAdapter().get_evaluation_rows(
            project_name=resolved_project,
            limit=limit,
            include_tool_calls=True,
        )
    except Exception as err:
        # Best-effort: a missing key / network failure yields an empty dataset.
        print(f"❌ LangSmithAdapter failed: {err}")
        return []
    return rows
|
|
||
|
|
||
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@pytest.mark.asyncio
@evaluation_test(
    # NOTE(review): this fetch executes at import/collection time, so merely
    # collecting the suite hits the LangSmith API — confirm that is intended.
    input_rows=[fetch_langsmith_traces_as_evaluation_rows()],
    completion_params=[
        {
            "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
        },
        {
            "max_tokens": 131000,
            "extra_body": {"reasoning_effort": "low"},
            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
        },
    ],
    rollout_processor=SingleTurnRolloutProcessor(),
    # Splits multi-turn source rows into single-turn rows (see utils import).
    preprocess_fn=split_multi_turn_rows,
    mode="all",
)
async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    """LLM Judge evaluation over LangSmith-sourced rows, persisted locally by Eval Protocol.

    Mirrors quickstart/llm_judge.py, using Arena-Hard-Auto style pairwise judgment.

    Args:
        rows: Evaluation rows produced by the rollout processor.

    Returns:
        The same rows, with ``evaluation_result`` populated with the aggregated
        judge score when judging succeeds; returned unchanged on early exit.
    """

    # Default judge; per the module docstring this requires GEMINI_API_KEY
    # (switch to "gpt-4.1" + OPENAI_API_KEY as an alternative).
    judge_name = "gemini-2.5-pro"

    if not rows:
        print("❌ No evaluation rows provided")
        return rows

    print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging (LangSmith source)...")

    # All rows in a batch share completion params, so the model is read from the first row.
    model_name = rows[0].input_metadata.completion_params.get("model", "unknown_model")

    judgments: List[Dict[str, Any]] = []

    for row in rows:
        result = run_judgment(row, model_name, judge_name)
        # Keep only judgments in which both pairwise games produced a verdict.
        if result and result["games"][0] and result["games"][1]:
            judgments.append(result)

    if not judgments:
        print("❌ No valid judgments generated")
        return rows

    print(f"✅ Generated {len(judgments)} valid judgments")

    # Bootstrap gives a mean plus a 90% confidence interval over judgments.
    mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
    if mean_score == 0.0:
        print("❌ No valid scores extracted")
        return rows

    print("\n##### LLM Judge Results (90th percentile CI) #####")
    clean_model_name = model_name.split("/")[-1]
    print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
    print("original: 50.0% (CI: 50.0% - 50.0%)")

    # Attach the aggregated score to every row so Eval Protocol persists it locally.
    for row in rows:
        if row.evaluation_result:
            row.evaluation_result.score = mean_score
            # Approximate standard error from the 90% CI half-width (z ≈ 1.645).
            row.evaluation_result.standard_error = (upper_score - lower_score) / (2 * 1.645)
        else:
            row.evaluation_result = EvaluateResult(
                score=mean_score,
                reason="Aggregated LLM judge score",
                metrics={
                    "summary": MetricResult(score=mean_score, reason="Aggregated over judgments"),
                },
            )

    return rows
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,48 @@ | ||
| import os | ||
| import pytest | ||
|
|
||
|
|
||
@pytest.mark.skipif(os.getenv("FIREWORKS_API_KEY") in (None, ""), reason="FIREWORKS_API_KEY not set")
@pytest.mark.asyncio
async def test_tools_graph_traced_to_langsmith() -> None:
    """Run the tools graph once and emit a tool-call transcript to LangSmith.

    The graph invocation is real (it calls the Fireworks-backed graph), but the
    transcript returned from ``run_once`` is hand-built so the LangSmith run is
    guaranteed to record a role=tool message for the adapter to consume later.
    """
    from langsmith import Client  # NOTE(review): unused import — candidate for removal
    from langsmith import traceable
    from .tools_graph import build_tools_graph
    from langchain_core.messages import HumanMessage

    # Default the tracing config without clobbering values the user already set.
    os.environ.setdefault("LANGSMITH_TRACING", "true")
    os.environ.setdefault("LANGCHAIN_PROJECT", os.getenv("LS_PROJECT", "ep-langgraph-examples"))

    app = build_tools_graph()

    @traceable
    async def run_once(prompt: str) -> dict:
        # Run the graph once; its actual output is intentionally discarded.
        _ = await app.ainvoke({"messages": [HumanMessage(content=prompt)]})
        # Return a ChatML-like transcript including a tool response so LangSmith records role=tool
        tool_args = '{"a":2,"b":3}'
        return {
            "messages": [
                {"role": "user", "content": prompt},
                {
                    "role": "assistant",
                    "content": "Tool Calls:\ncalculator_add\n" + tool_args,
                    "tool_calls": [
                        {
                            "id": "call_1",
                            "type": "function",
                            "function": {"name": "calculator_add", "arguments": tool_args},
                        }
                    ],
                },
                {
                    # Synthetic tool result for calculator_add(2, 3).
                    "role": "tool",
                    "name": "calculator_add",
                    "tool_call_id": "call_1",
                    "content": "5",
                },
                {"role": "assistant", "content": "The result is 5."},
            ]
        }

    await run_once("Use calculator_add to add 2 and 3")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,68 @@ | ||
| from typing import Any, Dict, List | ||
| from typing_extensions import TypedDict, Annotated | ||
|
|
||
|
|
||
def build_tools_graph() -> Any:
    """Build a single-node LangGraph app that can answer a calculator_add tool call.

    The node sends the conversation to a Fireworks-hosted model with one tool
    declared (``calculator_add``). When the model requests the tool, the node
    computes the sum locally and appends a ``ToolMessage`` so the transcript
    contains a role=tool entry; otherwise only the assistant reply is returned.

    Requires FIREWORKS_API_KEY in the environment.

    Returns:
        A compiled LangGraph application.
    """
    from langgraph.graph import StateGraph, END
    from langgraph.graph.message import add_messages
    from langchain_core.messages import BaseMessage
    from langchain.chat_models import init_chat_model

    class State(TypedDict):
        messages: Annotated[List[BaseMessage], add_messages]

    # Use fireworks provider; expects FIREWORKS_API_KEY.
    # FIX: tools must be attached via bind_tools(); init_chat_model() does not
    # accept a `tools` kwarg for the underlying chat model.
    llm = init_chat_model(
        "accounts/fireworks/models/kimi-k2-instruct",
        model_provider="fireworks",
        temperature=0.0,
    ).bind_tools(
        [
            {
                "type": "function",
                "function": {
                    "name": "calculator_add",
                    "description": "Add two integers",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "a": {"type": "integer"},
                            "b": {"type": "integer"},
                        },
                        "required": ["a", "b"],
                    },
                },
            }
        ]
    )

    def _tool_call_parts(tc: Any) -> tuple:
        """Return (name, args_dict, call_id) for one tool call.

        FIX: LangChain's AIMessage.tool_calls entries are dicts with keys
        "name"/"args"/"id"; the previous code assumed OpenAI-style objects
        (tc.function.arguments / tc.id), which raised AttributeError and
        silently skipped the tool branch. Both shapes are handled here.
        """
        if isinstance(tc, dict):
            return tc.get("name"), tc.get("args") or {}, tc.get("id")
        fn = getattr(tc, "function", None)
        raw_args = getattr(fn, "arguments", None)
        try:
            import json

            args = json.loads(raw_args) if isinstance(raw_args, str) else (raw_args or {})
        except Exception:
            args = {}
        return getattr(fn, "name", None), args, getattr(tc, "id", None)

    async def tool_router(state: State, **_: Any) -> Dict[str, Any]:
        # Single node: call the model, and if it requested the calculator
        # tool, synthesize the tool result message locally (demo only).
        msgs: List[BaseMessage] = state.get("messages", [])
        resp = await llm.ainvoke(msgs)
        try:
            tcs = getattr(resp, "tool_calls", None)
            if tcs:
                name, args, call_id = _tool_call_parts(tcs[0])
                a = int(args.get("a", 0))
                b = int(args.get("b", 0))
                result = a + b
                from langchain_core.messages import ToolMessage

                tool_msg = ToolMessage(content=str(result), tool_call_id=call_id, name=name)
                return {"messages": [resp, tool_msg]}
        except Exception:
            # Best-effort demo: on any parsing problem fall back to the plain reply.
            pass
        return {"messages": [resp]}

    g = StateGraph(State)
    g.add_node("tool_router", tool_router)
    g.set_entry_point("tool_router")
    g.add_edge("tool_router", END)
    return g.compile()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| # LangSmith Bootstrap Scripts | ||
|
|
||
| These scripts are ONLY for dumping synthetic traces into LangSmith to exercise the adapter and quickstart examples. | ||
|
|
||
| - `dump_traces_langsmith.py`: emits simple @traceable runs and an optional mini LangGraph echo flow. | ||
| - `emit_tool_calls.py`: emits runs that include assistant tool calls and a tool response message. | ||
|
|
||
| Usage: | ||
| 1) Set your API key: | ||
|
|
||
| ```bash | ||
| export LANGSMITH_API_KEY=... | ||
| export LANGSMITH_TRACING=true | ||
| export LS_PROJECT=ep-langgraph-examples | ||
| ``` | ||
|
|
||
| 2) Run emitters: | ||
|
|
||
| ```bash | ||
| python examples/langsmith/dump_traces_langsmith.py | ||
| python examples/langsmith/emit_tool_calls.py | ||
| ``` | ||
|
|
||
| These are not production examples; they exist to seed LangSmith with traces that the adapter can consume. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,115 @@ | ||
| """Quick script to send a few throwaway traces to LangSmith. | ||
|
|
||
| Usage: | ||
| export LANGSMITH_API_KEY=... # required | ||
| export LANGSMITH_TRACING=true # recommended | ||
| python python-sdk/examples/langsmith/dump_traces_langsmith.py | ||
|
|
||
| Notes: | ||
| - This does not require any external model keys. It logs a few synthetic | ||
| traced function calls, and optionally a tiny LangGraph flow if available. | ||
| """ | ||
|
|
||
| import asyncio | ||
| import os | ||
| from typing import Any, Dict, List | ||
| import importlib | ||
|
|
||
|
|
||
| def _ensure_env_defaults() -> None: | ||
| # Prefer modern env vars; fall back maintained for compatibility. | ||
| if os.environ.get("LANGSMITH_TRACING") is None: | ||
| os.environ["LANGSMITH_TRACING"] = "true" | ||
| # Project name helps organize traces in the LangSmith UI | ||
| os.environ.setdefault("LANGCHAIN_PROJECT", "ep-langgraph-examples") | ||
|
|
||
|
|
||
| def _log_synthetic_traces() -> None: | ||
| traceable = None | ||
| try: | ||
| mod = importlib.import_module("langsmith") | ||
| traceable = getattr(mod, "traceable", None) | ||
| except ImportError: | ||
| pass | ||
| if traceable is None: | ||
| print("LangSmith not installed; skipping @traceable demo. `pip install langsmith`.") | ||
| return | ||
|
|
||
| @traceable(name="toy_pipeline") | ||
| def toy_pipeline(user_input: str) -> Dict[str, Any]: | ||
| reversed_text = user_input[::-1] | ||
| upper_text = reversed_text.upper() | ||
| return {"result": upper_text, "len": len(upper_text)} | ||
|
|
||
| print("Emitting synthetic traces via @traceable...") | ||
| toy_pipeline("hello langsmith") | ||
| toy_pipeline("trace number two") | ||
| toy_pipeline("final short run") | ||
|
|
||
|
|
||
| async def _maybe_run_tiny_langgraph() -> None: | ||
| """Optionally run a tiny LangGraph flow to log a couple of runs. | ||
|
|
||
| This avoids any external LLM providers by using a pure-Python node. | ||
| """ | ||
| try: | ||
| graph_mod = importlib.import_module("langgraph.graph") | ||
| msg_mod = importlib.import_module("langgraph.graph.message") | ||
| lc_msgs = importlib.import_module("langchain_core.messages") | ||
| te_mod = importlib.import_module("typing_extensions") | ||
| except ImportError: | ||
| print("LangGraph/LangChain not installed; skipping tiny graph demo. `pip install langgraph langchain-core`.") | ||
| return | ||
|
|
||
| END = getattr(graph_mod, "END") | ||
| StateGraph = getattr(graph_mod, "StateGraph") | ||
| add_messages = getattr(msg_mod, "add_messages") | ||
| AIMessage = getattr(lc_msgs, "AIMessage") | ||
| BaseMessage = getattr(lc_msgs, "BaseMessage") | ||
| HumanMessage = getattr(lc_msgs, "HumanMessage") | ||
| Annotated = getattr(te_mod, "Annotated") | ||
| TypedDict = getattr(te_mod, "TypedDict") | ||
|
|
||
| class State(TypedDict): # type: ignore[misc] | ||
| messages: Annotated[List[BaseMessage], add_messages] # type: ignore[index] | ||
|
|
||
| async def echo_node(state: State, **_: Any) -> Dict[str, Any]: | ||
| messages: List[BaseMessage] = state.get("messages", []) | ||
| last_user = next((m for m in reversed(messages) if isinstance(m, HumanMessage)), None) | ||
| content = getattr(last_user, "content", "") | ||
| reply = AIMessage(content=f"Echo: {content}") | ||
| return {"messages": [reply]} | ||
|
|
||
| graph = StateGraph(State) | ||
| graph.add_node("echo", echo_node) | ||
| graph.set_entry_point("echo") | ||
| graph.add_edge("echo", END) | ||
| app = graph.compile() | ||
|
|
||
| print("Emitting a couple LangGraph runs...") | ||
| await app.ainvoke({"messages": [HumanMessage(content="hi there")]}) | ||
| await app.ainvoke({"messages": [HumanMessage(content="how are you?")]}) | ||
|
|
||
|
|
||
def main() -> None:
    """Entry point: seed LangSmith with a few synthetic throwaway traces.

    Requires a LangSmith API key (LANGSMITH_API_KEY, or legacy
    LANGCHAIN_API_KEY); prints a message and exits early otherwise.
    """
    _ensure_env_defaults()

    if not os.getenv("LANGSMITH_API_KEY") and not os.getenv("LANGCHAIN_API_KEY"):
        print("Missing LangSmith API key. Set LANGSMITH_API_KEY (or LANGCHAIN_API_KEY) and rerun.")
        return

    _log_synthetic_traces()

    try:
        asyncio.run(_maybe_run_tiny_langgraph())
    except RuntimeError as exc:
        # FIX: the old fallback called loop.run_until_complete() on a loop that
        # asyncio.run() had just reported as already running — that raises the
        # same RuntimeError — and then only slept 0.1s without awaiting the
        # scheduled task. Instead, schedule the coroutine on the running loop
        # (e.g. in a notebook); if no loop is running, the error was something
        # else, so surface it.
        running_loop = None
        try:
            running_loop = asyncio.get_running_loop()
        except RuntimeError:
            pass
        if running_loop is None:
            raise exc
        running_loop.create_task(_maybe_run_tiny_langgraph())

    print("Done. Visit LangSmith to see your new traces.")


if __name__ == "__main__":
    main()
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.