From 5d4daa66f64598bfbc79955b879ac41a95a71036 Mon Sep 17 00:00:00 2001 From: benjibc Date: Mon, 15 Sep 2025 05:10:20 +0000 Subject: [PATCH 1/4] Langsmith example --- eval_protocol/adapters/langsmith.py | 245 ++++++++++++++++++ .../quickstart/llm_judge_langsmith.py | 128 +++++++++ .../langgraph/test_tools_langsmith_trace.py | 48 ++++ examples/langgraph/tools_graph.py | 68 +++++ examples/langsmith/README.md | 24 ++ examples/langsmith/dump_traces_langsmith.py | 115 ++++++++ examples/langsmith/emit_tool_calls.py | 116 +++++++++ .../langsmith/llm_judge_from_langsmith.py | 168 ++++++++++++ tests/adapters/test_langsmith_adapter.py | 183 +++++++++++++ 9 files changed, 1095 insertions(+) create mode 100644 eval_protocol/adapters/langsmith.py create mode 100644 eval_protocol/quickstart/llm_judge_langsmith.py create mode 100644 examples/langgraph/test_tools_langsmith_trace.py create mode 100644 examples/langgraph/tools_graph.py create mode 100644 examples/langsmith/README.md create mode 100644 examples/langsmith/dump_traces_langsmith.py create mode 100644 examples/langsmith/emit_tool_calls.py create mode 100644 examples/langsmith/llm_judge_from_langsmith.py create mode 100644 tests/adapters/test_langsmith_adapter.py diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py new file mode 100644 index 00000000..7556e1d9 --- /dev/null +++ b/eval_protocol/adapters/langsmith.py @@ -0,0 +1,245 @@ +"""LangSmith adapter for Eval Protocol. + +This adapter pulls runs from LangSmith and converts them to EvaluationRow format, +mirroring the behavior of the Langfuse adapter. + +It supports extracting chat messages from inputs/outputs, and optionally includes +tool calls and tool messages where present. +""" + +from __future__ import annotations + +import logging +from typing import Any, Dict, List, Optional + +from eval_protocol.models import EvaluationRow, InputMetadata, Message + +logger = logging.getLogger(__name__) + +try: + from langsmith import Client # type: ignore + + LANGSMITH_AVAILABLE = True +except ImportError: + LANGSMITH_AVAILABLE = False + + +class LangSmithAdapter: + """Adapter to pull data from LangSmith and convert to EvaluationRow format. + + By default, fetches root runs from a project and maps inputs/outputs into + `Message` objects. It supports a variety of input/output shapes commonly + emitted by LangChain/LangGraph integrations, including: + - inputs: { messages: [...] } | { prompt } | { user_input } | { input } | str | list[dict] + - outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict] + """ + + def __init__(self, client: Optional[Client] = None) -> None: + if not LANGSMITH_AVAILABLE: + raise ImportError("LangSmith not installed. Install with: pip install langsmith") + self.client = client or Client() + + def get_evaluation_rows( + self, + *, + project_name: str, + limit: int = 50, + include_tool_calls: bool = True, + ) -> List[EvaluationRow]: + """Pull runs from LangSmith and convert to EvaluationRow format. + + Args: + project_name: LangSmith project to read runs from + limit: Maximum number of rows to return + include_tool_calls: Whether to include tool calling information when present + """ + rows: List[EvaluationRow] = [] + + # Prefer root runs; they usually contain messages in inputs/outputs when tracing app-level flows + runs = list( + self.client.list_runs( + project_name=project_name, + is_root=True, + limit=limit, + select=["id", "inputs", "outputs"], + ) + ) + + for r in runs: + try: + inp = getattr(r, "inputs", None) + out = getattr(r, "outputs", None) + + ep_messages: List[Message] = [] + # Prefer canonical conversation from outputs.messages if present to avoid duplicates + if isinstance(out, dict) and isinstance(out.get("messages"), list): + ep_messages.extend( + self._extract_messages_from_payload( + {"messages": out["messages"]}, include_tool_calls, is_output=True + ) + ) + else: + # Inputs → user messages + ep_messages.extend(self._extract_messages_from_payload(inp, include_tool_calls)) + # Outputs → assistant (and possible tool messages) + ep_messages.extend(self._extract_messages_from_payload(out, include_tool_calls, is_output=True)) + + # Deduplicate consecutive identical user messages (common echo pattern) + def _canon(text: Any) -> str: + try: + return " ".join(str(text or "").strip().lower().split()) + except Exception: + return str(text or "") + + deduped: List[Message] = [] + for m in ep_messages: + if deduped and m.role == "user" and deduped[-1].role == "user": + if _canon(m.content) == _canon(deduped[-1].content): + continue + deduped.append(m) + ep_messages = deduped + + if not ep_messages: + continue + + rows.append( + EvaluationRow( + messages=ep_messages, + input_metadata=InputMetadata( + session_data={ + "langsmith_run_id": str(getattr(r, "id", "")), + "langsmith_project": project_name, + } + ), + ) + ) + except Exception as e: + logger.warning("Failed to convert run %s: %s", getattr(r, "id", ""), e) + continue + + return rows + + def _extract_messages_from_payload( + self, payload: Any, include_tool_calls: bool, *, is_output: bool = False + ) -> List[Message]: + messages: List[Message] = [] + + def _dict_to_message(msg_dict: Dict[str, Any]) -> Message: + # Role + role = msg_dict.get("role") + if role is None: + # Map LangChain types to roles if available + msg_type = msg_dict.get("type") + if msg_type == "human": + role = "user" + elif msg_type == "ai": + role = "assistant" + else: + role = "assistant" if is_output else "user" + + content = msg_dict.get("content") + # LangChain content parts + if isinstance(content, list): + text = " ".join([part.get("text", "") for part in content if isinstance(part, dict)]) + content = text or str(content) + + name = msg_dict.get("name") + + tool_calls = None + tool_call_id = None + function_call = None + if include_tool_calls: + if "tool_calls" in msg_dict and isinstance(msg_dict["tool_calls"], list): + try: + from openai.types.chat.chat_completion_message_tool_call import ( + ChatCompletionMessageToolCall, + Function as ChatToolFunction, + ) + + typed_calls: List[ChatCompletionMessageToolCall] = [] + for tc in msg_dict["tool_calls"]: + # Extract id/type/function fields from dicts or provider-native objects + if isinstance(tc, dict): + tc_id = tc.get("id", None) + tc_type = tc.get("type", "function") or "function" + fn = tc.get("function", {}) or {} + fn_name = fn.get("name", None) + fn_args = fn.get("arguments", None) + else: + tc_id = getattr(tc, "id", None) + tc_type = getattr(tc, "type", None) or "function" + f = getattr(tc, "function", None) + fn_name = getattr(f, "name", None) if f is not None else None + fn_args = getattr(f, "arguments", None) if f is not None else None + + # Build typed function object (arguments must be a string per OpenAI type) + fn_obj = ChatToolFunction( + name=str(fn_name) if fn_name is not None else "", + arguments=str(fn_args) if fn_args is not None else "", + ) + typed_calls.append( + ChatCompletionMessageToolCall( + id=str(tc_id) if tc_id is not None else "", + type="function", + function=fn_obj, + ) + ) + tool_calls = typed_calls + except Exception: + # If OpenAI types unavailable, leave None to satisfy type checker + tool_calls = None + if "tool_call_id" in msg_dict: + tool_call_id = msg_dict.get("tool_call_id") + if "function_call" in msg_dict: + function_call = msg_dict.get("function_call") + + return Message( + role=str(role), + content=str(content) if content is not None else "", + name=name, + tool_call_id=tool_call_id, + tool_calls=tool_calls, + function_call=function_call, + ) + + if isinstance(payload, dict): + # Common patterns + if isinstance(payload.get("messages"), list): + for m in payload["messages"]: + if isinstance(m, dict): + messages.append(_dict_to_message(m)) + else: + messages.append(Message(role="assistant" if is_output else "user", content=str(m))) + elif "prompt" in payload and isinstance(payload["prompt"], str): + messages.append(Message(role="user" if not is_output else "assistant", content=str(payload["prompt"]))) + elif "user_input" in payload and isinstance(payload["user_input"], str): + messages.append( + Message(role="user" if not is_output else "assistant", content=str(payload["user_input"])) + ) + elif "input" in payload and isinstance(payload["input"], str): + messages.append(Message(role="user" if not is_output else "assistant", content=str(payload["input"]))) + elif "content" in payload and isinstance(payload["content"], str): + messages.append(Message(role="assistant", content=str(payload["content"]))) + elif "result" in payload and isinstance(payload["result"], str): + messages.append(Message(role="assistant", content=str(payload["result"]))) + elif "answer" in payload and isinstance(payload["answer"], str): + messages.append(Message(role="assistant", content=str(payload["answer"]))) + elif "output" in payload and isinstance(payload["output"], str): + messages.append(Message(role="assistant", content=str(payload["output"]))) + else: + # Fallback: stringify + messages.append(Message(role="assistant" if is_output else "user", content=str(payload))) + elif isinstance(payload, list): + for m in payload: + if isinstance(m, dict): + messages.append(_dict_to_message(m)) + else: + messages.append(Message(role="assistant" if is_output else "user", content=str(m))) + elif isinstance(payload, str): + messages.append(Message(role="assistant" if is_output else "user", content=payload)) + + return messages + + +def create_langsmith_adapter() -> LangSmithAdapter: + return LangSmithAdapter() diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py new file mode 100644 index 00000000..f4efb7f5 --- /dev/null +++ b/eval_protocol/quickstart/llm_judge_langsmith.py @@ -0,0 +1,128 @@ +""" +LLM Judge quickstart that PULLS DATA FROM LANGSMITH and persists results locally via Eval Protocol. + +This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses +LangSmith datasets/examples as the source of evaluation rows. + +Setup: + pip install -U langsmith + +Env vars: + export LANGSMITH_API_KEY=... # required to fetch examples + export LS_DATASET="ep_langsmith_demo_ds" # dataset to pull examples from + +Judge model keys: + - Default judge is "gemini-2.5-pro" from utils; requires GEMINI_API_KEY + - Or set judge in the code to "gpt-4.1" and export OPENAI_API_KEY + +Run: + pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s +""" + +import os +from typing import Any, Dict, List, Optional + +import pytest + +from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor +from eval_protocol.quickstart.utils import ( + split_multi_turn_rows, + JUDGE_CONFIGS, + calculate_bootstrap_scores, + run_judgment, +) +from eval_protocol.adapters.langsmith import LangSmithAdapter + + +def fetch_langsmith_traces_as_evaluation_rows( + project_name: Optional[str] = None, + limit: int = 20, +) -> List[EvaluationRow]: + """Fetch LangSmith root runs and convert to EvaluationRow, mirroring Langfuse adapter shape. + + - Extract messages from run.inputs and run.outputs + - Append assistant message from outputs so split_multi_turn_rows can derive ground_truth + - Store run_id in input_metadata.session_data + """ + project = project_name or os.getenv("LS_PROJECT", "ep-langgraph-examples") + try: + adapter = LangSmithAdapter() + return adapter.get_evaluation_rows(project_name=project, limit=limit, include_tool_calls=True) + except Exception as e: + print(f"❌ LangSmithAdapter failed: {e}") + return [] + + +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") +@pytest.mark.asyncio +@evaluation_test( + input_rows=[fetch_langsmith_traces_as_evaluation_rows()], + completion_params=[ + { + "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507", + }, + { + "max_tokens": 131000, + "extra_body": {"reasoning_effort": "low"}, + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + }, + ], + rollout_processor=SingleTurnRolloutProcessor(), + preprocess_fn=split_multi_turn_rows, + mode="all", +) +async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[EvaluationRow]: + """LLM Judge evaluation over LangSmith-sourced rows, persisted locally by Eval Protocol. + + Mirrors quickstart/llm_judge.py, using Arena-Hard-Auto style pairwise judgment. + """ + + judge_name = "gemini-2.5-pro" + + if not rows: + print("❌ No evaluation rows provided") + return rows + + print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging (LangSmith source)...") + + model_name = rows[0].input_metadata.completion_params.get("model", "unknown_model") + + judgments: List[Dict[str, Any]] = [] + + for row in rows: + result = run_judgment(row, model_name, judge_name) + if result and result["games"][0] and result["games"][1]: + judgments.append(result) + + if not judgments: + print("❌ No valid judgments generated") + return rows + + print(f"✅ Generated {len(judgments)} valid judgments") + + mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments) + if mean_score == 0.0: + print("❌ No valid scores extracted") + return rows + + print("\n##### LLM Judge Results (90th percentile CI) #####") + clean_model_name = model_name.split("/")[-1] + print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})") + print("original: 50.0% (CI: 50.0% - 50.0%)") + + for row in rows: + if row.evaluation_result: + row.evaluation_result.score = mean_score + row.evaluation_result.standard_error = (upper_score - lower_score) / (2 * 1.645) + else: + row.evaluation_result = EvaluateResult( + score=mean_score, + reason="Aggregated LLM judge score", + metrics={ + "summary": MetricResult(score=mean_score, reason="Aggregated over judgments"), + }, + ) + + return rows diff --git a/examples/langgraph/test_tools_langsmith_trace.py b/examples/langgraph/test_tools_langsmith_trace.py new file mode 100644 index 00000000..064b639b --- /dev/null +++ b/examples/langgraph/test_tools_langsmith_trace.py @@ -0,0 +1,48 @@ +import os +import pytest + + +@pytest.mark.skipif(os.getenv("FIREWORKS_API_KEY") in (None, ""), reason="FIREWORKS_API_KEY not set") +@pytest.mark.asyncio +async def test_tools_graph_traced_to_langsmith() -> None: + from langsmith import Client + from langsmith import traceable + from .tools_graph import build_tools_graph + from langchain_core.messages import HumanMessage + + os.environ.setdefault("LANGSMITH_TRACING", "true") + os.environ.setdefault("LANGCHAIN_PROJECT", os.getenv("LS_PROJECT", "ep-langgraph-examples")) + + app = build_tools_graph() + + @traceable + async def run_once(prompt: str) -> dict: + # Run the graph once + _ = await app.ainvoke({"messages": [HumanMessage(content=prompt)]}) + # Return a ChatML-like transcript including a tool response so LangSmith records role=tool + tool_args = '{"a":2,"b":3}' + return { + "messages": [ + {"role": "user", "content": prompt}, + { + "role": "assistant", + "content": "Tool Calls:\ncalculator_add\n" + tool_args, + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "calculator_add", "arguments": tool_args}, + } + ], + }, + { + "role": "tool", + "name": "calculator_add", + "tool_call_id": "call_1", + "content": "5", + }, + {"role": "assistant", "content": "The result is 5."}, + ] + } + + await run_once("Use calculator_add to add 2 and 3") diff --git a/examples/langgraph/tools_graph.py b/examples/langgraph/tools_graph.py new file mode 100644 index 00000000..523e613f --- /dev/null +++ b/examples/langgraph/tools_graph.py @@ -0,0 +1,68 @@ +from typing import Any, Dict, List +from typing_extensions import TypedDict, Annotated + + +def build_tools_graph() -> Any: + from langgraph.graph import StateGraph, END + from langgraph.graph.message import add_messages + from langchain_core.messages import BaseMessage + from langchain.chat_models import init_chat_model + + class State(TypedDict): + messages: Annotated[List[BaseMessage], add_messages] + + # Use fireworks provider; expects FIREWORKS_API_KEY + llm = init_chat_model( + "accounts/fireworks/models/kimi-k2-instruct", + model_provider="fireworks", + temperature=0.0, + tools=[ + { + "type": "function", + "function": { + "name": "calculator_add", + "description": "Add two integers", + "parameters": { + "type": "object", + "properties": { + "a": {"type": "integer"}, + "b": {"type": "integer"}, + }, + "required": ["a", "b"], + }, + }, + } + ], + ) + + async def tool_router(state: State, **_: Any) -> Dict[str, Any]: + msgs: List[BaseMessage] = state.get("messages", []) + resp = await llm.ainvoke(msgs) + # If tool call requested, synthesize tool result message + try: + tcs = getattr(resp, "tool_calls", None) + if tcs: + # naive parse for demo + a, b = 0, 0 + try: + import json + + args = json.loads(tcs[0].function.arguments) + a = int(args.get("a", 0)) + b = int(args.get("b", 0)) + except Exception: + pass + result = a + b + from langchain_core.messages import ToolMessage + + tool_msg = ToolMessage(content=str(result), tool_call_id=tcs[0].id, name=tcs[0].function.name) + return {"messages": [resp, tool_msg]} + except Exception: + pass + return {"messages": [resp]} + + g = StateGraph(State) + g.add_node("tool_router", tool_router) + g.set_entry_point("tool_router") + g.add_edge("tool_router", END) + return g.compile() diff --git a/examples/langsmith/README.md b/examples/langsmith/README.md new file mode 100644 index 00000000..079cd874 --- /dev/null +++ b/examples/langsmith/README.md @@ -0,0 +1,24 @@ +# LangSmith Bootstrap Scripts + +These scripts are ONLY for dumping synthetic traces into LangSmith to exercise the adapter and quickstart examples. + +- `dump_traces_langsmith.py`: emits simple @traceable runs and an optional mini LangGraph echo flow. +- `emit_tool_calls.py`: emits runs that include assistant tool calls and a tool response message. + +Usage: +1) Set your API key: + +```bash +export LANGSMITH_API_KEY=... +export LANGSMITH_TRACING=true +export LS_PROJECT=ep-langgraph-examples +``` + +2) Run emitters: + +```bash +python examples/langsmith/dump_traces_langsmith.py +python examples/langsmith/emit_tool_calls.py +``` + +These are not production examples; they exist to seed LangSmith with traces that the adapter can consume. diff --git a/examples/langsmith/dump_traces_langsmith.py b/examples/langsmith/dump_traces_langsmith.py new file mode 100644 index 00000000..68bca4f6 --- /dev/null +++ b/examples/langsmith/dump_traces_langsmith.py @@ -0,0 +1,115 @@ +"""Quick script to send a few throwaway traces to LangSmith. + +Usage: + export LANGSMITH_API_KEY=... # required + export LANGSMITH_TRACING=true # recommended + python python-sdk/examples/langsmith/dump_traces_langsmith.py + +Notes: +- This does not require any external model keys. It logs a few synthetic + traced function calls, and optionally a tiny LangGraph flow if available. +""" + +import asyncio +import os +from typing import Any, Dict, List +import importlib + + +def _ensure_env_defaults() -> None: + # Prefer modern env vars; fall back maintained for compatibility. + if os.environ.get("LANGSMITH_TRACING") is None: + os.environ["LANGSMITH_TRACING"] = "true" + # Project name helps organize traces in the LangSmith UI + os.environ.setdefault("LANGCHAIN_PROJECT", "ep-langgraph-examples") + + +def _log_synthetic_traces() -> None: + traceable = None + try: + mod = importlib.import_module("langsmith") + traceable = getattr(mod, "traceable", None) + except ImportError: + pass + if traceable is None: + print("LangSmith not installed; skipping @traceable demo. `pip install langsmith`.") + return + + @traceable(name="toy_pipeline") + def toy_pipeline(user_input: str) -> Dict[str, Any]: + reversed_text = user_input[::-1] + upper_text = reversed_text.upper() + return {"result": upper_text, "len": len(upper_text)} + + print("Emitting synthetic traces via @traceable...") + toy_pipeline("hello langsmith") + toy_pipeline("trace number two") + toy_pipeline("final short run") + + +async def _maybe_run_tiny_langgraph() -> None: + """Optionally run a tiny LangGraph flow to log a couple of runs. + + This avoids any external LLM providers by using a pure-Python node. + """ + try: + graph_mod = importlib.import_module("langgraph.graph") + msg_mod = importlib.import_module("langgraph.graph.message") + lc_msgs = importlib.import_module("langchain_core.messages") + te_mod = importlib.import_module("typing_extensions") + except ImportError: + print("LangGraph/LangChain not installed; skipping tiny graph demo. `pip install langgraph langchain-core`.") + return + + END = getattr(graph_mod, "END") + StateGraph = getattr(graph_mod, "StateGraph") + add_messages = getattr(msg_mod, "add_messages") + AIMessage = getattr(lc_msgs, "AIMessage") + BaseMessage = getattr(lc_msgs, "BaseMessage") + HumanMessage = getattr(lc_msgs, "HumanMessage") + Annotated = getattr(te_mod, "Annotated") + TypedDict = getattr(te_mod, "TypedDict") + + class State(TypedDict): # type: ignore[misc] + messages: Annotated[List[BaseMessage], add_messages] # type: ignore[index] + + async def echo_node(state: State, **_: Any) -> Dict[str, Any]: + messages: List[BaseMessage] = state.get("messages", []) + last_user = next((m for m in reversed(messages) if isinstance(m, HumanMessage)), None) + content = getattr(last_user, "content", "") + reply = AIMessage(content=f"Echo: {content}") + return {"messages": [reply]} + + graph = StateGraph(State) + graph.add_node("echo", echo_node) + graph.set_entry_point("echo") + graph.add_edge("echo", END) + app = graph.compile() + + print("Emitting a couple LangGraph runs...") + await app.ainvoke({"messages": [HumanMessage(content="hi there")]}) + await app.ainvoke({"messages": [HumanMessage(content="how are you?")]}) + + +def main() -> None: + _ensure_env_defaults() + + if not os.getenv("LANGSMITH_API_KEY") and not os.getenv("LANGCHAIN_API_KEY"): + print("Missing LangSmith API key. Set LANGSMITH_API_KEY (or LANGCHAIN_API_KEY) and rerun.") + return + + _log_synthetic_traces() + + try: + asyncio.run(_maybe_run_tiny_langgraph()) + except RuntimeError: + # Fallback for event loop already running (e.g. in notebooks) + loop = asyncio.get_event_loop() + loop.create_task(_maybe_run_tiny_langgraph()) + loop.run_until_complete(asyncio.sleep(0.1)) + + print("Done. Visit LangSmith to see your new traces.") + + +if __name__ == "__main__": + main() diff --git a/examples/langsmith/emit_tool_calls.py b/examples/langsmith/emit_tool_calls.py new file mode 100644 index 00000000..5cc474dc --- /dev/null +++ b/examples/langsmith/emit_tool_calls.py @@ -0,0 +1,116 @@ +"""Emit a few tool-call traces into LangSmith for adapter testing. + +Requirements: + export LANGSMITH_API_KEY=... + optional: export LANGCHAIN_PROJECT=ep-langgraph-examples (or set --project) + +Run: + python python-sdk/examples/langsmith/emit_tool_calls.py +""" + +import os +from typing import Any, Dict, List + + +def make_messages_with_tool_call(user_text: str) -> Dict[str, Any]: + """Return inputs/outputs shaped like LangChain messages with tool calls.""" + inputs = { + "messages": [ + { + "role": "user", + "content": user_text, + "type": "human", + } + ] + } + # Assistant proposes a tool call (function) + assistant_with_tool = { + "role": "assistant", + "content": "I'll call the calculator.", + "type": "ai", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": { + "name": "calculator.add", + "arguments": '{"a": 2, "b": 3}', + }, + } + ], + } + # Tool response message + tool_message = { + "role": "tool", + "name": "calculator.add", + "tool_call_id": "call_1", + "content": "5", + } + # Final assistant message + final_assistant = { + "role": "assistant", + "content": "The result is 5.", + "type": "ai", + } + outputs = { + "messages": [ + inputs["messages"][0], + assistant_with_tool, + tool_message, + final_assistant, + ] + } + return {"inputs": inputs, "outputs": outputs} + + +def main() -> None: + try: + from langsmith import Client # type: ignore + except Exception as e: + print(f"Missing langsmith dependency: {e}") + return + + project = os.getenv("LANGCHAIN_PROJECT", os.getenv("LS_PROJECT", "ep-langgraph-examples")) + client = Client() + + samples: List[str] = [ + "Add 2 and 3", + "Compute 7 + 11", + "Sum 10 and 25", + ] + + for i, text in enumerate(samples, start=1): + payload = make_messages_with_tool_call(text) + name = f"tool-demo-{i}" + # Create a chain run as container + client.create_run(name=name, inputs=payload["inputs"], run_type="chain", project_name=project) + # Log an llm child run carrying the assistant/tool messages as outputs + client.create_run( + name=f"{name}-llm", + inputs=payload["inputs"], + run_type="llm", + project_name=project, + ) + # Finalize by writing one more chain run with the aggregated outputs + client.create_run( + name=f"{name}-final", + inputs=payload["inputs"], + run_type="chain", + project_name=project, + ) + # Note: For simplicity, we attach outputs only on the final chain run + # using update_run is possible, but create_run keeps the example lightweight + # and the adapter reads from root runs' inputs/outputs or messages arrays. + # Many LangSmith clients attach outputs via end_run; here we keep it minimal. + try: + # If available, end_run to attach outputs on the final run + client.end_run(outputs=payload["outputs"]) # type: ignore[arg-type] + except Exception: + # Fallback: best-effort; runs may still be visible with inputs and llm child + pass + + print(f"Emitted {len(samples)} tool-call demo traces to project '{project}'.") + + +if __name__ == "__main__": + main() diff --git a/examples/langsmith/llm_judge_from_langsmith.py b/examples/langsmith/llm_judge_from_langsmith.py new file mode 100644 index 00000000..e17f40f0 --- /dev/null +++ b/examples/langsmith/llm_judge_from_langsmith.py @@ -0,0 +1,168 @@ +"""Run a quick LLM-as-judge evaluation using LangSmith datasets and evaluators. + +This mirrors our Langfuse example: we define a tiny dataset, a trivial target, +and run a rubric-based LLM judge via LangSmith's evaluation API. + +Requirements: + pip install -U langsmith langchain-openai + +Env Vars: + export LANGSMITH_API_KEY=... # required + export OPENAI_API_KEY=... # optional; if absent uses heuristic judge + export LANGSMITH_TRACING=true # optional, to record runs + +Run: + python python-sdk/examples/langsmith/llm_judge_from_langsmith.py +""" + +from __future__ import annotations + +import os +from typing import Any, Dict +import importlib + + +def _ensure_env() -> None: + os.environ.setdefault("LANGCHAIN_PROJECT", "ep-langgraph-examples") + # Enable tracing so target runs + evaluator runs are visible in the UI + os.environ.setdefault("LANGSMITH_TRACING", "true") + + +def main() -> None: + _ensure_env() + + if not os.getenv("LANGSMITH_API_KEY") and not os.getenv("LANGCHAIN_API_KEY"): + raise SystemExit("Please set LANGSMITH_API_KEY (or LANGCHAIN_API_KEY).") + use_openai = bool(os.getenv("OPENAI_API_KEY")) + + # Import here to allow the script to print clearer errors if deps are missing. + try: + ls = importlib.import_module("langsmith") + eval_mod = importlib.import_module("langsmith.evaluation") + except ImportError as e: + raise SystemExit("Missing dependency. Please `pip install -U langsmith`. ") from e + + Client = getattr(ls, "Client") + evaluate = getattr(eval_mod, "evaluate") + + client = Client() + + dataset_name = "ep_langsmith_demo_ds" + # Create or get dataset + try: + dataset = client.create_dataset(dataset_name, description="EP demo dataset for LLM-as-judge") + except Exception: + dataset = client.read_dataset(dataset_name=dataset_name) + + # Seed examples (idempotent-ish: try to insert; duplicates are okay for demo) + examples = [ + ({"prompt": "Say hello to Bob."}, {"answer": "Hello Bob!"}), + ({"prompt": "What is 2+2?"}, {"answer": "4"}), + ( + {"prompt": "Respond with a haiku about spring."}, + {"answer": "Gentle rains arrive\nBuds whisper to warming winds\nEarth breathes life anew"}, + ), + ] + for inputs, outputs in examples: + try: + client.create_example(inputs=inputs, outputs=outputs, dataset_id=dataset.id) + except Exception: + # Ignore duplicate errors in throwaway demo + pass + + # Define the target function: pretend model that returns uppercase + def target_func(example_inputs: Dict[str, Any]) -> Dict[str, Any]: + text = example_inputs.get("prompt", "") + return {"answer": str(text).upper()} + + # Define an evaluator that either uses OpenAI (if available) or a heuristic fallback + import json + import re + from typing import cast + + def _normalize_text(text: str) -> str: + return re.sub(r"\s+", " ", text.strip().lower()) + + def heuristic_score(pred: str, ref: str) -> float: + if not ref: + return 0.0 if not pred else 0.5 + p = _normalize_text(pred) + r = _normalize_text(ref) + if p == r: + return 1.0 + if r in p: + return 0.8 + return 0.0 + + def llm_as_judge(run, example): # type: ignore[no-untyped-def] + # Extract strings + pred = "" + try: + out = run.outputs or {} + pred = cast(str, out.get("answer") or out.get("output") or "") + except Exception: + pred = "" + ref = "" + try: + ex_out = example.outputs or {} + ref = cast(str, ex_out.get("answer") or ex_out.get("output") or "") + except Exception: + ref = "" + + if not use_openai: + score = heuristic_score(pred, ref) + return {"key": "llm_judge", "score": float(score), "comment": "heuristic"} + + try: + from langchain_openai import ChatOpenAI # type: ignore + except Exception: + score = heuristic_score(pred, ref) + return {"key": "llm_judge", "score": float(score), "comment": "heuristic (no openai)"} + + llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0) + system = ( + "You are an impartial grader. Compare the candidate answer to the reference answer. " + "Return a JSON object with fields 'score' (float 0.0-1.0) and 'reason' (short string). " + "Award 1.0 for semantic equivalence, 0.8 for close paraphrase, else 0.0." + ) + user = json.dumps({"reference": ref, "candidate": pred}) + try: + resp = llm.invoke([{"role": "system", "content": system}, {"role": "user", "content": user}]) + content = getattr(resp, "content", "") + data = {} + try: + if isinstance(content, str): + data = json.loads(content) + else: + # langchain message content may be a list of dicts + data = json.loads(content[0].get("text", "{}")) # type: ignore[index] + except Exception: + data = {"score": heuristic_score(pred, ref), "reason": "fallback parse"} + score = float(max(0.0, min(1.0, float(data.get("score", 0.0))))) + reason = str(data.get("reason", ""))[:500] + return {"key": "llm_judge", "score": score, "comment": reason} + except Exception as e: + score = heuristic_score(pred, ref) + return {"key": "llm_judge", "score": float(score), "comment": f"heuristic (error: {e})"} + + print("Running evaluation... this will create an experiment in LangSmith.") + results = evaluate( + target_func, + data=dataset_name, + evaluators=[llm_as_judge], + experiment_prefix="ep-llm-judge-demo", + max_concurrency=4, + metadata={"source": "examples/langsmith"}, + ) + + print("Experiment URL:") + try: + print(results.get("url")) # type: ignore[reportUnknownMemberType] + except Exception: + pass + + print("Done. Visit LangSmith to review scores and details.") + + +if __name__ == "__main__": + main() diff --git a/tests/adapters/test_langsmith_adapter.py b/tests/adapters/test_langsmith_adapter.py new file mode 100644 index 00000000..2f32282a --- /dev/null +++ b/tests/adapters/test_langsmith_adapter.py @@ -0,0 +1,183 @@ +import types +from types import SimpleNamespace +from typing import Any, Dict, List + +import pytest + +from eval_protocol.adapters.langsmith import LangSmithAdapter +from eval_protocol.models import Message + + +class FakeClient: + def __init__(self, runs: List[Any]): + self._runs = runs + + def list_runs(self, *, project_name: str, is_root: bool, limit: int, select: List[str]): # type: ignore[no-untyped-def] + return iter(self._runs[:limit]) + + +def _msg(role: str, content: str, **kwargs: Any) -> Dict[str, Any]: + m = {"role": role, "content": content} + m.update(kwargs) + return m + + +def test_outputs_messages_preferred_and_dedup_user(): + # outputs.messages exists with duplicate consecutive user messages + runs = [ + SimpleNamespace( + id="r1", + inputs={"messages": [_msg("user", "hi")]}, + outputs={ + "messages": [ + _msg("user", "hi"), + _msg("user", "hi"), # duplicate + _msg("assistant", "hello"), + ] + }, + ) + ] + adapter = LangSmithAdapter(client=FakeClient(runs)) + rows = adapter.get_evaluation_rows(project_name="p", limit=10) + assert len(rows) == 1 + msgs = rows[0].messages + assert [m.role for m in msgs] == ["user", "assistant"] + assert msgs[0].content == "hi" + assert msgs[1].content == "hello" + + +def test_inputs_variants_prompt_user_input_input(): + runs = [ + SimpleNamespace(id="p1", inputs={"prompt": "A"}, outputs={"content": "OA"}), + SimpleNamespace(id="p2", inputs={"user_input": "B"}, outputs={"result": "OB"}), + SimpleNamespace(id="p3", inputs={"input": "C"}, outputs={"answer": "OC"}), + SimpleNamespace(id="p4", inputs="D", outputs="OD"), + ] + adapter = LangSmithAdapter(client=FakeClient(runs)) + rows = adapter.get_evaluation_rows(project_name="p", limit=10) + texts = [[(m.role, m.content) for m in r.messages] for r in rows] + assert ("user", "A") in texts[0] + assert ("assistant", "OA") in texts[0] + assert ("user", "B") in texts[1] + assert ("assistant", "OB") in texts[1] + assert ("user", "C") in texts[2] + assert ("assistant", "OC") in texts[2] + assert ("user", "D") in texts[3] + assert ("assistant", "OD") in texts[3] + + +def test_outputs_variants_and_list_payloads(): + runs = [ + SimpleNamespace(id="o1", inputs=[], outputs={"output": "X"}), + SimpleNamespace(id="o2", inputs=[_msg("user", "U")], outputs=[_msg("assistant", "V")]), + ] + adapter = LangSmithAdapter(client=FakeClient(runs)) + rows = adapter.get_evaluation_rows(project_name="p", limit=10) + msgs1 = rows[0].messages + assert any(m.role == "assistant" and m.content == "X" for m in msgs1) + msgs2 = rows[1].messages + assert any(m.role == "user" and m.content == "U" for m in msgs2) + assert any(m.role == "assistant" and m.content == "V" for m in msgs2) + + +def test_tool_calls_and_tool_role_preserved(): + tool_args = '{"a":2,"b":3}' + assistant_with_tool = _msg( + "assistant", + "Tool Calls:\ncalculator_add\n" + tool_args, + tool_calls=[ + { + "id": "call_1", + "type": "function", + "function": {"name": "calculator_add", "arguments": tool_args}, + } + ], + ) + tool_msg = {"role": "tool", "name": "calculator_add", "tool_call_id": "call_1", "content": "5"} + runs = [ + SimpleNamespace( + id="t1", + inputs={"messages": [_msg("user", "Add 2 and 3")]}, + outputs={ + "messages": [ + _msg("user", "Add 2 and 3"), + assistant_with_tool, + tool_msg, + _msg("assistant", "The result is 5."), + ] + }, + ) + ] + adapter = LangSmithAdapter(client=FakeClient(runs)) + rows = adapter.get_evaluation_rows(project_name="p", limit=10) + msgs = rows[0].messages + # Ensure tool role present + assert any(m.role == "tool" and (m.content or "").strip() == "5" for m in msgs) + # Ensure assistant with tool_calls preserved + assistants = [m for m in msgs if m.role == "assistant" and m.tool_calls] + assert len(assistants) >= 1 + tc = assistants[0].tool_calls[0] + # tool_calls may be provider-native objects; normalize via getattr first + fname = None + if hasattr(tc, "function"): + fn = getattr(tc, "function") + fname = getattr(fn, "name", None) + elif isinstance(tc, dict): + fname = tc.get("function", {}).get("name") + assert fname == "calculator_add" + + +def test_system_prompt_first_and_multiple_user_allowed(): + runs = [ + SimpleNamespace( + id="s1", + inputs={ + "messages": [ + _msg("system", "You are helpful"), + _msg("user", "hi"), + _msg("user", "hi again"), + ] + }, + outputs={"content": "hello there"}, + ) + ] + adapter = LangSmithAdapter(client=FakeClient(runs)) + rows = adapter.get_evaluation_rows(project_name="p", limit=10) + msgs = rows[0].messages + roles = [m.role for m in msgs] + assert roles[0] == "system" + # both user messages retained (not deduped since content differs) + assert roles[1] == "user" and roles[2] == "user" + assert roles[-1] == "assistant" + + +def test_parallel_tool_calls_normalized(): + # Two tool calls in a single assistant message + tool_args1 = '{"a":2,"b":3}' + tool_args2 = '{"a":4,"b":5}' + assistant_with_tools = _msg( + "assistant", + "Two calls", + tool_calls=[ + {"id": "c1", "type": "function", "function": {"name": "calculator_add", "arguments": tool_args1}}, + {"id": "c2", "type": "function", "function": {"name": "calculator_add", "arguments": tool_args2}}, + ], + ) + runs = [ + SimpleNamespace( + id="pt1", + inputs={"messages": [_msg("user", "sum two pairs")]}, + outputs={"messages": [assistant_with_tools]}, + ), + ] + adapter = LangSmithAdapter(client=FakeClient(runs)) + rows = adapter.get_evaluation_rows(project_name="p", limit=10) + msgs = rows[0].messages + assistants = [m for m in msgs if m.role == "assistant" and m.tool_calls] + assert len(assistants) == 1 + tcs = assistants[0].tool_calls + assert isinstance(tcs, list) and len(tcs) == 2 + names = [ + (getattr(tc, "function").name if hasattr(tc, "function") else tc.get("function", {}).get("name")) for tc in tcs + ] + assert names == ["calculator_add", "calculator_add"] From a65ab80162c38e841dde895a49a81eb6af311d77 Mon Sep 17 00:00:00 2001 From: benjibc Date: Tue, 16 Sep 2025 01:21:01 +0000 Subject: [PATCH 2/4] langsmith changes --- eval_protocol/adapters/langsmith.py | 194 +++++++++++++++++++++++++--- pyproject.toml | 3 + 2 files changed, 177 insertions(+), 20 deletions(-) diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py index 7556e1d9..2503e3ad 100644 --- a/eval_protocol/adapters/langsmith.py +++ b/eval_protocol/adapters/langsmith.py @@ -10,7 +10,7 @@ from __future__ import annotations import logging -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Iterable from eval_protocol.models import EvaluationRow, InputMetadata, Message @@ -36,7 +36,7 @@ class LangSmithAdapter: def __init__(self, client: Optional[Client] = None) -> None: if not LANGSMITH_AVAILABLE: - raise ImportError("LangSmith not installed. Install with: pip install langsmith") + raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'") self.client = client or Client() def get_evaluation_rows( @@ -45,6 +45,31 @@ def get_evaluation_rows( project_name: str, limit: int = 50, include_tool_calls: bool = True, + # Pass-through filters to list_runs to match LangSmith Client API + run_id: Optional[str] = None, + ids: Optional[List[str]] = None, + run_type: Optional[str] = None, + execution_order: Optional[int] = None, + parent_run_id: Optional[str] = None, + trace_id: Optional[str] = None, + trace_ids: Optional[List[str]] = None, + reference_example_id: Optional[str] = None, + session_name: Optional[str] = None, + error: Optional[bool] = None, + start_time: Optional[str] = None, + end_time: Optional[str] = None, + filter_expr: Optional[str] = None, # server-side filter DSL + tags: Optional[List[str]] = None, + metadata: Optional[Dict[str, Any]] = None, + feedback_keys: Optional[List[str]] = None, + feedback_source: Optional[str] = None, + tree_id: Optional[str] = None, + # ordering/pagination + offset: Optional[int] = None, + order_by: Optional[str] = None, + # selection + select: Optional[List[str]] = None, + **list_runs_kwargs: Any, ) -> List[EvaluationRow]: """Pull runs from LangSmith and convert to EvaluationRow format. @@ -55,17 +80,57 @@ def get_evaluation_rows( """ rows: List[EvaluationRow] = [] - # Prefer root runs; they usually contain messages in inputs/outputs when tracing app-level flows - runs = list( - self.client.list_runs( - project_name=project_name, - is_root=True, - limit=limit, - select=["id", "inputs", "outputs"], - ) - ) - + # Fetch runs with pass-through filters. Prefer root runs by default. + params: Dict[str, Any] = {"project_name": project_name, "limit": limit} + # Only include non-None params + if run_type is None: + params["is_root"] = True + for key, value in [ + ("id", run_id), + ("ids", ids), + ("run_type", run_type), + ("execution_order", execution_order), + ("parent_run_id", parent_run_id), + ("trace_id", trace_id), + ("trace_ids", trace_ids), + ("reference_example_id", reference_example_id), + ("session_name", session_name), + ("error", error), + ("start_time", start_time), + ("end_time", end_time), + ("filter", filter_expr), + ("tags", tags), + ("metadata", metadata), + ("feedback_keys", feedback_keys), + ("feedback_source", feedback_source), + ("tree_id", tree_id), + ("offset", offset), + ("order_by", order_by), + ]: + if value is not None: + params[key] = value + params["select"] = select or ["id", "inputs", "outputs", "trace_id"] + + # Merge any additional kwargs last to allow explicit overrides + if list_runs_kwargs: + for k, v in list_runs_kwargs.items(): + if v is not None: + params[k] = v + + runs_iter: Iterable[Any] = self.client.list_runs(**params) + + runs = list(runs_iter) + if not runs: + logger.warning("No LangSmith runs found for project '%s' with current filters", project_name) + return [] + + # Group by trace_id and pick the last run in each trace (assume iterator yields chronological) + trace_to_last_run: Dict[str, Any] = {} for r in runs: + t_id = str(getattr(r, "trace_id", "")) or str(getattr(r, "id", "")) + trace_to_last_run[t_id] = r + + for r in trace_to_last_run.values(): try: inp = getattr(r, "inputs", None) out = getattr(r, "outputs", None) @@ -86,10 +151,9 @@ def get_evaluation_rows( # Deduplicate consecutive identical user messages (common echo pattern) def _canon(text: Any) -> str: - try: - return " ".join(str(text or "").strip().lower().split()) - except Exception: - return str(text or "") + # Best-effort canonicalization; avoid broad exception handling warnings by handling types + text_str = str(text) if text is not None else "" + return " ".join(text_str.strip().lower().split()) deduped: List[Message] = [] for m in ep_messages: @@ -102,23 +166,115 @@ def _canon(text: Any) -> str: if not ep_messages: continue + tools = None + if include_tool_calls and isinstance(inp, dict): + # Try to extract tool schema if present in inputs + if "tools" in inp: + tools = inp["tools"] + rows.append( EvaluationRow( messages=ep_messages, + tools=tools, input_metadata=InputMetadata( session_data={ "langsmith_run_id": str(getattr(r, "id", "")), + "langsmith_trace_id": str(getattr(r, "trace_id", "")), "langsmith_project": project_name, } ), ) ) - except Exception as e: + except (AttributeError, ValueError, KeyError, TypeError) as e: logger.warning("Failed to convert run %s: %s", getattr(r, "id", ""), e) continue return rows + def get_evaluation_rows_by_ids( + self, + *, + run_ids: Optional[List[str]] = None, + trace_ids: Optional[List[str]] = None, + include_tool_calls: bool = True, + project_name: Optional[str] = None, + ) -> List[EvaluationRow]: + """Fetch specific runs or traces and convert to EvaluationRow. + + If both run_ids and trace_ids are provided, both sets are fetched. + """ + results: List[EvaluationRow] = [] + + fetched_runs: List[Any] = [] + try: + if run_ids: + fetched_runs.extend(list(self.client.list_runs(ids=run_ids, select=["id", "inputs", "outputs", "trace_id"])) ) + if trace_ids: + fetched_runs.extend(list(self.client.list_runs(trace_ids=trace_ids, select=["id", "inputs", "outputs", "trace_id"])) ) + except (AttributeError, ValueError, KeyError, TypeError) as e: + logger.warning("Failed to fetch runs by ids: %s", e) + return [] + + if not fetched_runs: + logger.warning("No LangSmith runs found for provided ids") + return [] + + # Prefer the last run per trace id + trace_to_last_run: Dict[str, Any] = {} + for r in fetched_runs: + t_id = str(getattr(r, "trace_id", "")) or str(getattr(r, "id", "")) + trace_to_last_run[t_id] = r + + for r in trace_to_last_run.values(): + try: + inp = getattr(r, "inputs", None) + out = getattr(r, "outputs", None) + + ep_messages: List[Message] = [] + if isinstance(out, dict) and isinstance(out.get("messages"), list): + ep_messages.extend(self._extract_messages_from_payload({"messages": out["messages"]}, include_tool_calls, is_output=True)) + else: + ep_messages.extend(self._extract_messages_from_payload(inp, include_tool_calls)) + ep_messages.extend(self._extract_messages_from_payload(out, include_tool_calls, is_output=True)) + + def _canon(text: Any) -> str: + text_str = str(text) if text is not None else "" + return " ".join(text_str.strip().lower().split()) + + deduped: List[Message] = [] + for m in ep_messages: + if deduped and m.role == "user" and deduped[-1].role == "user": + if _canon(m.content) == _canon(deduped[-1].content): + continue + deduped.append(m) + ep_messages = deduped + + if not ep_messages: + continue + + tools = None + if include_tool_calls and isinstance(inp, dict) and "tools" in inp: + tools = inp["tools"] + + results.append( + EvaluationRow( + messages=ep_messages, + tools=tools, + input_metadata=InputMetadata( + session_data={ + "langsmith_run_id": str(getattr(r, "id", "")), + "langsmith_trace_id": str(getattr(r, "trace_id", "")), + "langsmith_project": project_name or "", + } + ), + ) + ) + except (AttributeError, ValueError, KeyError, TypeError) as e: + logger.warning("Failed to convert run %s: %s", getattr(r, "id", ""), e) + continue + + return results + def _extract_messages_from_payload( self, payload: Any, include_tool_calls: bool, *, is_output: bool = False ) -> List[Message]: @@ -161,13 +317,11 @@ def _dict_to_message(msg_dict: Dict[str, Any]) -> Message: # Extract id/type/function fields from dicts or provider-native objects if isinstance(tc, dict): tc_id = tc.get("id", None) - tc_type = tc.get("type", "function") or "function" fn = tc.get("function", {}) or {} fn_name = fn.get("name", None) fn_args = fn.get("arguments", None) else: tc_id = getattr(tc, "id", None) - tc_type = getattr(tc, "type", None) or "function" f = getattr(tc, "function", None) fn_name = getattr(f, "name", None) if f is not None else None fn_args = getattr(f, "arguments", None) if f is not None else None @@ -185,7 +339,7 @@ def _dict_to_message(msg_dict: Dict[str, Any]) -> Message: ) ) tool_calls = typed_calls - except Exception: + except (ImportError, AttributeError, TypeError, ValueError): # If OpenAI types unavailable, leave None to satisfy type checker tool_calls = None if "tool_call_id" in msg_dict: diff --git a/pyproject.toml b/pyproject.toml index 07061b96..b55ace62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,6 +114,9 @@ adapters = [ "datasets>=3.0.0", "transformers>=4.0.0", ] +langsmith = [ + "langsmith>=0.1.86", +] bigquery = [ "google-cloud-bigquery>=3.0.0", "google-auth>=2.0.0", From f71658c986f4aa096ceacd261c9cc62e8dda8246 Mon Sep 17 00:00:00 2001 From: benjibc Date: Tue, 16 Sep 2025 04:26:56 +0000 Subject: [PATCH 3/4] update lock --- uv.lock | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index 227436c7..6d333a22 100644 --- a/uv.lock +++ b/uv.lock @@ -1275,6 +1275,9 @@ langgraph-tools = [ { name = "langchain-fireworks" }, { name = "langgraph" }, ] +langsmith = [ + { name = "langsmith" }, +] openevals = [ { name = "openevals" }, ] @@ -1343,6 +1346,7 @@ requires-dist = [ { name = "langfuse", marker = "extra == 'langfuse'", specifier = ">=2.0.0" }, { name = "langgraph", marker = "extra == 'langgraph'", specifier = ">=0.6.7" }, { name = "langgraph", marker = "extra == 'langgraph-tools'", specifier = ">=0.6.7" }, + { name = "langsmith", marker = "extra == 'langsmith'", specifier = ">=0.1.86" }, { name = "litellm", specifier = ">=1.0.0" }, { name = "loguru", specifier = ">=0.6.0" }, { name = "mcp", specifier = ">=1.9.2" }, @@ -1390,7 +1394,7 @@ requires-dist = [ { name = "websockets", specifier = ">=15.0.1" }, { name = "werkzeug", marker = "extra == 'dev'", specifier = ">=2.0.0" }, ] -provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "bigquery", "svgbench", "pydantic", "supabase", "chinook", "langchain", "langgraph", "langgraph-tools"] +provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "langsmith", "bigquery", "svgbench", "pydantic", "supabase", "chinook", "langchain", "langgraph", "langgraph-tools"] [package.metadata.requires-dev] dev = [ From e85ac0a8a231390a94fa1d4958be0b0bc5f067d5 Mon Sep 17 00:00:00 2001 From: benjibc Date: Tue, 16 Sep 2025 04:36:40 +0000 Subject: [PATCH 4/4] formatting --- eval_protocol/adapters/langsmith.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py index 2503e3ad..1d29b66a 100644 --- a/eval_protocol/adapters/langsmith.py +++ b/eval_protocol/adapters/langsmith.py @@ -208,9 +208,13 @@ def get_evaluation_rows_by_ids( fetched_runs: List[Any] = [] try: if run_ids: - fetched_runs.extend(list(self.client.list_runs(ids=run_ids, select=["id", "inputs", "outputs", "trace_id"])) ) + fetched_runs.extend( + list(self.client.list_runs(ids=run_ids, select=["id", "inputs", "outputs", "trace_id"])) + ) if trace_ids: - fetched_runs.extend(list(self.client.list_runs(trace_ids=trace_ids, select=["id", "inputs", "outputs", "trace_id"])) ) + fetched_runs.extend( + list(self.client.list_runs(trace_ids=trace_ids, select=["id", "inputs", "outputs", "trace_id"])) + ) except (AttributeError, ValueError, KeyError, TypeError) as e: logger.warning("Failed to fetch runs by ids: %s", e) return [] @@ -232,7 +236,11 @@ def get_evaluation_rows_by_ids( ep_messages: List[Message] = [] if isinstance(out, dict) and isinstance(out.get("messages"), list): - ep_messages.extend(self._extract_messages_from_payload({"messages": out["messages"]}, include_tool_calls, is_output=True)) + ep_messages.extend( + self._extract_messages_from_payload( + {"messages": out["messages"]}, include_tool_calls, is_output=True + ) + ) else: ep_messages.extend(self._extract_messages_from_payload(inp, include_tool_calls)) ep_messages.extend(self._extract_messages_from_payload(out, include_tool_calls, is_output=True))