Skip to content

Commit a65ab80

Browse files
committed
langsmith changes
1 parent 5d4daa6 commit a65ab80

File tree

2 files changed

+177
-20
lines changed

2 files changed

+177
-20
lines changed

eval_protocol/adapters/langsmith.py

Lines changed: 174 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from __future__ import annotations
1111

1212
import logging
13-
from typing import Any, Dict, List, Optional
13+
from typing import Any, Dict, List, Optional, Iterable
1414

1515
from eval_protocol.models import EvaluationRow, InputMetadata, Message
1616

@@ -36,7 +36,7 @@ class LangSmithAdapter:
3636

3737
def __init__(self, client: Optional[Client] = None) -> None:
3838
if not LANGSMITH_AVAILABLE:
39-
raise ImportError("LangSmith not installed. Install with: pip install langsmith")
39+
raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'")
4040
self.client = client or Client()
4141

4242
def get_evaluation_rows(
@@ -45,6 +45,31 @@ def get_evaluation_rows(
4545
project_name: str,
4646
limit: int = 50,
4747
include_tool_calls: bool = True,
48+
# Pass-through filters to list_runs to match LangSmith Client API
49+
run_id: Optional[str] = None,
50+
ids: Optional[List[str]] = None,
51+
run_type: Optional[str] = None,
52+
execution_order: Optional[int] = None,
53+
parent_run_id: Optional[str] = None,
54+
trace_id: Optional[str] = None,
55+
trace_ids: Optional[List[str]] = None,
56+
reference_example_id: Optional[str] = None,
57+
session_name: Optional[str] = None,
58+
error: Optional[bool] = None,
59+
start_time: Optional[str] = None,
60+
end_time: Optional[str] = None,
61+
filter_expr: Optional[str] = None, # server-side filter DSL
62+
tags: Optional[List[str]] = None,
63+
metadata: Optional[Dict[str, Any]] = None,
64+
feedback_keys: Optional[List[str]] = None,
65+
feedback_source: Optional[str] = None,
66+
tree_id: Optional[str] = None,
67+
# ordering/pagination
68+
offset: Optional[int] = None,
69+
order_by: Optional[str] = None,
70+
# selection
71+
select: Optional[List[str]] = None,
72+
**list_runs_kwargs: Any,
4873
) -> List[EvaluationRow]:
4974
"""Pull runs from LangSmith and convert to EvaluationRow format.
5075
@@ -55,17 +80,57 @@ def get_evaluation_rows(
5580
"""
5681
rows: List[EvaluationRow] = []
5782

58-
# Prefer root runs; they usually contain messages in inputs/outputs when tracing app-level flows
59-
runs = list(
60-
self.client.list_runs(
61-
project_name=project_name,
62-
is_root=True,
63-
limit=limit,
64-
select=["id", "inputs", "outputs"],
65-
)
66-
)
67-
83+
# Fetch runs with pass-through filters. Prefer root runs by default.
84+
params: Dict[str, Any] = {"project_name": project_name, "limit": limit}
85+
# Only include non-None params
86+
if run_type is None:
87+
params["is_root"] = True
88+
for key, value in [
89+
("id", run_id),
90+
("ids", ids),
91+
("run_type", run_type),
92+
("execution_order", execution_order),
93+
("parent_run_id", parent_run_id),
94+
("trace_id", trace_id),
95+
("trace_ids", trace_ids),
96+
("reference_example_id", reference_example_id),
97+
("session_name", session_name),
98+
("error", error),
99+
("start_time", start_time),
100+
("end_time", end_time),
101+
("filter", filter_expr),
102+
("tags", tags),
103+
("metadata", metadata),
104+
("feedback_keys", feedback_keys),
105+
("feedback_source", feedback_source),
106+
("tree_id", tree_id),
107+
("offset", offset),
108+
("order_by", order_by),
109+
]:
110+
if value is not None:
111+
params[key] = value
112+
params["select"] = select or ["id", "inputs", "outputs", "trace_id"]
113+
114+
# Merge any additional kwargs last to allow explicit overrides
115+
if list_runs_kwargs:
116+
for k, v in list_runs_kwargs.items():
117+
if v is not None:
118+
params[k] = v
119+
120+
runs_iter: Iterable[Any] = self.client.list_runs(**params)
121+
122+
runs = list(runs_iter)
123+
if not runs:
124+
logger.warning("No LangSmith runs found for project '%s' with current filters", project_name)
125+
return []
126+
127+
# Group by trace_id and pick the last run in each trace (assume iterator yields chronological)
128+
trace_to_last_run: Dict[str, Any] = {}
68129
for r in runs:
130+
t_id = str(getattr(r, "trace_id", "")) or str(getattr(r, "id", ""))
131+
trace_to_last_run[t_id] = r
132+
133+
for r in trace_to_last_run.values():
69134
try:
70135
inp = getattr(r, "inputs", None)
71136
out = getattr(r, "outputs", None)
@@ -86,10 +151,9 @@ def get_evaluation_rows(
86151

87152
# Deduplicate consecutive identical user messages (common echo pattern)
88153
def _canon(text: Any) -> str:
89-
try:
90-
return " ".join(str(text or "").strip().lower().split())
91-
except Exception:
92-
return str(text or "")
154+
# Best-effort canonicalization; avoid broad exception handling warnings by handling types
155+
text_str = str(text) if text is not None else ""
156+
return " ".join(text_str.strip().lower().split())
93157

94158
deduped: List[Message] = []
95159
for m in ep_messages:
@@ -102,23 +166,115 @@ def _canon(text: Any) -> str:
102166
if not ep_messages:
103167
continue
104168

169+
tools = None
170+
if include_tool_calls and isinstance(inp, dict):
171+
# Try to extract tool schema if present in inputs
172+
if "tools" in inp:
173+
tools = inp["tools"]
174+
105175
rows.append(
106176
EvaluationRow(
107177
messages=ep_messages,
178+
tools=tools,
108179
input_metadata=InputMetadata(
109180
session_data={
110181
"langsmith_run_id": str(getattr(r, "id", "")),
182+
"langsmith_trace_id": str(getattr(r, "trace_id", "")),
111183
"langsmith_project": project_name,
112184
}
113185
),
114186
)
115187
)
116-
except Exception as e:
188+
except (AttributeError, ValueError, KeyError, TypeError) as e:
117189
logger.warning("Failed to convert run %s: %s", getattr(r, "id", ""), e)
118190
continue
119191

120192
return rows
121193

194+
def get_evaluation_rows_by_ids(
195+
self,
196+
*,
197+
run_ids: Optional[List[str]] = None,
198+
trace_ids: Optional[List[str]] = None,
199+
include_tool_calls: bool = True,
200+
project_name: Optional[str] = None,
201+
) -> List[EvaluationRow]:
202+
"""Fetch specific runs or traces and convert to EvaluationRow.
203+
204+
If both run_ids and trace_ids are provided, both sets are fetched.
205+
"""
206+
results: List[EvaluationRow] = []
207+
208+
fetched_runs: List[Any] = []
209+
try:
210+
if run_ids:
211+
fetched_runs.extend(list(self.client.list_runs(ids=run_ids, select=["id", "inputs", "outputs", "trace_id"])) )
212+
if trace_ids:
213+
fetched_runs.extend(list(self.client.list_runs(trace_ids=trace_ids, select=["id", "inputs", "outputs", "trace_id"])) )
214+
except (AttributeError, ValueError, KeyError, TypeError) as e:
215+
logger.warning("Failed to fetch runs by ids: %s", e)
216+
return []
217+
218+
if not fetched_runs:
219+
logger.warning("No LangSmith runs found for provided ids")
220+
return []
221+
222+
# Prefer the last run per trace id
223+
trace_to_last_run: Dict[str, Any] = {}
224+
for r in fetched_runs:
225+
t_id = str(getattr(r, "trace_id", "")) or str(getattr(r, "id", ""))
226+
trace_to_last_run[t_id] = r
227+
228+
for r in trace_to_last_run.values():
229+
try:
230+
inp = getattr(r, "inputs", None)
231+
out = getattr(r, "outputs", None)
232+
233+
ep_messages: List[Message] = []
234+
if isinstance(out, dict) and isinstance(out.get("messages"), list):
235+
ep_messages.extend(self._extract_messages_from_payload({"messages": out["messages"]}, include_tool_calls, is_output=True))
236+
else:
237+
ep_messages.extend(self._extract_messages_from_payload(inp, include_tool_calls))
238+
ep_messages.extend(self._extract_messages_from_payload(out, include_tool_calls, is_output=True))
239+
240+
def _canon(text: Any) -> str:
241+
text_str = str(text) if text is not None else ""
242+
return " ".join(text_str.strip().lower().split())
243+
244+
deduped: List[Message] = []
245+
for m in ep_messages:
246+
if deduped and m.role == "user" and deduped[-1].role == "user":
247+
if _canon(m.content) == _canon(deduped[-1].content):
248+
continue
249+
deduped.append(m)
250+
ep_messages = deduped
251+
252+
if not ep_messages:
253+
continue
254+
255+
tools = None
256+
if include_tool_calls and isinstance(inp, dict) and "tools" in inp:
257+
tools = inp["tools"]
258+
259+
results.append(
260+
EvaluationRow(
261+
messages=ep_messages,
262+
tools=tools,
263+
input_metadata=InputMetadata(
264+
session_data={
265+
"langsmith_run_id": str(getattr(r, "id", "")),
266+
"langsmith_trace_id": str(getattr(r, "trace_id", "")),
267+
"langsmith_project": project_name or "",
268+
}
269+
),
270+
)
271+
)
272+
except (AttributeError, ValueError, KeyError, TypeError) as e:
273+
logger.warning("Failed to convert run %s: %s", getattr(r, "id", ""), e)
274+
continue
275+
276+
return results
277+
122278
def _extract_messages_from_payload(
123279
self, payload: Any, include_tool_calls: bool, *, is_output: bool = False
124280
) -> List[Message]:
@@ -161,13 +317,11 @@ def _dict_to_message(msg_dict: Dict[str, Any]) -> Message:
161317
# Extract id/type/function fields from dicts or provider-native objects
162318
if isinstance(tc, dict):
163319
tc_id = tc.get("id", None)
164-
tc_type = tc.get("type", "function") or "function"
165320
fn = tc.get("function", {}) or {}
166321
fn_name = fn.get("name", None)
167322
fn_args = fn.get("arguments", None)
168323
else:
169324
tc_id = getattr(tc, "id", None)
170-
tc_type = getattr(tc, "type", None) or "function"
171325
f = getattr(tc, "function", None)
172326
fn_name = getattr(f, "name", None) if f is not None else None
173327
fn_args = getattr(f, "arguments", None) if f is not None else None
@@ -185,7 +339,7 @@ def _dict_to_message(msg_dict: Dict[str, Any]) -> Message:
185339
)
186340
)
187341
tool_calls = typed_calls
188-
except Exception:
342+
except (ImportError, AttributeError, TypeError, ValueError):
189343
# If OpenAI types unavailable, leave None to satisfy type checker
190344
tool_calls = None
191345
if "tool_call_id" in msg_dict:

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ adapters = [
114114
"datasets>=3.0.0",
115115
"transformers>=4.0.0",
116116
]
117+
langsmith = [
118+
"langsmith>=0.1.86",
119+
]
117120
bigquery = [
118121
"google-cloud-bigquery>=3.0.0",
119122
"google-auth>=2.0.0",

0 commit comments

Comments
 (0)