Skip to content

Commit ae7211a

Browse files
committed
add timing filter
1 parent b995a9f commit ae7211a

File tree

4 files changed: +19 additions, −9 deletions

eval_protocol/adapters/langfuse.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,8 @@ def get_evaluation_rows(
6565
user_id: Optional[str] = None,
6666
session_id: Optional[str] = None,
6767
hours_back: Optional[int] = None,
68+
from_timestamp: Optional[datetime] = None,
69+
to_timestamp: Optional[datetime] = None,
6870
include_tool_calls: bool = True,
6971
page_size: int = 30,
7072
sleep_between_gets: float = 0.1,
@@ -78,6 +80,8 @@ def get_evaluation_rows(
7880
user_id: Filter by user ID
7981
session_id: Filter by session ID
8082
hours_back: Filter traces from this many hours ago
83+
from_timestamp: Only include traces with timestamp >= this datetime
84+
to_timestamp: Only include traces with timestamp <= this datetime
8185
include_tool_calls: Whether to include tool calling traces
8286
page_size: Number of traces to fetch per page (smaller = less rate limit issues)
8387
sleep_between_gets: Sleep time between individual trace.get() calls
@@ -88,12 +92,10 @@ def get_evaluation_rows(
8892
"""
8993
eval_rows = []
9094

91-
if hours_back:
95+
# Determine time window: explicit from/to takes precedence
96+
if from_timestamp is None and to_timestamp is None and hours_back:
9297
to_timestamp = datetime.now()
9398
from_timestamp = to_timestamp - timedelta(hours=hours_back)
94-
else:
95-
to_timestamp = None
96-
from_timestamp = None
9799

98100
# Single API call to get trace list
99101
traces = self.client.api.trace.list(
@@ -155,7 +157,7 @@ def get_evaluation_rows(
155157
logger.warning("Failed to convert trace %s: %s", trace_info.id, e)
156158
continue
157159

158-
logger.info("Successfully processed %d traces into %d evaluation rows", len(selected_traces), len(eval_rows))
160+
logger.info("Successfully processed %d traces into evaluation rows", len(selected_traces))
159161
return eval_rows
160162

161163
def get_evaluation_rows_by_ids(

eval_protocol/mcp/execution/policy.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -197,8 +197,8 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
197197
response = await acompletion(
198198
model=self.model_id,
199199
**request_params,
200-
api_base="https://litellm-cloud-proxy-prod-zfdbl7ykrq-uc.a.run.app/v1",
201-
extra_body={"tags": ["kimi-k2-tau-bench"]},
200+
# api_base="https://litellm-cloud-proxy-prod-zfdbl7ykrq-uc.a.run.app/v1",
201+
# extra_body={"tags": ["kimi-k2-tau-bench"]},
202202
)
203203

204204
# Log cache hit/miss for monitoring

eval_protocol/quickstart/llm_judge.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import os
6+
from datetime import datetime
67
from typing import List, Dict, Any, Optional
78
from tqdm import tqdm
89

@@ -27,7 +28,7 @@
2728
@evaluation_test(
2829
input_rows=[
2930
fetch_langfuse_traces_as_evaluation_rows(
30-
hours_back=24,
31+
to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
3132
limit=40,
3233
page_size=10,
3334
sleep_between_gets=3.0,

eval_protocol/quickstart/utils.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import os
6+
from datetime import datetime
67
import re
78
from typing import List, Dict, Any, Optional
89
import pandas as pd
@@ -50,7 +51,7 @@
5051
"max_concurrency": 16,
5152
},
5253
"kimi-k2-instruct-0905": {
53-
"model": "kimi-k2-instruct-0905",
54+
"model": "accounts/fireworks/models/kimi-k2-instruct-0905",
5455
"temperature": 0.6, # Kimi recommended temperature
5556
"max_tokens": 131000,
5657
"api_key": os.getenv("FIREWORKS_API_KEY"),
@@ -230,6 +231,8 @@ def fetch_langfuse_traces_as_evaluation_rows(
230231
user_id: Optional[str] = None,
231232
session_id: Optional[str] = None,
232233
hours_back: Optional[int] = None,
234+
from_timestamp: Optional[datetime] = None,
235+
to_timestamp: Optional[datetime] = None,
233236
include_tool_calls: bool = True,
234237
page_size: int = 30,
235238
sleep_between_gets: float = 0.1,
@@ -244,6 +247,8 @@ def fetch_langfuse_traces_as_evaluation_rows(
244247
user_id: Filter traces by user ID
245248
session_id: Filter traces by session ID
246249
hours_back: Only fetch traces from the last N hours
250+
from_timestamp: Only include traces with timestamp >= this datetime
251+
to_timestamp: Only include traces with timestamp <= this datetime
247252
include_tool_calls: Whether to include tool calls in messages
248253
page_size: Number of traces to fetch per page (smaller = less rate limit issues)
249254
sleep_between_gets: Sleep time between individual trace.get() calls (2.5s for 30 req/min limit)
@@ -262,6 +267,8 @@ def fetch_langfuse_traces_as_evaluation_rows(
262267
user_id=user_id,
263268
session_id=session_id,
264269
hours_back=hours_back,
270+
from_timestamp=from_timestamp,
271+
to_timestamp=to_timestamp,
265272
include_tool_calls=include_tool_calls,
266273
page_size=page_size,
267274
sleep_between_gets=sleep_between_gets,

0 commit comments

Comments (0)