Skip to content

Commit ae7211a

Browse files
committed
add timing filter
1 parent b995a9f commit ae7211a

File tree

4 files changed: +19 additions, −9 deletions

eval_protocol/adapters/langfuse.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,8 @@ def get_evaluation_rows(
6565
user_id: Optional[str] = None,
6666
session_id: Optional[str] = None,
6767
hours_back: Optional[int] = None,
68+
from_timestamp: Optional[datetime] = None,
69+
to_timestamp: Optional[datetime] = None,
6870
include_tool_calls: bool = True,
6971
page_size: int = 30,
7072
sleep_between_gets: float = 0.1,
@@ -78,6 +80,8 @@ def get_evaluation_rows(
7880
user_id: Filter by user ID
7981
session_id: Filter by session ID
8082
hours_back: Filter traces from this many hours ago
83+
from_timestamp: Only include traces with timestamp >= this datetime
84+
to_timestamp: Only include traces with timestamp <= this datetime
8185
include_tool_calls: Whether to include tool calling traces
8286
page_size: Number of traces to fetch per page (smaller = less rate limit issues)
8387
sleep_between_gets: Sleep time between individual trace.get() calls
@@ -88,12 +92,10 @@ def get_evaluation_rows(
8892
"""
8993
eval_rows = []
9094

91-
if hours_back:
95+
# Determine time window: explicit from/to takes precedence
96+
if from_timestamp is None and to_timestamp is None and hours_back:
9297
to_timestamp = datetime.now()
9398
from_timestamp = to_timestamp - timedelta(hours=hours_back)
94-
else:
95-
to_timestamp = None
96-
from_timestamp = None
9799

98100
# Single API call to get trace list
99101
traces = self.client.api.trace.list(
@@ -155,7 +157,7 @@ def get_evaluation_rows(
155157
logger.warning("Failed to convert trace %s: %s", trace_info.id, e)
156158
continue
157159

158-
logger.info("Successfully processed %d traces into %d evaluation rows", len(selected_traces), len(eval_rows))
160+
logger.info("Successfully processed %d traces into evaluation rows", len(selected_traces))
159161
return eval_rows
160162

161163
def get_evaluation_rows_by_ids(

eval_protocol/mcp/execution/policy.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -197,8 +197,8 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
197197
response = await acompletion(
198198
model=self.model_id,
199199
**request_params,
200-
api_base="https://litellm-cloud-proxy-prod-zfdbl7ykrq-uc.a.run.app/v1",
201-
extra_body={"tags": ["kimi-k2-tau-bench"]},
200+
# api_base="https://litellm-cloud-proxy-prod-zfdbl7ykrq-uc.a.run.app/v1",
201+
# extra_body={"tags": ["kimi-k2-tau-bench"]},
202202
)
203203

204204
# Log cache hit/miss for monitoring

eval_protocol/quickstart/llm_judge.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import os
6+
from datetime import datetime
67
from typing import List, Dict, Any, Optional
78
from tqdm import tqdm
89

@@ -27,7 +28,7 @@
2728
@evaluation_test(
2829
input_rows=[
2930
fetch_langfuse_traces_as_evaluation_rows(
30-
hours_back=24,
31+
to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
3132
limit=40,
3233
page_size=10,
3334
sleep_between_gets=3.0,

eval_protocol/quickstart/utils.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import os
6+
from datetime import datetime
67
import re
78
from typing import List, Dict, Any, Optional
89
import pandas as pd
@@ -50,7 +51,7 @@
5051
"max_concurrency": 16,
5152
},
5253
"kimi-k2-instruct-0905": {
53-
"model": "kimi-k2-instruct-0905",
54+
"model": "accounts/fireworks/models/kimi-k2-instruct-0905",
5455
"temperature": 0.6, # Kimi recommended temperature
5556
"max_tokens": 131000,
5657
"api_key": os.getenv("FIREWORKS_API_KEY"),
@@ -230,6 +231,8 @@ def fetch_langfuse_traces_as_evaluation_rows(
230231
user_id: Optional[str] = None,
231232
session_id: Optional[str] = None,
232233
hours_back: Optional[int] = None,
234+
from_timestamp: Optional[datetime] = None,
235+
to_timestamp: Optional[datetime] = None,
233236
include_tool_calls: bool = True,
234237
page_size: int = 30,
235238
sleep_between_gets: float = 0.1,
@@ -244,6 +247,8 @@ def fetch_langfuse_traces_as_evaluation_rows(
244247
user_id: Filter traces by user ID
245248
session_id: Filter traces by session ID
246249
hours_back: Only fetch traces from the last N hours
250+
from_timestamp: Only include traces with timestamp >= this datetime
251+
to_timestamp: Only include traces with timestamp <= this datetime
247252
include_tool_calls: Whether to include tool calls in messages
248253
page_size: Number of traces to fetch per page (smaller = less rate limit issues)
249254
sleep_between_gets: Sleep time between individual trace.get() calls (2.5s for 30 req/min limit)
@@ -262,6 +267,8 @@ def fetch_langfuse_traces_as_evaluation_rows(
262267
user_id=user_id,
263268
session_id=session_id,
264269
hours_back=hours_back,
270+
from_timestamp=from_timestamp,
271+
to_timestamp=to_timestamp,
265272
include_tool_calls=include_tool_calls,
266273
page_size=page_size,
267274
sleep_between_gets=sleep_between_gets,

0 commit comments

Comments (0)