Update Langfuse adapter: paginate trace listing, add sample_size

xzrderek · xzrderek · commit 7472db41ed16 · 2025-09-14T21:35:24.000-07:00
diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py
@@ -61,68 +61,117 @@ def __init__(self):
     def get_evaluation_rows(
         self,
         limit: int = 100,
+        sample_size: int = 50,
         tags: Optional[List[str]] = None,
         user_id: Optional[str] = None,
         session_id: Optional[str] = None,
         hours_back: Optional[int] = None,
         from_timestamp: Optional[datetime] = None,
         to_timestamp: Optional[datetime] = None,
         include_tool_calls: bool = True,
-        page_size: int = 30,  # TODO: remove probably
-        sleep_between_gets: float = 0.1,
+        sleep_between_gets: float = 2.5,
         max_retries: int = 3,
     ) -> List[EvaluationRow]:
         """Pull traces from Langfuse and convert to EvaluationRow format.
 
         Args:
-            limit: Maximum number of rows to return
+            limit: Max number of trace summaries to collect via pagination (pre-sampling)
+            sample_size: Number of traces to fetch full details for (sampled from collected summaries)
             tags: Filter by specific tags
             user_id: Filter by user ID
             session_id: Filter by session ID
             hours_back: Filter traces from this many hours ago
-            from_timestamp: Only include traces with timestamp >= this datetime
-            to_timestamp: Only include traces with timestamp <= this datetime
+            from_timestamp: Explicit start time (overrides hours_back)
+            to_timestamp: Explicit end time (overrides hours_back)
             include_tool_calls: Whether to include tool calling traces
-            page_size: Number of traces to fetch per page (smaller = less rate limit issues)
-            sleep_between_gets: Sleep time between individual trace.get() calls
+            sleep_between_gets: Sleep time between individual trace.get() calls (2.5s for 30 req/min limit)
             max_retries: Maximum retries for rate limit errors
 
         Returns:
             List[EvaluationRow]: Converted evaluation rows
         """
         eval_rows = []
 
-        # Determine time window: explicit from/to takes precedence
+        # Determine time window: explicit from/to takes precedence over hours_back
         if from_timestamp is None and to_timestamp is None and hours_back:
             to_timestamp = datetime.now()
             from_timestamp = to_timestamp - timedelta(hours=hours_back)
 
-        # Single API call to get trace list
-        traces = self.client.api.trace.list(
-            limit=limit,
-            tags=tags,
-            user_id=user_id,
-            session_id=session_id,
-            from_timestamp=from_timestamp,
-            to_timestamp=to_timestamp,
-            order_by="timestamp.desc",
-        )
+        # Collect trace summaries via pagination (up to limit)
+        all_traces = []
+        page = 1
+        collected = 0
+
+        while collected < limit:
+            current_page_limit = min(100, limit - collected)  # Langfuse API max is 100
+
+            logger.debug(
+                "Fetching page %d with limit %d (collected: %d/%d)", page, current_page_limit, collected, limit
+            )
 
-        if not traces or not traces.data:
+            # Fetch trace list with retry logic
+            traces = None
+            list_retries = 0
+            while list_retries < max_retries:
+                try:
+                    traces = self.client.api.trace.list(
+                        page=page,
+                        limit=current_page_limit,
+                        tags=tags,
+                        user_id=user_id,
+                        session_id=session_id,
+                        from_timestamp=from_timestamp,
+                        to_timestamp=to_timestamp,
+                        order_by="timestamp.desc",
+                    )
+                    break
+                except Exception as e:
+                    list_retries += 1
+                    if "429" in str(e) and list_retries < max_retries:
+                        sleep_time = 2**list_retries  # Exponential backoff
+                        logger.warning(
+                            "Rate limit hit on trace.list(), retrying in %ds (attempt %d/%d)",
+                            sleep_time,
+                            list_retries,
+                            max_retries,
+                        )
+                        time.sleep(sleep_time)
+                    else:
+                        logger.error("Failed to fetch trace list after %d retries: %s", max_retries, e)
+                        return eval_rows  # NOTE: eval_rows is still empty here; summaries already in all_traces are discarded
+
+            if not traces or not traces.data:
+                logger.debug("No more traces found on page %d", page)
+                break
+
+            logger.debug("Collected %d traces from page %d", len(traces.data), page)
+
+            all_traces.extend(traces.data)
+            collected += len(traces.data)
+
+            # Check if we have more pages
+            if hasattr(traces.meta, "page") and hasattr(traces.meta, "total_pages"):
+                if traces.meta.page >= traces.meta.total_pages:
+                    break
+            elif len(traces.data) < current_page_limit:
+                break
+
+            page += 1
+
+        if not all_traces:
             logger.debug("No traces found")
             return eval_rows
 
-        # Randomly sample the requested number of traces
-        available_traces = traces.data
-        sample_size = min(limit, len(available_traces))
-        selected_traces = random.sample(available_traces, sample_size)
+        # Randomly sample traces to fetch full details (respect rate limits)
+        actual_sample_size = min(sample_size, len(all_traces))
+        selected_traces = random.sample(all_traces, actual_sample_size)
 
-        logger.debug("Randomly selected %d traces from %d available", len(selected_traces), len(available_traces))
+        logger.debug("Randomly selected %d traces from %d collected", actual_sample_size, len(all_traces))
 
         # Process each selected trace with sleep and retry logic
-        for i, trace_info in enumerate(selected_traces):
+        for trace_info in selected_traces:
             # Sleep between gets to avoid rate limits
-            if sleep_between_gets > 0 and i > 0:
+            if sleep_between_gets > 0:
                 time.sleep(sleep_between_gets)
 
             # Fetch full trace details with retry logic
@@ -157,7 +206,9 @@ def get_evaluation_rows(
                     logger.warning("Failed to convert trace %s: %s", trace_info.id, e)
                     continue
 
-        logger.info("Successfully processed %d traces into evaluation rows", len(selected_traces))
+        logger.info(
+            "Successfully processed %d selected traces into %d evaluation rows", len(selected_traces), len(eval_rows)
+        )
         return eval_rows
 
     def get_evaluation_rows_by_ids(
diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py
@@ -15,22 +15,24 @@
 from eval_protocol.quickstart.utils import (
     split_multi_turn_rows,
     JUDGE_CONFIGS,
-    fetch_langfuse_traces_as_evaluation_rows,
     calculate_bootstrap_scores,
     push_scores_to_langfuse,
-    run_judgment_async_with_shared_client,
+    run_judgment_async,
 )
 import asyncio
 from openai import AsyncOpenAI
+from eval_protocol.adapters.langfuse import create_langfuse_adapter
+
+adapter = create_langfuse_adapter()
 
 
 @pytest.mark.asyncio
 @evaluation_test(
     input_rows=[
-        fetch_langfuse_traces_as_evaluation_rows(
+        adapter.get_evaluation_rows(
             to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
-            limit=40,
-            page_size=10,
+            limit=711,
+            sample_size=50,
             sleep_between_gets=3.0,
             max_retries=5,
         )
@@ -71,7 +73,7 @@ async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
         Same rows with updated evaluation_result containing scores and judgments
     """
 
-    judge_name = "kimi-k2-instruct-0905"  # Edit to which judge you'd like to use. Configs are in utils.py.
+    judge_name = "gemini-2.5-pro"  # Edit to which judge you'd like to use. Configs are in utils.py.
 
     if not rows:
         print("❌ No evaluation rows provided")
@@ -91,11 +93,11 @@ async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
     ) as shared_client:
         semaphore = asyncio.Semaphore(max_concurrency)
 
-        async def run_judgment_with_semaphore(row):
+        async def run_judgment(row):
             async with semaphore:
-                return await run_judgment_async_with_shared_client(row, model_name, judge_name, shared_client)
+                return await run_judgment_async(row, model_name, judge_name, shared_client)
 
-        tasks = [run_judgment_with_semaphore(row) for row in rows]
+        tasks = [run_judgment(row) for row in rows]
 
         for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Generating judgments"):
             result = await coro
@@ -131,6 +133,6 @@ async def run_judgment_with_semaphore(row):
             )  # Standard error approximation from 90% CI
 
     # Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace.
-    # push_scores_to_langfuse(rows, model_name, mean_score)
+    push_scores_to_langfuse(rows, model_name, mean_score)
 
     return rows
diff --git a/eval_protocol/quickstart/utils.py b/eval_protocol/quickstart/utils.py
@@ -50,6 +50,14 @@
         "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
         "max_concurrency": 16,
     },
+    "gemini-2.5-flash": {
+        "model": "gemini-2.5-flash",
+        "temperature": 1.0,
+        "max_tokens": 32000,
+        "api_key": os.getenv("GEMINI_API_KEY"),
+        "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
+        "max_concurrency": 16,
+    },
     "kimi-k2-instruct-0905": {
         "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
         "temperature": 0.6,  # Kimi recommended temperature
@@ -195,7 +203,7 @@ async def pairwise_judgment_async(question_text, answer_a, answer_b, tools, judg
     return {"score": score, "judgment": judgment_text, "prompt": messages}
 
 
-async def run_judgment_async_with_shared_client(
+async def run_judgment_async(
     row: EvaluationRow, model_name: str, judge_name: str, shared_client
 ) -> Optional[Dict[str, Any]]:
     """Async judgment using shared client to avoid cleanup issues."""
@@ -232,60 +240,6 @@ async def run_judgment_async_with_shared_client(
     return {"model": model_name, "games": games}
 
 
-def fetch_langfuse_traces_as_evaluation_rows(
-    limit: int = 100,
-    tags: Optional[List[str]] = None,
-    user_id: Optional[str] = None,
-    session_id: Optional[str] = None,
-    hours_back: Optional[int] = None,
-    from_timestamp: Optional[datetime] = None,
-    to_timestamp: Optional[datetime] = None,
-    include_tool_calls: bool = True,
-    page_size: int = 30,
-    sleep_between_gets: float = 0.1,
-    max_retries: int = 3,
-) -> List[EvaluationRow]:
-    """
-    Fetch Langfuse traces and convert them to EvaluationRow objects.
-
-    Args:
-        limit: Maximum number of traces to fetch
-        tags: Filter traces by tags
-        user_id: Filter traces by user ID
-        session_id: Filter traces by session ID
-        hours_back: Only fetch traces from the last N hours
-        from_timestamp: Only include traces with timestamp >= this datetime
-        to_timestamp: Only include traces with timestamp <= this datetime
-        include_tool_calls: Whether to include tool calls in messages
-        page_size: Number of traces to fetch per page (smaller = less rate limit issues)
-        sleep_between_gets: Sleep time between individual trace.get() calls (2.5s for 30 req/min limit)
-        max_retries: Maximum retries for rate limit errors
-
-    Returns:
-        List of EvaluationRow objects converted from Langfuse traces
-    """
-    try:
-        from eval_protocol.adapters.langfuse import create_langfuse_adapter
-
-        adapter = create_langfuse_adapter()
-        return adapter.get_evaluation_rows(
-            limit=limit,
-            tags=tags,
-            user_id=user_id,
-            session_id=session_id,
-            hours_back=hours_back,
-            from_timestamp=from_timestamp,
-            to_timestamp=to_timestamp,
-            include_tool_calls=include_tool_calls,
-            page_size=page_size,
-            sleep_between_gets=sleep_between_gets,
-            max_retries=max_retries,
-        )
-    except Exception as e:
-        print(f"❌ LangfuseAdapter failed: {e}")
-        return []
-
-
 def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> tuple[float, float, float]:
     """
     Calculate bootstrap confidence intervals for Arena-Hard-Auto style judgments.