eval-protocol
diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py
Lines changed: 141 additions & 29 deletions
diff --git a/eval_protocol/human_id/__init__.py b/eval_protocol/human_id/__init__.py
Lines changed: 27 additions & 46 deletions
diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py
Lines changed: 6 additions & 1 deletion
@@ -6,6 +6,8 @@
 
 from langfuse.api.resources.commons.types.observations_view import ObservationsView
 import logging
+import random
+import time
 from datetime import datetime, timedelta
 from typing import Any, Dict, Iterator, List, Optional, cast
 
@@ -59,53 +61,154 @@ def __init__(self):
     def get_evaluation_rows(
         self,
         limit: int = 100,
+        sample_size: int = 50,
         tags: Optional[List[str]] = None,
         user_id: Optional[str] = None,
         session_id: Optional[str] = None,
         hours_back: Optional[int] = None,
+        from_timestamp: Optional[datetime] = None,
+        to_timestamp: Optional[datetime] = None,
         include_tool_calls: bool = True,
+        sleep_between_gets: float = 2.5,
+        max_retries: int = 3,
     ) -> List[EvaluationRow]:
         """Pull traces from Langfuse and convert to EvaluationRow format.
 
         Args:
-            limit: Maximum number of rows to return
+            limit: Max number of trace summaries to collect via pagination (pre-sampling)
+            sample_size: Number of collected summaries to sample at random and fetch in full (values below 0 act as 0)
             tags: Filter by specific tags
             user_id: Filter by user ID
             session_id: Filter by session ID
             hours_back: Filter traces from this many hours ago
+            from_timestamp: Explicit start of the time window; when given, hours_back is ignored
+            to_timestamp: Explicit end of the time window; when given, hours_back is ignored
             include_tool_calls: Whether to include tool calling traces
+            sleep_between_gets: Seconds to sleep before each trace.get() call (2.5s for 30 req/min limit)
+            max_retries: Maximum attempts per API call; only 429 (rate limit) errors are retried
 
-        Yields:
-            EvaluationRow: Converted evaluation rows
+        Returns:
+            List[EvaluationRow]: Converted rows; built from the pages collected so far if listing fails midway
         """
-        # Get traces from Langfuse using new API
+        eval_rows = []
 
-        if hours_back:
+        # Determine time window: explicit from/to takes precedence over hours_back
+        if from_timestamp is None and to_timestamp is None and hours_back:
             to_timestamp = datetime.now()
             from_timestamp = to_timestamp - timedelta(hours=hours_back)
-        else:
-            to_timestamp = None
-            from_timestamp = None
 
-        eval_rows = []
+        # Collect trace summaries via pagination (up to limit)
+        all_traces = []
+        page = 1
+        collected = 0
 
-        traces: Traces = self.client.api.trace.list(
-            limit=limit,
-            tags=tags,
-            user_id=user_id,
-            session_id=session_id,
-            from_timestamp=from_timestamp,
-            to_timestamp=to_timestamp,
-        )
+        while collected < limit:
+            current_page_limit = min(100, limit - collected)  # Langfuse API max is 100
 
-        for trace in traces.data:
-            try:
-                eval_row = self._convert_trace_to_evaluation_row(trace, include_tool_calls)
-                if eval_row:
-                    eval_rows.append(eval_row)
-            except (AttributeError, ValueError, KeyError) as e:
-                logger.warning("Failed to convert trace %s: %s", trace.id, e)
-                continue
+            logger.debug(
+                "Fetching page %d with limit %d (collected: %d/%d)", page, current_page_limit, collected, limit
+            )
+
+            # Fetch trace list with retry logic
+            traces = None
+            list_retries = 0
+            while list_retries < max_retries:
+                try:
+                    traces = self.client.api.trace.list(
+                        page=page,
+                        limit=current_page_limit,
+                        tags=tags,
+                        user_id=user_id,
+                        session_id=session_id,
+                        from_timestamp=from_timestamp,
+                        to_timestamp=to_timestamp,
+                        order_by="timestamp.desc",
+                    )
+                    break
+                except Exception as e:
+                    list_retries += 1
+                    if "429" in str(e) and list_retries < max_retries:
+                        sleep_time = 2**list_retries  # Exponential backoff
+                        logger.warning(
+                            "Rate limit hit on trace.list(), retrying in %ds (attempt %d/%d)",
+                            sleep_time,
+                            list_retries,
+                            max_retries,
+                        )
+                        time.sleep(sleep_time)
+                    else:
+                        logger.error("Failed to fetch trace list after %d attempt(s): %s", list_retries, e)
+                        break  # traces stays None, so the check below ends pagination but keeps earlier pages
+
+            if not traces or not traces.data:
+                logger.debug("No more traces found on page %d", page)
+                break
+
+            logger.debug("Collected %d traces from page %d", len(traces.data), page)
+
+            all_traces.extend(traces.data)
+            collected += len(traces.data)
+
+            # Check if we have more pages
+            if hasattr(traces.meta, "page") and hasattr(traces.meta, "total_pages"):
+                if traces.meta.page >= traces.meta.total_pages:
+                    break
+            elif len(traces.data) < current_page_limit:
+                break
+
+            page += 1
+
+        if not all_traces:
+            logger.debug("No traces found")
+            return eval_rows
+
+        # Randomly sample traces to fetch full details (respect rate limits)
+        actual_sample_size = max(0, min(sample_size, len(all_traces)))
+        selected_traces = random.sample(all_traces, actual_sample_size)
+
+        logger.debug("Randomly selected %d traces from %d collected", actual_sample_size, len(all_traces))
+
+        # Process each selected trace with sleep and retry logic
+        for trace_info in selected_traces:
+            # Sleep between gets to avoid rate limits
+            if sleep_between_gets > 0:
+                time.sleep(sleep_between_gets)
+
+            # Fetch full trace details with retry logic
+            trace_full = None
+            detail_retries = 0
+            while detail_retries < max_retries:
+                try:
+                    trace_full = self.client.api.trace.get(trace_info.id)
+                    break
+                except Exception as e:
+                    detail_retries += 1
+                    if "429" in str(e) and detail_retries < max_retries:
+                        sleep_time = 2**detail_retries  # Exponential backoff
+                        logger.warning(
+                            "Rate limit hit on trace.get(%s), retrying in %ds (attempt %d/%d)",
+                            trace_info.id,
+                            sleep_time,
+                            detail_retries,
+                            max_retries,
+                        )
+                        time.sleep(sleep_time)
+                    else:
+                        logger.warning("Failed to fetch trace %s after %d attempt(s): %s", trace_info.id, detail_retries, e)
+                        break  # Skip this trace
+
+            if trace_full:
+                try:
+                    eval_row = self._convert_trace_to_evaluation_row(trace_full, include_tool_calls)
+                    if eval_row:
+                        eval_rows.append(eval_row)
+                except (AttributeError, ValueError, KeyError) as e:
+                    logger.warning("Failed to convert trace %s: %s", trace_info.id, e)
+                    continue
+
+        logger.info(
+            "Successfully processed %d selected traces into %d evaluation rows", len(selected_traces), len(eval_rows)
+        )
         return eval_rows
 
     def get_evaluation_rows_by_ids(
@@ -135,7 +238,7 @@ def get_evaluation_rows_by_ids(
         return eval_rows
 
     def _convert_trace_to_evaluation_row(
-        self, trace: Trace, include_tool_calls: bool = True
+        self, trace: TraceWithFullDetails, include_tool_calls: bool = True
     ) -> Optional[EvaluationRow]:
         """Convert a Langfuse trace to EvaluationRow format.
 
@@ -147,8 +250,6 @@ def _convert_trace_to_evaluation_row(
             EvaluationRow or None if conversion fails
         """
         try:
-            trace = self.client.api.trace.get("2d9f3474-83ab-4431-9788-049ca4219023")
-
             # Extract messages from trace input and output
             messages = self._extract_messages_from_trace(trace, include_tool_calls)
 
@@ -163,13 +264,20 @@ def _convert_trace_to_evaluation_row(
             return EvaluationRow(
                 messages=messages,
                 tools=tools,
+                input_metadata=InputMetadata(
+                    session_data={
+                        "langfuse_trace_id": trace.id,  # Store the trace ID here
+                    }
+                ),
             )
 
         except (AttributeError, ValueError, KeyError) as e:
             logger.error("Error converting trace %s: %s", trace.id, e)
             return None
 
-    def _extract_messages_from_trace(self, trace: Any, include_tool_calls: bool = True) -> List[Message]:
+    def _extract_messages_from_trace(
+        self, trace: TraceWithFullDetails, include_tool_calls: bool = True
+    ) -> List[Message]:
         """Extract messages from Langfuse trace input and output.
 
         Args:
@@ -214,6 +322,10 @@ def _extract_messages_from_trace(self, trace: Any, include_tool_calls: bool = Tr
                     else:
                         # Fallback: convert entire output to string
                         messages.append(Message(role="assistant", content=str(trace.output)))
+                elif isinstance(trace.output, list):
+                    # Direct list of message dicts (same as input handling)
+                    for msg in trace.output:
+                        messages.append(self._dict_to_message(msg, include_tool_calls))
                 elif isinstance(trace.output, str):
                     messages.append(Message(role="assistant", content=trace.output))
 
 
@@ -12,85 +12,66 @@
 def generate_id(
     separator: str = "-",
     seed: int | float | str | bytes | bytearray | None = None,
-    word_count: int = 5,
     index: int | None = None,
 ) -> str:
     """
-    Generate a human readable ID
+    Generate a human readable ID in format: adjective-noun-NN
 
     :param separator: The string to use to separate words
     :param seed: The seed to use. The same seed will produce the same ID or index-based mapping
     :param index: Optional non-negative integer providing a 1:1 mapping to an ID.
                   When provided, the mapping is deterministic and bijective for
                   all integers in range [0, total_combinations).
-    :param word_count: The number of words to use. Minimum of 3.
     :return: A human readable ID
     """
-    if word_count < 3:
-        raise ValueError("word_count cannot be lower than 3")
 
-    # If a specific index is provided, use mixed-radix encoding into a fixed
-    # sequence of parts to guarantee a bijection between integers and IDs.
-    # The sequence cycles as: verb, adjective, noun, verb, adjective, noun, ...
+    # If a specific index is provided, map it 1:1 onto (adjective, noun, NN)
     if index is not None:
         if not isinstance(index, int) or index < 0:
             raise ValueError("index must be a non-negative integer if provided")
 
         # Prepare category lists; if seed is provided, shuffle deterministically
-        base_categories = [dictionary.verbs, dictionary.adjectives, dictionary.nouns]
         if seed is not None:
             rnd = random.Random(seed)
-            categories = [tuple(rnd.sample(cat, len(cat))) for cat in base_categories]
+            adjectives = tuple(rnd.sample(dictionary.adjectives, len(dictionary.adjectives)))
+            nouns = tuple(rnd.sample(dictionary.nouns, len(dictionary.nouns)))
         else:
-            categories = base_categories
-        # Build the category order for the desired word_count
-        ordered_categories = [categories[i % 3] for i in range(word_count)]
+            adjectives = dictionary.adjectives
+            nouns = dictionary.nouns
 
-        # Compute total number of combinations for this word_count
-        radices = [len(cat) for cat in ordered_categories]
-        total = num_combinations(word_count)
+        # Size of the ID space; shuffling keeps list lengths, so the shared helper gives the same value
+        total = num_combinations()
 
         if index >= total:
-            raise ValueError(f"index out of range for given word_count. Received {index}, max allowed is {total - 1}")
+            raise ValueError(f"index out of range. Received {index}, max allowed is {total - 1}")
 
-        # Mixed-radix decomposition (least significant position is the last word)
-        digits: list[int] = []
-        remaining = index
-        for base in reversed(radices):
-            digits.append(remaining % base)
-            remaining //= base
-        digits.reverse()
+        # Mixed-radix decomposition: NN (base 100) is least significant, then noun, then adjective
+        number = index % 100
+        remaining = index // 100
+        noun_idx = remaining % len(nouns)
+        adj_idx = remaining // len(nouns)
 
-        words = [ordered_categories[pos][digits[pos]] for pos in range(word_count)]
-        return separator.join(words)
+        adjective = adjectives[adj_idx]
+        noun = nouns[noun_idx]
 
+        return f"{adjective}{separator}{noun}{separator}{number:02d}"
+
+    # No index given: draw each part at random (reproducible when seed is set)
     random_obj = system_random
     if seed is not None:
         random_obj = random.Random(seed)
 
-    parts = {dictionary.verbs: 1, dictionary.adjectives: 1, dictionary.nouns: 1}
-
-    for _ in range(3, word_count):
-        parts[random_obj.choice(list(parts.keys()))] += 1
-
-    parts = itertools.chain.from_iterable(random_obj.sample(part, count) for part, count in parts.items())
+    adjective = random_obj.choice(dictionary.adjectives)
+    noun = random_obj.choice(dictionary.nouns)
+    number = random_obj.randint(0, 99)
 
-    return separator.join(parts)
+    return f"{adjective}{separator}{noun}{separator}{number:02d}"
 
 
-def num_combinations(word_count: int = 5) -> int:
+def num_combinations() -> int:
     """
-    Return the total number of unique IDs possible for the given word_count.
+    Return the total number of distinct IDs that generate_id can produce.
 
-    The sequence of categories cycles as: verb, adjective, noun, then repeats.
-    This value can be used to mod an index when calling generate_id(index=...).
+    IDs have the form adjective-noun-NN (NN is 00-99), so total = adjectives * nouns * 100.
     """
-    if word_count < 3:
-        raise ValueError("word_count cannot be lower than 3")
-
-    categories = [dictionary.verbs, dictionary.adjectives, dictionary.nouns]
-    radices = [len(categories[i % 3]) for i in range(word_count)]
-    total = 1
-    for r in radices:
-        total *= r
-    return total
+    return len(dictionary.adjectives) * len(dictionary.nouns) * 100
@@ -194,7 +194,12 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
             request_params["tools"] = tools
 
         try:
-            response = await acompletion(model=self.model_id, **request_params)
+            response = await acompletion(
+                model=self.model_id,
+                **request_params,
+                # api_base="https://litellm-cloud-proxy-prod-zfdbl7ykrq-uc.a.run.app/v1",
+                # extra_body={"tags": ["kimi-k2-tau-bench"]},
+            )
 
             # Log cache hit/miss for monitoring
             hidden = getattr(response, "_hidden_params", {})