Skip to content

Commit 556e2c9

Browse files
author
Dylan Huang
committed
Merge branch 'main' into link-to-local-ui-part-2
2 parents 11c10ec + fdd76dc commit 556e2c9

File tree

9 files changed

+567
-441
lines changed

9 files changed

+567
-441
lines changed

README.md

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,18 @@
44

55
**The open-source toolkit for building your internal model leaderboard.**
66

7-
When you have multiple AI models to choose from—different versions, providers, or configurations—how do you know which one is best for your use case?
7+
When you have multiple AI models to choose from—different versions, providers,
8+
or configurations—how do you know which one is best for your use case?
9+
10+
## 🚀 Features
11+
12+
- **Custom Evaluations**: Write evaluations tailored to your specific business needs
13+
- **Auto-Evaluation**: Stack-rank models using LLMs as judges on just model traces, via out-of-the-box evaluators
14+
- **RL Environments via MCP**: Build reinforcement learning environments using the Model Context Protocol (MCP) to simulate user interactions and advanced evaluation scenarios
15+
- **Consistent Testing**: Test across various models and configurations with a unified framework
16+
- **Resilient Runtime**: Automatic retries for unstable LLM APIs and concurrent execution for long-running evaluations
17+
- **Rich Visualizations**: Built-in pivot tables and visualizations for result analysis
18+
- **Data-Driven Decisions**: Make informed model deployment decisions based on comprehensive evaluation results
819

920
## Quick Examples
1021

@@ -69,15 +80,6 @@ def test_math_reasoning(row: EvaluationRow) -> EvaluationRow:
6980
return row
7081
```
7182

72-
## 🚀 Features
73-
74-
- **Custom Evaluations**: Write evaluations tailored to your specific business needs
75-
- **Auto-Evaluation**: Stack-rank models using LLMs as judges with just model traces
76-
- **Model Context Protocol (MCP) Integration**: Build reinforcement learning environments and trigger user simulations for complex scenarios
77-
- **Consistent Testing**: Test across various models and configurations with a unified framework
78-
- **Resilient Runtime**: Automatic retries for unstable LLM APIs and concurrent execution for long-running evaluations
79-
- **Rich Visualizations**: Built-in pivot tables and visualizations for result analysis
80-
- **Data-Driven Decisions**: Make informed model deployment decisions based on comprehensive evaluation results
8183

8284
## 📚 Resources
8385

eval_protocol/adapters/langfuse.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55
"""
66

77
from __future__ import annotations
8-
98
import logging
109
import random
1110
import time
1211
from datetime import datetime, timedelta
13-
from typing import Any, Dict, List, Optional, Protocol, TYPE_CHECKING
12+
from typing import Any, Dict, List, Optional, Protocol, TYPE_CHECKING, cast
1413

14+
from langfuse.api.resources.commons.types.observations_view import ObservationsView
1515
from eval_protocol.models import EvaluationRow, InputMetadata, Message
1616
from .base import BaseAdapter
1717
from .utils import extract_messages_from_data
@@ -232,12 +232,12 @@ class LangfuseAdapter(BaseAdapter):
232232
... ))
233233
"""
234234

235-
def __init__(self):
235+
def __init__(self, client: Optional[Any] = None):
236236
"""Initialize the Langfuse adapter."""
237237
if not LANGFUSE_AVAILABLE:
238238
raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'")
239239

240-
self.client = get_client()
240+
self.client = client or cast(Any, get_client)()
241241

242242
def get_evaluation_rows(
243243
self,

eval_protocol/adapters/langsmith.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from __future__ import annotations
1111

1212
import logging
13-
from typing import Any, Dict, List, Optional, Iterable
13+
from typing import Any, Dict, List, Optional, Iterable, cast
1414

1515
from eval_protocol.models import EvaluationRow, InputMetadata, Message
1616
from .base import BaseAdapter
@@ -23,6 +23,7 @@
2323
LANGSMITH_AVAILABLE = True
2424
except ImportError:
2525
LANGSMITH_AVAILABLE = False
26+
Client = None # type: ignore[misc]
2627

2728

2829
class LangSmithAdapter(BaseAdapter):
@@ -38,9 +39,11 @@ class LangSmithAdapter(BaseAdapter):
3839
def __init__(self, client: Optional[Any] = None) -> None:
3940
if not LANGSMITH_AVAILABLE:
4041
raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'")
41-
# Client is provided by langsmith package; typing is relaxed to Any to avoid
42-
# static analysis issues when stubs aren't available.
43-
self.client = client or Client() # type: ignore[reportCallIssue]
42+
if client is not None:
43+
self.client = client
44+
else:
45+
assert Client is not None
46+
self.client = cast(Any, Client)()
4447

4548
def get_evaluation_rows(
4649
self,

examples/langsmith/README.md

Lines changed: 0 additions & 24 deletions
This file was deleted.

examples/langsmith/dump_traces_langsmith.py

Lines changed: 0 additions & 115 deletions
This file was deleted.

examples/langsmith/emit_tool_calls.py

Lines changed: 0 additions & 116 deletions
This file was deleted.

0 commit comments

Comments (0)