Skip to content

Commit eb7dd00

Browse files
committed
Add local Langfuse LiteLLM aha judge example
1 parent 76cc1e7 commit eb7dd00

File tree

1 file changed

+159
-0
lines changed

1 file changed

+159
-0
lines changed
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
"""Fully local Langfuse + LiteLLM example for the aha judge.
2+
3+
This example shows how to run the Arena-Hard-Auto ("aha") judge entirely on
4+
local infrastructure. It reuses the Langfuse adapter to pull traces from a
5+
self-hosted Langfuse deployment and evaluates them with a local LiteLLM router
6+
that fronts both `ollama` and `llama.cpp` backends.
7+
8+
Prerequisites
9+
-------------
10+
1. Start Langfuse locally and export the usual environment variables so the
11+
SDK can connect::
12+
13+
docker compose up -d
14+
export LANGFUSE_PUBLIC_KEY=local
15+
export LANGFUSE_SECRET_KEY=local
16+
export LANGFUSE_HOST=http://localhost:3000
17+
18+
Replace the credentials with whatever you configured for your local
19+
deployment.
20+
21+
2. Launch the model backends. The example below assumes:
22+
23+
* ``ollama`` is running on ``http://127.0.0.1:11434`` with the model
24+
``llama3.1`` pulled.
25+
* A ``llama.cpp`` server is running on ``http://127.0.0.1:8080`` that serves
26+
``Meta-Llama-3-8B-Instruct`` (adjust the path/model name for your set-up).
27+
28+
3. Start a LiteLLM router that proxies both backends. Save the following to
29+
``litellm-config.yaml`` (change model names as desired)::
30+
31+
model_list:
32+
- model_name: "judge/llama3.1"
33+
litellm_params:
34+
model: "ollama/llama3.1"
35+
api_base: "http://127.0.0.1:11434"
36+
- model_name: "candidate/llama3.8b"
37+
litellm_params:
38+
model: "llama.cpp"
39+
api_base: "http://127.0.0.1:8080/v1"
40+
model_path: "/path/to/Meta-Llama-3-8B-Instruct.gguf"
41+
42+
litellm_settings:
43+
drop_params: true
44+
telemetry: false
45+
46+
Then launch the router::
47+
48+
export LITELLM_API_KEY=local-demo-key
49+
litellm --config litellm-config.yaml --port 4000
50+
51+
4. Point the example at the router. The defaults below expect the router on
52+
``http://127.0.0.1:4000`` and use ``judge/llama3.1`` as the judge model.
53+
Override them via ``LITELLM_BASE_URL`` and ``LOCAL_JUDGE_MODEL`` if your
54+
configuration is different.
55+
56+
Running the example
57+
-------------------
58+
With the services running, execute::
59+
60+
pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local
61+
62+
The test will fetch traces from the local Langfuse instance, convert each
63+
assistant turn into an ``EvaluationRow``, and score them with the local judge.
64+
"""
65+
66+
from __future__ import annotations

import os
from datetime import datetime, timezone

import pytest

from eval_protocol import (
    DynamicDataLoader,
    EvaluationRow,
    SingleTurnRolloutProcessor,
    aha_judge,
    create_langfuse_adapter,
    evaluation_test,
    multi_turn_assistant_to_ground_truth,
)
from eval_protocol.quickstart.utils import JUDGE_CONFIGS
83+
84+
# ---------------------------------------------------------------------------
# Local judge configuration
# ---------------------------------------------------------------------------
# Connection settings for the local LiteLLM router. Everything is overridable
# via environment variables so the example adapts to non-default ports/models
# without code changes (see the module docstring for the expected set-up).
LITELLM_BASE_URL = os.getenv("LITELLM_BASE_URL", "http://127.0.0.1:4000")
LITELLM_API_KEY = os.getenv("LITELLM_API_KEY", "local-demo-key")
LOCAL_JUDGE_MODEL = os.getenv("LOCAL_JUDGE_MODEL", "judge/llama3.1")
LOCAL_JUDGE_TEMPERATURE = float(os.getenv("LOCAL_JUDGE_TEMPERATURE", "0.0"))
LOCAL_JUDGE_MAX_TOKENS = int(os.getenv("LOCAL_JUDGE_MAX_TOKENS", "4096"))

# Register a judge profile that points to the local LiteLLM router. Importing
# the module is enough for other quickstart helpers to discover it.
# ``setdefault`` is deliberate: it never clobbers a "local-litellm" profile
# that the user (or another module) registered first.
JUDGE_CONFIGS.setdefault(
    "local-litellm",
    {
        "model": LOCAL_JUDGE_MODEL,
        "temperature": LOCAL_JUDGE_TEMPERATURE,
        "max_tokens": LOCAL_JUDGE_MAX_TOKENS,
        "api_key": LITELLM_API_KEY,
        "base_url": LITELLM_BASE_URL,
    },
)
105+
106+
107+
# ---------------------------------------------------------------------------
# Data loading helpers
# ---------------------------------------------------------------------------
def langfuse_local_data_generator() -> list[EvaluationRow]:
    """Fetch evaluation rows from a local Langfuse deployment.

    All fetch parameters (environment, page limit, sample size, tool-call
    inclusion, pacing, and retry count) are overridable via ``LANGFUSE_*``
    environment variables so the example works against any local set-up.

    Returns:
        A list of ``EvaluationRow`` objects built from traces up to "now".
    """
    adapter = create_langfuse_adapter()
    return adapter.get_evaluation_rows(
        environment=os.getenv("LANGFUSE_ENVIRONMENT", "local"),
        limit=int(os.getenv("LANGFUSE_LIMIT", "200")),
        sample_size=int(os.getenv("LANGFUSE_SAMPLE_SIZE", "20")),
        # "0" disables tool-call extraction; any non-zero integer enables it.
        include_tool_calls=bool(int(os.getenv("LANGFUSE_INCLUDE_TOOL_CALLS", "1"))),
        sleep_between_gets=float(os.getenv("LANGFUSE_SLEEP", "0.5")),
        max_retries=int(os.getenv("LANGFUSE_MAX_RETRIES", "3")),
        from_timestamp=None,
        # datetime.utcnow() is deprecated (Python 3.12+) and returns a *naive*
        # datetime; use an explicit timezone-aware "now" in UTC instead.
        to_timestamp=datetime.now(timezone.utc),
    )
124+
125+
126+
# Skip when running in CI, and when no Langfuse credentials are configured:
# this example needs live local services (Langfuse, LiteLLM router, backends).
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip local example in CI")
@pytest.mark.skipif(
    not os.getenv("LANGFUSE_PUBLIC_KEY") or not os.getenv("LANGFUSE_SECRET_KEY"),
    reason="LANGFUSE credentials not configured",
)
# Run the evaluation once per candidate configuration: the llama.cpp-backed
# model behind the router, and the ollama-backed model with streaming off.
@pytest.mark.parametrize(
    "completion_params",
    [
        {
            "model": "candidate/llama3.8b",
            "api_key": LITELLM_API_KEY,
            "base_url": LITELLM_BASE_URL,
            "temperature": float(os.getenv("LOCAL_CANDIDATE_TEMPERATURE", "0.2")),
        },
        {
            "model": "ollama/llama3.1",
            "api_key": LITELLM_API_KEY,
            "base_url": LITELLM_BASE_URL,
            # Forwarded verbatim to the backend; disables streamed responses.
            "extra_body": {"stream": False},
        },
    ],
)
@evaluation_test(
    data_loaders=DynamicDataLoader(
        generators=[langfuse_local_data_generator],
        preprocess_fn=multi_turn_assistant_to_ground_truth,
    ),
    rollout_processor=SingleTurnRolloutProcessor(),
    # Default to serial evaluation — local model servers often cannot handle
    # concurrent requests; raise LOCAL_MAX_CONCURRENCY if yours can.
    max_concurrent_evaluations=int(os.getenv("LOCAL_MAX_CONCURRENCY", "1")),
)
async def test_llm_judge_local(row: EvaluationRow) -> EvaluationRow:
    """Evaluate one Langfuse trace row with the local aha judge.

    Args:
        row: An ``EvaluationRow`` produced by the data loader above.

    Returns:
        The same row, scored via the "local-litellm" judge profile that this
        module registers in ``JUDGE_CONFIGS`` at import time.
    """
    return await aha_judge(row, judge_name="local-litellm")

0 commit comments

Comments
 (0)