|
| 1 | +"""Fully local Langfuse + LiteLLM example for the aha judge. |
| 2 | +
|
| 3 | +This example shows how to run the Arena-Hard-Auto ("aha") judge entirely on |
| 4 | +local infrastructure. It reuses the Langfuse adapter to pull traces from a |
| 5 | +self-hosted Langfuse deployment and evaluates them with a local LiteLLM router |
| 6 | +that fronts both `ollama` and `llama.cpp` backends. |
| 7 | +
|
| 8 | +Prerequisites |
| 9 | +------------- |
| 10 | +1. Start Langfuse locally and export the usual environment variables so the |
| 11 | + SDK can connect:: |
| 12 | +
|
| 13 | + docker compose up -d |
| 14 | + export LANGFUSE_PUBLIC_KEY=local |
| 15 | + export LANGFUSE_SECRET_KEY=local |
| 16 | + export LANGFUSE_HOST=http://localhost:3000 |
| 17 | +
|
| 18 | + Replace the credentials with whatever you configured for your local |
| 19 | + deployment. |
| 20 | +
|
| 21 | +2. Launch the model backends. The example below assumes: |
| 22 | +
|
| 23 | + * ``ollama`` is running on ``http://127.0.0.1:11434`` with the model |
| 24 | + ``llama3.1`` pulled. |
| 25 | + * A ``llama.cpp`` server is running on ``http://127.0.0.1:8080`` that serves |
| 26 | + ``Meta-Llama-3-8B-Instruct`` (adjust the path/model name for your set-up). |
| 27 | +
|
| 28 | +3. Start a LiteLLM router that proxies both backends. Save the following to |
| 29 | + ``litellm-config.yaml`` (change model names as desired):: |
| 30 | +
|
| 31 | + model_list: |
| 32 | + - model_name: "judge/llama3.1" |
| 33 | + litellm_params: |
| 34 | + model: "ollama/llama3.1" |
| 35 | + api_base: "http://127.0.0.1:11434" |
| 36 | + - model_name: "candidate/llama3.8b" |
| 37 | + litellm_params: |
        # llama.cpp's server exposes an OpenAI-compatible API, so route it
        # through LiteLLM's "openai/" provider prefix (there is no "llama.cpp"
        # provider); the server has already loaded the GGUF weights itself.
        model: "openai/Meta-Llama-3-8B-Instruct"
        api_base: "http://127.0.0.1:8080/v1"
| 41 | +
|
| 42 | + litellm_settings: |
| 43 | + drop_params: true |
| 44 | + telemetry: false |
| 45 | +
|
| 46 | + Then launch the router:: |
| 47 | +
|
| 48 | + export LITELLM_API_KEY=local-demo-key |
| 49 | + litellm --config litellm-config.yaml --port 4000 |
| 50 | +
|
| 51 | +4. Point the example at the router. The defaults below expect the router on |
| 52 | + ``http://127.0.0.1:4000`` and use ``judge/llama3.1`` as the judge model. |
| 53 | + Override them via ``LITELLM_BASE_URL`` and ``LOCAL_JUDGE_MODEL`` if your |
| 54 | + configuration is different. |
| 55 | +
|
| 56 | +Running the example |
| 57 | +------------------- |
| 58 | +With the services running, execute:: |
| 59 | +
|
| 60 | + pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local |
| 61 | +
|
| 62 | +The test will fetch traces from the local Langfuse instance, convert each |
| 63 | +assistant turn into an ``EvaluationRow``, and score them with the local judge. |
| 64 | +""" |
| 65 | + |
| 66 | +from __future__ import annotations |
| 67 | + |
| 68 | +from datetime import datetime |
| 69 | +import os |
| 70 | + |
| 71 | +import pytest |
| 72 | + |
| 73 | +from eval_protocol import ( |
| 74 | + DynamicDataLoader, |
| 75 | + EvaluationRow, |
| 76 | + SingleTurnRolloutProcessor, |
| 77 | + aha_judge, |
| 78 | + create_langfuse_adapter, |
| 79 | + evaluation_test, |
| 80 | + multi_turn_assistant_to_ground_truth, |
| 81 | +) |
| 82 | +from eval_protocol.quickstart.utils import JUDGE_CONFIGS |
| 83 | + |
# ---------------------------------------------------------------------------
# Local judge configuration
# ---------------------------------------------------------------------------


def _env(name: str, default: str) -> str:
    """Read *name* from the environment, falling back to *default*."""
    return os.getenv(name, default)


LITELLM_BASE_URL = _env("LITELLM_BASE_URL", "http://127.0.0.1:4000")
LITELLM_API_KEY = _env("LITELLM_API_KEY", "local-demo-key")
LOCAL_JUDGE_MODEL = _env("LOCAL_JUDGE_MODEL", "judge/llama3.1")
LOCAL_JUDGE_TEMPERATURE = float(_env("LOCAL_JUDGE_TEMPERATURE", "0.0"))
LOCAL_JUDGE_MAX_TOKENS = int(_env("LOCAL_JUDGE_MAX_TOKENS", "4096"))

# Register a judge profile that points at the local LiteLLM router. Simply
# importing this module makes the profile discoverable by the other quickstart
# helpers; a pre-existing "local-litellm" entry is deliberately left untouched.
_LOCAL_JUDGE_PROFILE = {
    "model": LOCAL_JUDGE_MODEL,
    "temperature": LOCAL_JUDGE_TEMPERATURE,
    "max_tokens": LOCAL_JUDGE_MAX_TOKENS,
    "api_key": LITELLM_API_KEY,
    "base_url": LITELLM_BASE_URL,
}
if "local-litellm" not in JUDGE_CONFIGS:
    JUDGE_CONFIGS["local-litellm"] = _LOCAL_JUDGE_PROFILE
| 105 | + |
| 106 | + |
| 107 | +# --------------------------------------------------------------------------- |
| 108 | +# Data loading helpers |
| 109 | +# --------------------------------------------------------------------------- |
def langfuse_local_data_generator() -> list[EvaluationRow]:
    """Fetch evaluation rows from a local Langfuse deployment.

    Every tuning knob (environment filter, page limit, sample size, retry and
    pacing behaviour) is exposed through environment variables so the same
    example works against differently sized local deployments.

    Returns:
        A list of ``EvaluationRow`` objects produced by the Langfuse adapter.
    """
    # Function-scope import keeps this fix self-contained; `datetime` itself
    # is already imported at module level.
    from datetime import timezone

    adapter = create_langfuse_adapter()
    return adapter.get_evaluation_rows(
        environment=os.getenv("LANGFUSE_ENVIRONMENT", "local"),
        limit=int(os.getenv("LANGFUSE_LIMIT", "200")),
        sample_size=int(os.getenv("LANGFUSE_SAMPLE_SIZE", "20")),
        # "0" disables tool-call extraction; any non-zero integer enables it.
        include_tool_calls=bool(int(os.getenv("LANGFUSE_INCLUDE_TOOL_CALLS", "1"))),
        sleep_between_gets=float(os.getenv("LANGFUSE_SLEEP", "0.5")),
        max_retries=int(os.getenv("LANGFUSE_MAX_RETRIES", "3")),
        # No lower bound: fetch everything up to "now".
        from_timestamp=None,
        # datetime.utcnow() is deprecated since Python 3.12 and returns a
        # *naive* datetime; use an explicit timezone-aware UTC "now" instead.
        # NOTE(review): assumes the adapter accepts aware datetimes — confirm
        # against create_langfuse_adapter's implementation.
        to_timestamp=datetime.now(timezone.utc),
    )
| 124 | + |
| 125 | + |
# Guard rails: never run against real local services in CI, and bail out early
# when the Langfuse SDK cannot authenticate.
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip local example in CI")
@pytest.mark.skipif(
    not os.getenv("LANGFUSE_PUBLIC_KEY") or not os.getenv("LANGFUSE_SECRET_KEY"),
    reason="LANGFUSE credentials not configured",
)
# Each parametrize entry is one candidate-model configuration; the evaluation
# runs once per entry. Both entries target the same LiteLLM router.
@pytest.mark.parametrize(
    "completion_params",
    [
        {
            # Candidate served by the llama.cpp backend (see module docstring).
            "model": "candidate/llama3.8b",
            "api_key": LITELLM_API_KEY,
            "base_url": LITELLM_BASE_URL,
            "temperature": float(os.getenv("LOCAL_CANDIDATE_TEMPERATURE", "0.2")),
        },
        {
            # NOTE(review): "ollama/llama3.1" does not appear in the router's
            # model_list shown in the module docstring — presumably the router
            # passes provider-prefixed names through to the ollama backend;
            # confirm against your LiteLLM configuration.
            "model": "ollama/llama3.1",
            "api_key": LITELLM_API_KEY,
            "base_url": LITELLM_BASE_URL,
            "extra_body": {"stream": False},
        },
    ],
)
@evaluation_test(
    data_loaders=DynamicDataLoader(
        generators=[langfuse_local_data_generator],
        # Converts each multi-turn assistant reply into the row's ground truth.
        preprocess_fn=multi_turn_assistant_to_ground_truth,
    ),
    rollout_processor=SingleTurnRolloutProcessor(),
    # Local single-machine backends are easily overwhelmed; default to serial
    # evaluation and let users raise it via LOCAL_MAX_CONCURRENCY.
    max_concurrent_evaluations=int(os.getenv("LOCAL_MAX_CONCURRENCY", "1")),
)
async def test_llm_judge_local(row: EvaluationRow) -> EvaluationRow:
    """Evaluate one Langfuse trace row with the local aha judge.

    The "local-litellm" judge profile is registered in JUDGE_CONFIGS at import
    time (see the module-level configuration above), so ``aha_judge`` resolves
    it to the local LiteLLM router.
    """

    return await aha_judge(row, judge_name="local-litellm")
0 commit comments