eval-protocol
diff --git a/eval_protocol/adapters/fireworks_tracing.py b/eval_protocol/adapters/fireworks_tracing.py
Lines changed: 3 additions & 55 deletions
diff --git a/eval_protocol/proxy/Dockerfile.gateway b/eval_protocol/proxy/Dockerfile.gateway
Lines changed: 8 additions & 10 deletions
diff --git a/eval_protocol/proxy/README.md b/eval_protocol/proxy/README.md
Lines changed: 9 additions & 9 deletions
diff --git a/eval_protocol/proxy/config_no_cache.yaml b/eval_protocol/proxy/config_no_cache.yaml
Lines changed: 2 additions & 1 deletion
diff --git a/eval_protocol/proxy/docker-compose.yml b/eval_protocol/proxy/docker-compose.yml
Lines changed: 29 additions & 6 deletions
diff --git a/eval_protocol/proxy/proxy_core/app.py b/eval_protocol/proxy/proxy_core/app.py
Lines changed: 15 additions & 11 deletions
diff --git a/eval_protocol/proxy/proxy_core/langfuse.py b/eval_protocol/proxy/proxy_core/langfuse.py
Lines changed: 0 additions & 1 deletion
@@ -8,10 +8,8 @@
 import logging
 import requests
 from datetime import datetime
-import ast
-import json
-import os
 from typing import Any, Dict, List, Optional, Protocol
+import os
 
 from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
 from .base import BaseAdapter
@@ -46,43 +44,6 @@ def __call__(
         ...
 
 
-def extract_otel_attributes(observations: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
-    """Attempt to extract and parse attributes from raw_gen_ai_request observation. This only works when stored in OTEL format.
-
-    Args:
-        observations: List of observation dictionaries from the trace
-
-    Returns:
-        Dict with all attributes parsed. Or None if not found.
-    """
-    for obs in observations:
-        if obs.get("name") == "raw_gen_ai_request" and obs.get("type") == "SPAN":
-            metadata = obs.get("metadata") or {}
-            attributes = metadata.get("attributes") or {}
-
-            result: Dict[str, Any] = {}
-
-            for key, value in attributes.items():
-                # Try to parse stringified objects (could be Python repr or JSON)
-                if isinstance(value, str) and value.startswith(("[", "{")):
-                    try:
-                        result[key] = ast.literal_eval(value)
-                    except Exception as e:
-                        logger.debug("Failed to parse %s with ast.literal_eval: %s", key, e)
-                        try:
-                            result[key] = json.loads(value)
-                        except Exception as e:
-                            logger.debug("Failed to parse %s with json.loads: %s", key, e)
-                            result[key] = value
-                else:
-                    result[key] = value
-
-            if result:
-                return result
-
-    return None
-
-
 def convert_trace_dict_to_evaluation_row(
     trace: Dict[str, Any], include_tool_calls: bool = True, span_name: Optional[str] = None
 ) -> Optional[EvaluationRow]:
@@ -135,19 +96,6 @@ def convert_trace_dict_to_evaluation_row(
                 ):
                     break  # Break early if we've found all the metadata we need
 
-        observations = trace.get("observations") or []
-        # We can only extract when stored in OTEL format.
-        otel_attributes = extract_otel_attributes(observations)
-        if otel_attributes:
-            # Find choices from any provider (llm.*.choices pattern)
-            choices = None
-            for key, value in otel_attributes.items():
-                if key.endswith(".choices") and isinstance(value, list):
-                    choices = value
-                    break
-            if choices and len(choices) > 0:
-                execution_metadata.finish_reason = choices[0].get("finish_reason")
-
         return EvaluationRow(
             messages=messages,
             tools=tools,
@@ -212,7 +160,7 @@ def extract_messages_from_trace_dict(
         # Fallback: use the last GENERATION observation which typically contains full chat history
         if not messages:
             try:
-                all_observations = trace.get("observations") or []
+                all_observations = trace.get("observations", [])
                 gens = [obs for obs in all_observations if obs.get("type") == "GENERATION"]
                 if gens:
                     gens.sort(key=lambda x: x.get("start_time", ""))
@@ -238,7 +186,7 @@ def get_final_generation_in_span_dict(trace: Dict[str, Any], span_name: str) ->
         The final generation dictionary, or None if not found
     """
     # Get all observations from the trace
-    all_observations = trace.get("observations") or []
+    all_observations = trace.get("observations", [])
 
     # Find a span with the given name that has generation children
     parent_span = None
 
@@ -1,25 +1,23 @@
-# Metadata Extraction Gateway - Uses LiteLLM SDK directly with Langfuse OTEL
+# Metadata Extraction Gateway - Sits in front of LiteLLM
 FROM python:3.11-slim
 
 WORKDIR /app
 
 # Prevent Python from buffering stdout/stderr
 ENV PYTHONUNBUFFERED=1
 
-# Copy the entire package for local install (context is repo root)
-COPY pyproject.toml /app/pyproject.toml
-COPY eval_protocol /app/eval_protocol
-COPY README.md /app/README.md
+# Copy requirements file
+COPY ./requirements.txt /app/requirements.txt
 
-# Install from local source with proxy extras
-RUN pip install --no-cache-dir ".[proxy]"
+# Install dependencies
+RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy the proxy package (local overrides for main.py, auth.py, etc.)
-COPY eval_protocol/proxy/proxy_core /app/proxy_core
+# Copy the proxy package
+COPY ./proxy_core /app/proxy_core
 
 # Expose port
 EXPOSE 4000
 
 # Run the gateway as a module
-# LANGFUSE_HOST and REDIS_HOST will be set by environment (docker-compose or Cloud Run)
+# LITELLM_URL will be set by environment (docker-compose or Cloud Run)
 CMD ["python", "-m", "proxy_core.main"]
@@ -59,9 +59,9 @@ This enables distributed evaluation systems to track which LLM completions belon
    - Stores insertion IDs per rollout for completeness checking
    - Uses Redis Sets: `rollout_id -> {insertion_id_1, insertion_id_2, ...}`
 
-#### 3. **LiteLLM SDK (Direct)**
-   - Uses LiteLLM SDK directly for LLM calls (no separate proxy server needed)
-   - Integrated with Langfuse via `langfuse_otel` OpenTelemetry callback
+#### 3. **LiteLLM Backend**
+   - Standard LiteLLM proxy for routing to LLM providers
+   - Configured with Langfuse callbacks for automatic tracing
 
 ## Key Features
 
@@ -244,11 +244,12 @@ Forwards any other request to LiteLLM backend with API key injection.
 
 | Variable | Required | Default | Description |
 |----------|----------|---------|-------------|
+| `LITELLM_URL` | Yes | - | URL of LiteLLM backend |
 | `REDIS_HOST` | Yes | - | Redis hostname |
 | `REDIS_PORT` | No | 6379 | Redis port |
 | `REDIS_PASSWORD` | No | - | Redis password |
 | `SECRETS_PATH` | No | `proxy_core/secrets.yaml` | Path to secrets file (YAML) |
-| `LANGFUSE_HOST` | No | `https://us.cloud.langfuse.com` | Langfuse OTEL host for tracing |
+| `LANGFUSE_HOST` | No | `https://cloud.langfuse.com` | Langfuse base URL |
 | `REQUEST_TIMEOUT` | No | 300.0 | Request timeout (LLM calls) in seconds |
 | `LOG_LEVEL` | No | INFO | Logging level |
 | `PORT` | No | 4000 | Gateway port |
@@ -271,26 +272,25 @@ default_project_id: project-1
 
 ### LiteLLM Configuration
 
-The `config_no_cache.yaml` configures LiteLLM (only needed if running a standalone LiteLLM proxy):
+The `config_no_cache.yaml` configures LiteLLM:
 ```yaml
 model_list:
   - model_name: "*"
     litellm_params:
       model: "*"
 litellm_settings:
-  callbacks: ["langfuse_otel"]
+  success_callback: ["langfuse"]
+  failure_callback: ["langfuse"]
   drop_params: True
 general_settings:
   allow_client_side_credentials: true
 ```
 
 Key settings:
 - **Wildcard model support**: Route any model to any provider
-- **Langfuse OTEL**: OpenTelemetry-based tracing via `langfuse_otel` callback
+- **Langfuse callbacks**: Automatic tracing on success/failure
 - **Client-side credentials**: Accept API keys from request body
 
-**Note:** The proxy now uses the LiteLLM SDK directly with `langfuse_otel` integration, so a separate LiteLLM proxy server is no longer required.
-
 ## Security Considerations
 
 ### Authentication
 
@@ -3,7 +3,8 @@ model_list:
     litellm_params:
       model: "*"
 litellm_settings:
-  callbacks: ["langfuse_otel"]
+  success_callback: ["langfuse"]
+  failure_callback: ["langfuse"]
   drop_params: True
 general_settings:
   allow_client_side_credentials: true
@@ -7,19 +7,41 @@ services:
     ports:
       - "6379:6379"  # Expose for debugging if needed
     networks:
-      - proxy-network
+      - litellm-network
     restart: unless-stopped
     command: redis-server --appendonly yes
     volumes:
       - redis-data:/data
 
-  # Metadata Gateway - Handles LLM calls directly via LiteLLM SDK with Langfuse OTEL
+  # LiteLLM Backend - Handles actual LLM proxying
+  litellm-backend:
+    image: litellm/litellm:v1.77.3-stable
+    platform: linux/amd64
+    container_name: litellm-backend
+    command: ["--config", "/app/config.yaml", "--port", "4000", "--host", "0.0.0.0"]
+    # To use other model providers (OpenAI, Anthropic, etc.), set their API keys in the .env file.
+    env_file:
+      - .env  # Load API keys from .env file
+    environment:
+      - LANGFUSE_PUBLIC_KEY=dummy  # Dummy public and secret keys so the Langfuse instance initializes in LiteLLM; the real keys are then sent via the proxy
+      - LANGFUSE_SECRET_KEY=dummy
+    volumes:
+      - ./config_no_cache.yaml:/app/config.yaml:ro
+    ports:
+      - "4001:4000"  # Expose on 4001 for direct access if needed
+    networks:
+      - litellm-network
+    restart: unless-stopped
+
+  # Metadata Gateway - Public-facing service that extracts metadata from URLs
   metadata-gateway:
     build:
-      context: ../..
-      dockerfile: eval_protocol/proxy/Dockerfile.gateway
+      context: .
+      dockerfile: Dockerfile.gateway
     container_name: metadata-gateway
     environment:
+      # Point to the LiteLLM backend service
+      - LITELLM_URL=http://litellm-backend:4000
       - PORT=4000
       # Redis configuration for assistant message counting
       - REDIS_HOST=redis
@@ -34,13 +56,14 @@ services:
     ports:
       - "4000:4000"  # Main public-facing port
     networks:
-      - proxy-network
+      - litellm-network
     depends_on:
+      - litellm-backend
       - redis
     restart: unless-stopped
 
 networks:
-  proxy-network:
+  litellm-network:
     driver: bridge
 
 volumes:
 
@@ -15,7 +15,7 @@
 
 from .models import ProxyConfig, LangfuseTracesResponse, TracesParams, ChatParams, ChatRequestHook, TracesRequestHook
 from .auth import AuthProvider, NoAuthProvider
-from .litellm import handle_chat_completion
+from .litellm import handle_chat_completion, proxy_to_litellm
 from .langfuse import fetch_langfuse_traces, pointwise_fetch_langfuse_trace
 
 # Configure logging before any other imports (so all modules inherit this config)
@@ -35,6 +35,10 @@ def build_proxy_config(
     preprocess_traces_request: Optional[TracesRequestHook] = None,
 ) -> ProxyConfig:
     """Load environment and secrets, and build ProxyConfig"""
+    # Env
+    litellm_url = os.getenv("LITELLM_URL")
+    if not litellm_url:
+        raise ValueError("LITELLM_URL environment variable must be set")
     request_timeout = float(os.getenv("REQUEST_TIMEOUT", "300.0"))
     langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
 
@@ -62,6 +66,7 @@ def build_proxy_config(
         raise ValueError(f"Invalid format in secrets file {secrets_path.name}: {e}")
 
     return ProxyConfig(
+        litellm_url=litellm_url,
         request_timeout=request_timeout,
         langfuse_host=langfuse_host,
         langfuse_keys=langfuse_keys,
@@ -108,16 +113,6 @@ async def lifespan(app: FastAPI):
         app.state.config = build_proxy_config(preprocess_chat_request, preprocess_traces_request)
         app.state.redis = init_redis()
 
-        config = app.state.config
-        default_keys = config.langfuse_keys[config.default_project_id]
-        os.environ["LANGFUSE_PUBLIC_KEY"] = default_keys["public_key"]
-        os.environ["LANGFUSE_SECRET_KEY"] = default_keys["secret_key"]
-        os.environ.setdefault("LANGFUSE_HOST", config.langfuse_host)
-
-        import litellm
-
-        litellm.callbacks = ["langfuse_otel"]
-
         try:
             yield
         finally:
@@ -302,4 +297,13 @@ async def pointwise_get_langfuse_trace(
     async def health():
         return {"status": "healthy", "service": "metadata-proxy"}
 
+    # Catch-all
+    @app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH"])
+    async def catch_all_proxy(
+        path: str,
+        request: Request,
+        config: ProxyConfig = Depends(get_config),
+    ):
+        return await proxy_to_litellm(config, path, request)
+
     return app
@@ -50,7 +50,6 @@ def _serialize_trace_to_dict(trace_full: Any) -> Dict[str, Any]:
                 "input": getattr(obs, "input", None),
                 "output": getattr(obs, "output", None),
                 "parent_observation_id": getattr(obs, "parent_observation_id", None),
-                "metadata": getattr(obs, "metadata", None),
             }
             for obs in getattr(trace_full, "observations", [])
         ]
Original file line number	Diff line number	Diff line change
`@@ -50,7 +50,6 @@ def _serialize_trace_to_dict(trace_full: Any) -> Dict[str, Any]:`
`50`	`50`	`"input": getattr(obs, "input", None),`
`51`	`51`	`"output": getattr(obs, "output", None),`
`52`	`52`	`"parent_observation_id": getattr(obs, "parent_observation_id", None),`
`53`		`- "metadata": getattr(obs, "metadata", None),`
`54`	`53`	`}`
`55`	`54`	`for obs in getattr(trace_full, "observations", [])`
`56`	`55`	`]`