fix logging sink

benjibc · benjibc · commit 8e78dba09abb · 2025-10-18T15:04:48.000-07:00
diff --git a/eval_protocol/adapters/fireworks_tracing.py b/eval_protocol/adapters/fireworks_tracing.py
@@ -273,16 +273,30 @@ def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -
         if not tags:
             raise ValueError("At least one tag is required to fetch logs")
 
-        url = f"{self.base_url}/logs"
         headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
         params: Dict[str, Any] = {"tags": tags, "limit": limit, "hours_back": hours_back, "program": "eval_protocol"}
 
-        try:
-            response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
-            response.raise_for_status()
-            data = response.json() or {}
-        except requests.exceptions.RequestException as e:
-            logger.error("Failed to fetch logs from Fireworks /logs: %s", str(e))
+        # Try /logs first, fall back to /v1/logs if not found
+        urls_to_try = [f"{self.base_url}/logs", f"{self.base_url}/v1/logs"]
+        data: Dict[str, Any] = {}
+        last_error: Optional[str] = None
+        for url in urls_to_try:
+            try:
+                response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
+                if response.status_code == 404:
+                    # Try next variant
+                    last_error = f"404 for {url}"
+                    continue
+                response.raise_for_status()
+                data = response.json() or {}
+                break
+            except requests.exceptions.RequestException as e:
+                last_error = str(e)
+                continue
+        else:
+            # All attempts failed
+            if last_error:
+                logger.error("Failed to fetch logs from Fireworks (tried %s): %s", urls_to_try, last_error)
             return []
 
         entries: List[Dict[str, Any]] = data.get("entries", []) or []
diff --git a/eval_protocol/cli_commands/logs.py b/eval_protocol/cli_commands/logs.py
@@ -32,7 +32,13 @@ def logs_command(args):
 
     # Setup backend configs
     elasticsearch_config = None
-    fireworks_base_url = os.environ.get("FW_TRACING_GATEWAY_BASE_URL") or "https://tracing.fireworks.ai"
+    # Prefer explicit FW_TRACING_GATEWAY_BASE_URL, then GATEWAY_URL from env (remote validation),
+    # finally default to public tracing.fireworks.ai
+    fireworks_base_url = (
+        os.environ.get("FW_TRACING_GATEWAY_BASE_URL")
+        or os.environ.get("GATEWAY_URL")
+        or "https://tracing.fireworks.ai"
+    )
     try:
         if not use_fireworks:
             if getattr(args, "use_env_elasticsearch_config", False):
diff --git a/eval_protocol/log_utils/fireworks_tracing_http_handler.py b/eval_protocol/log_utils/fireworks_tracing_http_handler.py
@@ -34,9 +34,42 @@ def emit(self, record: logging.LogRecord) -> None:
             if not rollout_id:
                 return
             payload = self._build_payload(record, rollout_id)
-            url = f"{self.gateway_base_url.rstrip('/')}/logs"
+            base = self.gateway_base_url.rstrip("/")
+            url = f"{base}/logs"
+            # Optional debug prints to aid local diagnostics
+            if os.environ.get("EP_DEBUG") == "true":
+                try:
+                    tags_val = payload.get("tags")
+                    tags_len = len(tags_val) if isinstance(tags_val, list) else 0
+                    msg_val = payload.get("message")
+                    msg_preview = msg_val[:80] if isinstance(msg_val, str) else msg_val
+                    print(f"[FW_LOG] POST {url} rollout_id={rollout_id} tags={tags_len} msg={msg_preview}")
+                except Exception:
+                    pass
             with self._lock:
-                self._session.post(url, json=payload, timeout=5)
+                resp = self._session.post(url, json=payload, timeout=5)
+            if os.environ.get("EP_DEBUG") == "true":
+                try:
+                    print(f"[FW_LOG] resp={resp.status_code}")
+                except Exception:
+                    pass
+            # Fallback to /v1/logs if /logs is not found
+            if resp is not None and getattr(resp, "status_code", None) == 404:
+                alt = f"{base}/v1/logs"
+                if os.environ.get("EP_DEBUG") == "true":
+                    try:
+                        tags_val = payload.get("tags")
+                        tags_len = len(tags_val) if isinstance(tags_val, list) else 0
+                        print(f"[FW_LOG] RETRY POST {alt} rollout_id={rollout_id} tags={tags_len}")
+                    except Exception:
+                        pass
+                with self._lock:
+                    resp2 = self._session.post(alt, json=payload, timeout=5)
+                if os.environ.get("EP_DEBUG") == "true":
+                    try:
+                        print(f"[FW_LOG] retry resp={resp2.status_code}")
+                    except Exception:
+                        pass
         except Exception:
             # Avoid raising exceptions from logging
             self.handleError(record)
diff --git a/eval_protocol/log_utils/init.py b/eval_protocol/log_utils/init.py
@@ -41,7 +41,8 @@ def init_external_logging_from_env() -> None:
 
     # Fireworks tracing: prefer if FIREWORKS_API_KEY is present; default base URL if not provided
     fw_key = _get_env("FIREWORKS_API_KEY")
-    fw_url = _get_env("FW_TRACING_GATEWAY_BASE_URL") or "https://tracing.fireworks.ai"
+    # Allow remote validation gateway to act as tracing base when provided
+    fw_url = _get_env("FW_TRACING_GATEWAY_BASE_URL") or _get_env("GATEWAY_URL") or "https://tracing.fireworks.ai"
     if fw_key and "FireworksTracingHttpHandler" not in existing_handler_types:
         fw_handler = FireworksTracingHttpHandler(gateway_base_url=fw_url)
         fw_handler.setLevel(logging.INFO)
diff --git a/eval_protocol/proxy/proxy_core/redis_utils.py b/eval_protocol/proxy/proxy_core/redis_utils.py
@@ -3,7 +3,7 @@
 """
 
 import logging
-from typing import Set
+from typing import Set, cast
 import redis
 
 logger = logging.getLogger(__name__)
@@ -40,7 +40,16 @@ def get_insertion_ids(redis_client: redis.Redis, rollout_id: str) -> Set[str]:
         Set of insertion_id strings, empty set if none found or on error
     """
     try:
-        insertion_ids = redis_client.smembers(rollout_id)
+        raw = redis_client.smembers(rollout_id)
+        # Typing in redis stubs may be Awaitable[Set[Any]] | Set[Any]; at runtime this is a Set[bytes]
+        raw_ids = cast(Set[object], raw)
+        # Normalize to set[str]
+        insertion_ids: Set[str] = set()
+        for b in raw_ids:
+            try:
+                insertion_ids.add(b.decode("utf-8") if isinstance(b, (bytes, bytearray)) else cast(str, b))
+            except Exception:
+                continue
         logger.debug(f"Found {len(insertion_ids)} expected insertion_ids for rollout {rollout_id}")
         return insertion_ids
     except Exception as e:
diff --git a/scripts/validate_remote.py b/scripts/validate_remote.py
@@ -0,0 +1,136 @@
+import os
+import sys
+import time
+import requests
+
+
+def require_env(var_name: str) -> str:
+    """Return the value of env var ``var_name``; print an error and exit(1) if it is unset or empty."""
+    if found := os.getenv(var_name):
+        return found
+    print(f"Missing required env var: {var_name}", file=sys.stderr)
+    sys.exit(1)
+
+
+def require_logs_endpoints(base_url: str) -> None:
+    """Exit(1) unless ``{base_url}/openapi.json`` lists a path under ``/logs`` or ``/v1/logs``."""
+    try:
+        r = requests.get(f"{base_url.rstrip('/')}/openapi.json", timeout=30)  # tolerate a trailing "/"
+        if not r.ok:
+            print("OpenAPI schema unavailable", file=sys.stderr)
+            sys.exit(1)
+        paths = (r.json() or {}).get("paths") or {}
+        if not any(p.startswith(("/logs", "/v1/logs")) for p in paths):
+            print("/logs endpoints not present on deployment", file=sys.stderr)
+            sys.exit(1)
+    except Exception as e:  # SystemExit is not an Exception subclass, so the exits above still propagate
+        print(f"Failed to check OpenAPI: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+def post_chat_completion(base_url: str, api_key: str, rollout_id: str) -> None:
+    """POST one chat completion under ``base_url``; print "chat: ok" or exit(1) on a non-200 reply."""
+    stamp = int(time.time())
+    # The rollout/invocation/experiment/run/row identifiers travel as URL path segments.
+    endpoint = (
+        f"{base_url}/rollout_id/{rollout_id}/invocation_id/inv{stamp}/"
+        "experiment_id/remote-validate/run_id/run-1/row_id/row-1/chat/completions"
+    )
+    request_body = {
+        "model": "fireworks_ai/accounts/fireworks/models/llama-v3p1-8b-instruct",
+        "messages": [{"role": "user", "content": "Say 'ok' if you can read this."}],
+        "temperature": 0.1,
+    }
+    auth = {"Authorization": f"Bearer {api_key}"}
+    resp = requests.post(endpoint, headers=auth, json=request_body, timeout=60)
+    if resp.status_code == 200:
+        print("chat: ok")
+        return
+    # Only the first 500 characters of the error body are echoed.
+    print(f"Chat completion failed: {resp.status_code} {resp.text[:500]}", file=sys.stderr)
+    sys.exit(1)
+
+
+def wait_for_traces(base_url: str, api_key: str, rollout_id: str, max_attempts: int = 8) -> None:
+    """Poll ``{base_url}/traces`` until a trace tagged ``rollout_id:<rollout_id>`` is reported.
+
+    401 and 404 replies are retried; any other non-200 status, or ``max_attempts`` polls without a
+    trace, exits with status 1. Polls are spaced by exponential backoff capped at 10 seconds.
+    """
+    headers = {"Authorization": f"Bearer {api_key}"}
+    params = {"tags": [f"rollout_id:{rollout_id}"], "limit": 10, "hours_back": 6}
+    url = f"{base_url}/traces"
+    for attempt in range(1, max_attempts + 1):
+        r = requests.get(url, headers=headers, params=params, timeout=30)
+        if r.status_code == 200:
+            total = int((r.json() or {}).get("total_traces") or 0)
+            print(f"traces: ok total_traces={total}")
+            if total > 0:
+                return
+        elif r.status_code not in (401, 404):
+            print(f"Traces fetch failed: {r.status_code} {r.text[:500]}", file=sys.stderr)
+            sys.exit(1)
+        if attempt < max_attempts:  # no pointless sleep after the final poll
+            time.sleep(min(2 ** (attempt - 1), 10))
+    print("Traces not available after retries (indexing delay?)", file=sys.stderr)
+    sys.exit(1)
+
+
+def validate_logs_endpoints(base_url: str, rollout_id: str) -> None:
+    """Ingest one structured log entry and confirm it can be fetched back by its rollout tag.
+
+    ``/logs`` is tried first and ``/v1/logs`` is used if that returns 404, since
+    ``require_logs_endpoints`` accepts either. Exits with status 1 if ingest or retrieval fails.
+    """
+    require_logs_endpoints(base_url)
+    payload = {
+        "program": "eval_protocol",
+        "status": "completed",
+        "message": "Remote validation run finished",
+        "tags": [f"rollout_id:{rollout_id}", "experiment_id:remote", "run_id:test"],
+        "metadata": {"dataset": "AIME"},
+        "extras": {"num_examples": 3},
+    }
+    logs_url = f"{base_url}/logs"
+    r = requests.post(logs_url, json=payload, timeout=30)
+    if r.status_code == 404:
+        logs_url = f"{base_url}/v1/logs"
+        r = requests.post(logs_url, json=payload, timeout=30)
+    if r.status_code != 200:
+        print(f"logs ingest failed: {r.status_code} {r.text[:500]}", file=sys.stderr)
+        sys.exit(1)
+    print("logs ingest: ok")
+
+    # Retrieve from whichever endpoint accepted the ingest, backing off while the entry is indexed.
+    params = {"tags": [f"rollout_id:{rollout_id}"], "program": "eval_protocol", "hours_back": 1, "limit": 10}
+    max_attempts = 11
+    for attempt in range(1, max_attempts + 1):
+        rr = requests.get(logs_url, params=params, timeout=30)
+        if rr.status_code == 200:
+            total = int((rr.json() or {}).get("total_entries") or 0)
+            if total > 0:
+                print(f"logs fetch: ok total_entries={total}")
+                return
+        if attempt < max_attempts:  # skip the sleep once no further attempt follows
+            time.sleep(min(2 ** (attempt - 1), 10))
+    print("logs fetch: no entries found within retry window", file=sys.stderr)
+    sys.exit(1)
+
+
+def main():
+    """Run the remote validation: one chat completion, then a trace check and a log round-trip."""
+    # Strip a trailing "/" once: the helpers below build URLs as f"{base_url}/...".
+    base_url = require_env("GATEWAY_URL").rstrip("/")
+    api_key = require_env("FIREWORKS_API_KEY")
+    rollout_id = f"r{int(time.time())}"
+
+    print(f"Gateway: {base_url}")
+    print(f"Rollout: rollout_id:{rollout_id}")
+    post_chat_completion(base_url, api_key, rollout_id)
+    wait_for_traces(base_url, api_key, rollout_id)
+    validate_logs_endpoints(base_url, rollout_id)
+    print("remote validation: SUCCESS")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/verify_logging_locally.py b/scripts/verify_logging_locally.py
@@ -15,8 +15,30 @@ def _now_rollout_id() -> str:
     return f"verify-{int(time.time())}"
 
 
+def _detect_gateway_base_url() -> str:
+    """Return FW_TRACING_GATEWAY_BASE_URL if non-empty, else GATEWAY_URL, else the public tracing URL."""
+    return os.getenv("FW_TRACING_GATEWAY_BASE_URL") or os.getenv("GATEWAY_URL") or "https://tracing.fireworks.ai"
+
+
+def _detect_logs_endpoint(base_url: str) -> str:
+    """Return "/v1/logs" or "/logs" according to the paths listed in the gateway's OpenAPI schema."""
+    try:
+        import requests
+
+        resp = requests.get(f"{base_url.rstrip('/')}/openapi.json", timeout=5)
+        schema_paths = (resp.json() or {}).get("paths", {}) if resp.ok else {}
+        # "/v1/logs" takes precedence when both variants are advertised.
+        for candidate in ("/v1/logs", "/logs"):
+            if any(path.startswith(candidate) for path in schema_paths.keys()):
+                return candidate
+    except Exception:
+        # Best effort: any fetch or parse failure falls through to the default below.
+        pass
+    return "/logs"
+
+
 def verify_fireworks(rollout_id: str) -> int:
-    base_url = os.getenv("FW_TRACING_GATEWAY_BASE_URL") or "https://tracing.fireworks.ai"
+    base_url = _detect_gateway_base_url()
     api_key = os.getenv("FIREWORKS_API_KEY")
     if not api_key:
         print("FIREWORKS_API_KEY not set; cannot verify Fireworks")
@@ -26,6 +48,20 @@ def verify_fireworks(rollout_id: str) -> int:
     root = logging.getLogger()
     root.setLevel(logging.INFO)
     init_external_logging_from_env()
+    # Detect and use the correct logs endpoint
+    logs_ep = _detect_logs_endpoint(base_url)
+    # Print handler info for diagnostics
+    handlers = [type(h).__name__ for h in root.handlers]
+    print(
+        json.dumps(
+            {
+                "gateway_url": base_url,
+                "logs_endpoint": logs_ep,
+                "root_handlers": handlers,
+            }
+        )
+    )
+
     logger = logging.getLogger("ep.verify.fireworks")
     for i in range(2):
         logger.info(
@@ -47,12 +83,22 @@ def verify_fireworks(rollout_id: str) -> int:
         "limit": 50,
         "hours_back": 6,
     }
-    url = f"{base_url.rstrip('/')}/logs"
+    candidate_eps = [logs_ep, "/v1/logs" if logs_ep != "/v1/logs" else "/logs"]
     for _ in range(20):
         try:
-            r = requests.get(url, headers=headers, params=params, timeout=15)
-            r.raise_for_status()
-            data: Dict[str, Any] = r.json() or {}
+            data: Dict[str, Any] = {}
+            last_err: str | None = None
+            for ep in candidate_eps:
+                url = f"{base_url.rstrip('/')}{ep}"
+                r = requests.get(url, headers=headers, params=params, timeout=15)
+                if r.status_code == 404:
+                    last_err = f"404 for {ep}"
+                    continue
+                r.raise_for_status()
+                data = r.json() or {}
+                break
+            else:
+                raise Exception(last_err or "all endpoints failed")
             entries: List[Dict[str, Any]] = data.get("entries", []) or []
             matched = [e for e in entries if any(t == f"rollout_id:{rollout_id}" for t in e.get("tags", []))]
             if matched: