fife/fastapi_client.py at main · gtfintechlab/fife · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
"""FastAPI client for OpenAI-compatible endpoints.

This module provides a client for interacting with OpenAI-compatible APIs
through FastAPI endpoints, specifically designed for Cirrascale AI2 models.
"""

from __future__ import annotations

import asyncio
import logging
import os
import time
from typing import Any, ClassVar, Literal, TypedDict

import httpx
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)


# Type definitions for better type safety
class Message(TypedDict):
    """A single chat message as sent in the ``messages`` list of a request.

    Both keys are required: the TypedDict uses the default ``total=True``.
    """

    # Author of the message; limited to the three literal roles below.
    role: Literal["system", "user", "assistant"]
    # Text of the message.
    content: str


class ChatCompletionResponse(TypedDict, total=False):
    """Shape of the JSON body returned by the chat completions endpoint.

    Declared with ``total=False``, so every key is optional for the type
    checker; code reading a response must tolerate missing keys.
    """

    id: str
    object: str
    created: int
    model: str
    # One entry per generated completion; this module reads
    # ``choices[0]["message"]["content"]`` and ``choices[0]["finish_reason"]``.
    choices: list[dict[str, Any]]
    usage: dict[str, int]  # Token usage statistics; may be absent
    system_fingerprint: str  # May be absent


class FastAPIError(Exception):
    """Root of the exception hierarchy raised by this client."""

    def __init__(self, message: str, context: dict[str, Any] | None = None):
        """Build the error and attach optional debugging details.

        Args:
            message: Human-readable description, forwarded to ``Exception``.
            context: Extra key/value details about the failure. ``None`` or
                an empty mapping is stored as a fresh empty dict.
        """
        super().__init__(message)
        if context:
            self.context = context
        else:
            self.context = {}


class RateLimitError(FastAPIError):
    """Signals that a request could not proceed because a rate limit was hit."""


class AuthenticationError(FastAPIError):
    """Signals that the API key was rejected or not supplied."""


class ModelNotFoundError(FastAPIError):
    """Signals that the requested model is not served by the provider."""


class RateLimiter:
    """Token bucket rate limiter for FastAPI endpoints.

    Implements a token bucket algorithm to ensure we stay within
    API rate limits (100,000 requests/minute for Cirrascale).
    """

    def __init__(self, max_requests: int = 1500, window: int = 1):
        """Initialize the rate limiter.

        Args:
            max_requests: Maximum requests per second (default: 1500)
            window: Time window in seconds (default: 1)
        """
        self.max_requests = max_requests
        self.window = window
        self.tokens = max_requests
        self.last_update = time.time()
        self._lock = asyncio.Lock()

    async def acquire(self) -> bool:
        """Acquire a token for request execution.

        Returns:
            True if a token was acquired, False if rate limit exceeded
        """
        async with self._lock:
            now = time.time()
            elapsed = now - self.last_update

            # Replenish tokens based on elapsed time
            tokens_to_add = elapsed * self.max_requests / self.window
            self.tokens = min(self.max_requests, self.tokens + tokens_to_add)
            self.last_update = now

            if self.tokens >= 1:
                self.tokens -= 1
                return True
            return False

    async def wait_for_token(self, max_wait: float = 1.0) -> None:
        """Wait until a token is available with adaptive backoff.

        Args:
            max_wait: Maximum time to wait in seconds
        """
        start_time = time.time()
        sleep_time = 0.001  # Start with 1ms
        while time.time() - start_time < max_wait:
            if await self.acquire():
                return
            # Adaptive backoff: increase sleep time up to 50ms
            await asyncio.sleep(min(sleep_time, 0.05))
            sleep_time = min(sleep_time * 1.5, 0.05)  # Exponential backoff capped at 50ms
        raise RateLimitError("Rate limit exceeded, please retry later")


class FastAPIClient:
    """OpenAI-compatible client for FastAPI endpoints.

    Instances obtained through ``get_instance`` are shared: one instance is
    kept per ``(endpoint, provider)`` pair so that its HTTP client, and the
    connections it holds, can be reused. Constructing the class directly
    does not consult or populate that cache.
    """

    # Class-level cache of shared instances, keyed by
    # (endpoint with trailing "/" removed, provider name).
    _instances: ClassVar[dict[tuple[str, str], FastAPIClient]] = {}

    @classmethod
    def get_instance(
        cls, endpoint: str, api_key: str, provider: str = "cirrascale"
    ) -> FastAPIClient:
        """Return the shared client for an endpoint/provider pair.

        The cache key is the endpoint (trailing slashes removed) together
        with the provider name. ``api_key`` is only used when a new instance
        has to be built; an already cached instance is returned unchanged
        even if a different key is passed.

        Args:
            endpoint: Base API endpoint URL
            api_key: API key for authentication
            provider: Provider name for logging

        Returns:
            The cached FastAPIClient, created on first request
        """
        key = (endpoint.rstrip("/"), provider)
        shared = cls._instances.get(key)
        if shared is None:
            shared = cls(endpoint, api_key, provider)
            cls._instances[key] = shared
        return shared

    def __init__(self, endpoint: str, api_key: str, provider: str = "cirrascale"):
        """Set up client state; no network resources are created here.

        The underlying ``httpx.AsyncClient`` is built lazily by
        ``_ensure_client`` from the configuration stored on the instance.

        Args:
            endpoint: Base API endpoint URL (trailing slashes are removed)
            api_key: API key for authentication
            provider: Provider name for logging
        """
        self.logger = logging.getLogger(__name__)

        self.provider = provider
        self.api_key = api_key
        self.endpoint = endpoint.rstrip("/")
        self.rate_limiter = RateLimiter()

        # HTTP client and the event loop it was created under; both are
        # filled in on first use.
        self._client = None
        self._loop = None

        # Per-phase timeouts, in seconds.
        timeout = httpx.Timeout(connect=10.0, read=60.0, write=10.0, pool=5.0)
        # Connection pool sizing and idle keep-alive lifetime.
        limits = httpx.Limits(
            max_keepalive_connections=10,
            max_connections=20,
            keepalive_expiry=30.0,
        )
        # Kept as a dict so the HTTP client can be rebuilt with the same settings.
        self._client_config = {
            "timeout": timeout,
            "limits": limits,
            "http2": True,
        }

    @property
    def client(self) -> httpx.AsyncClient | None:
        """Return the current HTTP client, kept for backward compatibility.

        This exposes the internal client as-is: it is ``None`` until
        ``_ensure_client()`` has created one, and again after ``close()``.
        In async code call ``_ensure_client()`` instead, which also
        recreates the client when the running event loop has changed.
        """
        return self._client

    async def __aenter__(self) -> FastAPIClient:
        """Enter the async context manager and return this client unchanged."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit the async context manager by calling ``close()``.

        The exception arguments are not inspected, and the method returns
        ``None``, so an exception raised inside the ``async with`` body
        propagates.
        """
        await self.close()

    async def close(self) -> None:
        """Close the HTTP client and clean up resources.

        The instance's references to the HTTP client and event loop are
        cleared before the client is closed, and the instance is removed
        from the shared instance cache in a ``finally`` block. This keeps
        the object in a consistent state even if closing the HTTP client
        raises; that exception still propagates to the caller.

        Calling ``close()`` when no HTTP client exists only performs the
        cache cleanup.
        """
        client = self._client
        self._client = None
        self._loop = None
        try:
            if client is not None:
                await client.aclose()
        finally:
            # Only evict the cache entry if it really is this instance; a
            # directly constructed client may share the key with a cached one.
            cache_key = (self.endpoint, self.provider)
            if self._instances.get(cache_key) is self:
                del self._instances[cache_key]

    async def _ensure_client(self) -> httpx.AsyncClient:
        """Ensure we have a valid client for the current event loop.

        An ``httpx.AsyncClient`` is created on first use and recreated
        whenever the running event loop differs from the one the existing
        client was created under. This prevents "Event loop is closed"
        errors when the client is reused across different event loops
        (for example across successive ``asyncio.run`` calls).

        Returns:
            Valid httpx.AsyncClient for the current event loop
        """
        try:
            current_loop = asyncio.get_running_loop()
        except RuntimeError:
            # Defensive: no running loop could be determined.
            current_loop = None

        # Fast path: the existing client belongs to the current loop.
        if self._client is not None and self._loop is current_loop:
            return self._client

        replacing = self._client is not None
        if replacing:
            try:
                # Best effort: the loop the old client was bound to may
                # already be closed, in which case closing it can fail.
                await self._client.aclose()
            except Exception as exc:
                self.logger.debug(
                    "Ignoring error while closing stale HTTP client for %s: %s",
                    self.provider,
                    exc,
                )

        self._client = httpx.AsyncClient(**self._client_config)
        self._loop = current_loop
        reason = "loop change detected" if replacing else "no existing client"
        self.logger.debug("Created new HTTP client for %s (%s)", self.provider, reason)

        return self._client

    class _TransientRequestError(FastAPIError):
        """Failure worth retrying: a transport error or a 5xx response.

        It subclasses ``FastAPIError`` so existing ``except FastAPIError``
        handlers keep working; it exists only so the retry policy below can
        tell transient failures apart from permanent ones.
        """

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10),
        retry=retry_if_exception_type(
            (httpx.RequestError, RateLimitError, _TransientRequestError)
        ),
        # Surface the last real exception instead of tenacity.RetryError so
        # callers see the FastAPIError hierarchy documented below.
        reraise=True,
    )
    async def _make_request_with_retry(self, payload: dict[str, Any]) -> ChatCompletionResponse:
        """Make HTTP request with automatic retry logic.

        Up to three attempts are made. Rate limiting (local or HTTP 429),
        transport errors and 5xx responses are retried with exponential
        backoff; authentication and model-not-found errors are raised
        immediately.

        Args:
            payload: Request payload for the API

        Returns:
            API response as dictionary

        Raises:
            AuthenticationError: On HTTP 401.
            ModelNotFoundError: On HTTP 404.
            RateLimitError: On HTTP 429 or when no local rate-limit token
                became available, after retries are exhausted.
            FastAPIError: On transport errors or 5xx responses, after
                retries are exhausted.
            httpx.HTTPStatusError: On any other error status, raised by
                ``response.raise_for_status()``.
        """
        # Wait for rate limit token
        await self.rate_limiter.wait_for_token()

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        # Ensure we have a valid client for current event loop
        client = await self._ensure_client()

        try:
            response = await client.post(
                f"{self.endpoint}/api/chat/completions",
                headers=headers,
                json=payload,
            )
        except httpx.RequestError as e:
            self.logger.error(f"Request error for {self.provider}: {e}")
            # Raised as the retryable subtype so the decorator retries it.
            raise self._TransientRequestError(
                f"Request failed: {str(e)}",
                {
                    "provider": self.provider,
                    "endpoint": self.endpoint,
                    "error_type": type(e).__name__,
                },
            ) from e

        status = response.status_code
        if status == 401:
            raise AuthenticationError(
                "Invalid API key or unauthorized access",
                {"endpoint": self.endpoint, "provider": self.provider},
            )
        if status == 404:
            model = payload.get("model", "unknown")
            raise ModelNotFoundError(
                f"Model '{model}' not found on {self.provider}",
                {"model": model, "available_models": "See provider documentation"},
            )
        if status == 429:
            # Rate limit exceeded - triggers retry
            retry_after = response.headers.get("Retry-After", "unknown")
            raise RateLimitError(
                f"API rate limit exceeded (retry after: {retry_after}s)",
                {"retry_after": retry_after},
            )
        if status >= 500:
            # Server error - triggers retry
            raise self._TransientRequestError(
                f"Server error: {status}",
                {"status_code": status, "provider": self.provider},
            )

        response.raise_for_status()
        return response.json()

    async def generate_text(
        self,
        messages: list[Message],
        model: str,
        *,
        temperature: float = 0.2,
        top_p: float = 0.9,
        max_tokens: int = 1200,
        top_k: int | None = None,
        repetition_penalty: float | None = None,
        stream: bool = False,
    ) -> str:
        """Generate text using OpenAI-compatible chat completions endpoint.

        Args:
            messages: List of message dictionaries with 'role' and 'content'
            model: Model identifier (e.g., 'cirrascale/allenai/OLMo-2-1124-7B-Instruct').
                When the first path segment equals this client's provider,
                only the last segment is sent to the API.
            temperature: Sampling temperature (0.0 = deterministic, 1.0 = creative)
            top_p: Nucleus sampling parameter (0.0 to 1.0, lower = more focused)
            max_tokens: Maximum tokens to generate
            top_k: Top-k sampling parameter (limits vocabulary to k most likely tokens)
            repetition_penalty: Repetition penalty (>1.0 reduces repetition)
            stream: Whether to stream the response (not yet implemented)

        Returns:
            Generated text response

        Raises:
            ValueError: If an argument fails validation
            FastAPIError: Various API errors, or a malformed response
            NotImplementedError: If streaming is requested

        Example:
            >>> async with FastAPIClient(endpoint, api_key) as client:
            ...     response = await client.generate_text(
            ...         messages=[{"role": "user", "content": "Hello"}],
            ...         model="cirrascale/allenai/OLMo-2-1124-7B-Instruct",
            ...         temperature=0.7
            ...     )
        """
        # Validate inputs
        if not messages:
            raise ValueError("Messages list cannot be empty")
        if not model:
            raise ValueError("Model identifier cannot be empty")
        if not 0 <= temperature <= 2:
            raise ValueError(f"Temperature must be between 0 and 2, got {temperature}")
        if not 0 <= top_p <= 1:
            raise ValueError(f"Top_p must be between 0 and 1, got {top_p}")
        if max_tokens <= 0:
            raise ValueError(f"Max_tokens must be positive, got {max_tokens}")

        if stream:
            raise NotImplementedError("Streaming not yet implemented")

        # Strip the provider prefix for the API call, e.g.
        # "cirrascale/allenai/OLMo-2-1124-7B-Instruct" -> "OLMo-2-1124-7B-Instruct".
        api_model = model
        parts = model.split("/")
        if len(parts) >= 2 and parts[0] == self.provider:
            api_model = parts[-1]

        # Build request payload following OpenAI API format
        payload: dict[str, Any] = {
            "model": api_model,
            "messages": messages,
            "temperature": temperature,
            "top_p": top_p,
            "max_tokens": max_tokens,
        }

        # Add optional parameters if provided
        if top_k is not None:
            payload["top_k"] = top_k
        if repetition_penalty is not None:
            payload["repetition_penalty"] = repetition_penalty

        self.logger.debug("Sending request to %s for model %s", self.provider, api_model)
        # Serialising the whole payload is only worth it when it will be logged.
        if self.logger.isEnabledFor(logging.DEBUG):
            import json

            self.logger.debug("Full payload: %s", json.dumps(payload, indent=2))

        try:
            response = await self._make_request_with_retry(payload)

            # Extract text from OpenAI-compatible response format.
            # A missing, null or empty "choices" is reported uniformly.
            choices = response.get("choices")
            if not choices:
                raise FastAPIError(
                    "Invalid response format: missing choices",
                    {"response_keys": list(response.keys())},
                )

            choice = choices[0]
            if "message" in choice and "content" in choice["message"]:
                return choice["message"]["content"]
            raise FastAPIError(
                "Invalid response format: missing message content",
                {"response_structure": str(choice)[:200]},
            )

        except Exception as e:
            self.logger.error(
                f"Error generating text with {self.provider}: {e}",
                extra={"model": model, "messages_count": len(messages)},
            )
            raise

    async def generate_text_with_metadata(
        self,
        messages: list[Message],
        model: str,
        *,
        temperature: float = 0.2,
        top_p: float = 0.9,
        max_tokens: int = 1200,
        top_k: int | None = None,
        repetition_penalty: float | None = None,
        stream: bool = False,
    ) -> dict[str, Any]:
        """Generate text and return full metadata from the API response.

        This method returns the complete response data including usage statistics,
        tokens, model information, and other metadata needed for research tracking.

        Args:
            messages: List of message dictionaries with 'role' and 'content'
            model: Model identifier. When the first path segment equals this
                client's provider, only the last segment is sent to the API.
            temperature: Sampling temperature
            top_p: Nucleus sampling parameter
            max_tokens: Maximum tokens to generate
            top_k: Top-k sampling parameter
            repetition_penalty: Repetition penalty
            stream: Whether to stream the response (not yet implemented)

        Returns:
            Dictionary containing:
                - content: The generated text
                - prompt_tokens: Number of prompt tokens (0 if not reported)
                - completion_tokens: Number of completion tokens (0 if not reported)
                - total_tokens: Total tokens used (sum of the two above if
                  not reported)
                - model: Model identifier reported by the API, falling back
                  to the one sent
                - created, id, system_fingerprint: Copied from the response,
                  None when absent
                - finish_reason: From the first choice, None when absent
                - raw_response: Complete API response

        Raises:
            ValueError: If ``messages`` or ``model`` is empty
            FastAPIError: Various API errors, or a malformed response
            NotImplementedError: If streaming is requested
        """
        # Validate inputs
        if not messages:
            raise ValueError("Messages list cannot be empty")
        if not model:
            raise ValueError("Model identifier cannot be empty")
        if stream:
            raise NotImplementedError("Streaming not yet implemented")

        # Strip provider prefix if present for API call
        api_model = model
        parts = model.split("/")
        if len(parts) >= 2 and parts[0] == self.provider:
            api_model = parts[-1]

        # Build request payload
        payload: dict[str, Any] = {
            "model": api_model,
            "messages": messages,
            "temperature": temperature,
            "top_p": top_p,
            "max_tokens": max_tokens,
        }

        if top_k is not None:
            payload["top_k"] = top_k
        if repetition_penalty is not None:
            payload["repetition_penalty"] = repetition_penalty

        self.logger.debug("Sending request to %s for model %s", self.provider, api_model)

        try:
            response = await self._make_request_with_retry(payload)

            # A missing, null or empty "choices" is reported uniformly.
            choices = response.get("choices")
            if not choices:
                raise FastAPIError(
                    "Invalid response format: missing choices",
                    {"response_keys": list(response.keys())},
                )

            choice = choices[0]
            if not ("message" in choice and "content" in choice["message"]):
                raise FastAPIError(
                    "Invalid response format: missing message content",
                    {"response_structure": str(choice)[:200]},
                )
            content = choice["message"]["content"]

            # Usage is optional; "or {}" also covers an explicit null value.
            usage = response.get("usage") or {}
            prompt_tokens = usage.get("prompt_tokens", 0)
            completion_tokens = usage.get("completion_tokens", 0)
            total_tokens = usage.get("total_tokens", prompt_tokens + completion_tokens)

            # Build metadata response
            return {
                "content": content,
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens,
                "model": response.get("model", api_model),
                "created": response.get("created"),
                "id": response.get("id"),
                "system_fingerprint": response.get("system_fingerprint"),
                "finish_reason": choice.get("finish_reason"),
                "raw_response": response,
            }

        except Exception as e:
            self.logger.error(
                f"Error generating text with {self.provider}: {e}",
                extra={"model": model, "messages_count": len(messages)},
            )
            raise


# Module-level cache of the clients handed out by create_client_from_env,
# keyed by provider name, so repeated calls reuse the same client.
_client_cache: dict[str, FastAPIClient] = {}


def create_client_from_env(provider: str = "cirrascale") -> FastAPIClient:
    """Return the client for ``provider``, building it from the environment if needed.

    A client already stored in the module-level cache is returned as-is.
    Otherwise the provider's settings are looked up in
    ``provider_routing.FASTAPI_PROVIDERS``, the API key is read from the
    environment variable named by the ``api_key_env`` setting, and the
    resulting client is cached before being returned.

    Args:
        provider: Provider name to configure

    Returns:
        Configured FastAPIClient instance (may be cached)

    Raises:
        ValueError: If the provider is not listed in ``FASTAPI_PROVIDERS``
        OSError: If the API key environment variable is unset or empty
    """
    cached = _client_cache.get(provider)
    if cached is not None:
        return cached

    # Imported lazily, only when a client actually has to be built.
    from provider_routing import FASTAPI_PROVIDERS

    if provider not in FASTAPI_PROVIDERS:
        raise ValueError(f"Unknown provider: {provider}")

    settings = FASTAPI_PROVIDERS[provider]
    key_variable = settings["api_key_env"]
    api_key = os.getenv(key_variable)
    if not api_key:
        raise OSError(
            f"Missing required API key: {key_variable}. "
            f"Please set it in your .env file or environment."
        )

    # Build (or fetch the shared) client and remember it for later calls.
    _client_cache[provider] = FastAPIClient.get_instance(
        endpoint=settings["endpoint"],
        api_key=api_key,
        provider=provider,
    )
    return _client_cache[provider]


# Synchronous wrapper for compatibility with existing code
def generate_text_sync(
    client: FastAPIClient,
    messages: list[Message],
    model: str,
    **kwargs: Any,
) -> str:
    """Run ``client.generate_text`` to completion from synchronous code.

    The coroutine is executed with ``asyncio.run``, which starts a new event
    loop for the call; it therefore cannot be used from a thread that is
    already running an event loop.

    Args:
        client: FastAPIClient instance
        messages: List of message dictionaries
        model: Model identifier
        **kwargs: Additional keyword arguments forwarded to generate_text

    Returns:
        Generated text response
    """
    pending = client.generate_text(messages, model, **kwargs)
    return asyncio.run(pending)