
Commit a4636dd

uzunenes and cursoragent committed

feat: provider_cost_avoided_micro in Image/VideoResult; cache scope in API docs

- ImageResult and VideoResult: add provider_cost_avoided_micro (micro-USD saved on cache hit)
- docs/API.md: document cache scope (org default, global), TTL=0, provider_cost_avoided_micro
- examples 07_cache_demo, 08_semantic_cache_demo: print provider_cost_avoided_micro
- examples/README: cache section with org-scope and new field

Co-authored-by: Cursor <cursoragent@cursor.com>

1 parent 8b6be40 commit a4636dd

6 files changed

Lines changed: 190 additions & 3 deletions

docs/API.md

Lines changed: 37 additions & 1 deletion
@@ -2,6 +2,29 @@
 
 Short reference. Full OpenAPI spec: your API base URL + `/docs`.
 
+## Testing against the live API
+
+Use the **visgate-python** SDK and examples to hit the live API:
+
+```bash
+pip install visgate-sdk
+export VISGATE_API_KEY=vg-...
+
+# Health and models (minimal check)
+python examples/01_live_api_smoke.py
+
+# Exact cache: identical requests; second = cache hit
+python examples/07_cache_demo.py
+
+# Semantic cache: similar prompts; second may = cache hit (Vertex AI + Firestore)
+python examples/08_semantic_cache_demo.py
+
+# Run all capability examples
+python examples/run_all_capabilities.py
+```
+
+Base URL defaults to `https://visgateai.com/api/v1`. Override with `VISGATE_BASE_URL` for staging or local.
+
 ## Installation
 
 ```bash
@@ -125,6 +148,7 @@ result = client.images.generate(
     num_images=1,         # Optional, default 1
     seed=None,            # Optional, for reproducibility
     params=None,          # Optional, additional model-specific parameters
+    include_steps=False,  # Optional; if True the response includes step timing (cache/provider/storage)
 )
 ```
 
@@ -137,6 +161,7 @@ result = await client.images.generate(
     width=1024,
     height=1024,
     num_images=1,
+    include_steps=False,  # Set True to get step timing in result.steps
 )
 ```
 
@@ -147,9 +172,19 @@ result = await client.images.generate(
 - `model` (str): Model identifier used
 - `provider` (str): Provider name
 - `cost` (float): Cost in USD
-- `cache_hit` (bool): Whether the result was served from cache
+- `cache_hit` (bool): Whether the result was served from cache (the same model + prompt + size returns the cached result with lower latency).
+- `provider_cost_avoided_micro` (int | None): When `cache_hit` is true, the provider cost avoided, in micro-USD (1e-6 USD). Omitted on cache miss.
 - `latency_ms` (int | None): Request latency in milliseconds
 - `created_at` (datetime): Creation timestamp
+- `output_storage` (str | None): Host/domain where the output is stored (e.g. `storage.googleapis.com` or a provider CDN). Present when the API returns it.
+- `output_size_bytes` (int | None): Size of the primary output in bytes, when available.
+- `steps` (list | None): Per-step timing and metadata (e.g. cache lookup, provider call, storage). Present only when `include_steps=True` was passed.
+
+## Cache
+
+- **Scope:** The cache is **org-scoped** by default (`VISGATE_CACHE_SCOPE=org`). Keys include the organization ID, so different orgs never share entries. Set `VISGATE_CACHE_SCOPE=global` to share the cache across organizations. TTL is configurable; `VISGATE_CACHE_TTL_SECONDS=0` means entries never expire.
+- **Exact cache:** The same model + prompt + size maps to the same cache key. A second identical request is served from cache with lower latency; the response includes `cache_hit: true` and `provider_cost_avoided_micro` (cost saved, in micro-USD). Example: `examples/07_cache_demo.py`.
+- **Semantic cache:** Same meaning, different wording. The API matches prompts using Vertex AI embeddings and Firestore; when similarity exceeds the threshold, the result is served from cache and the provider is not called. The response includes `cache_hit: true` and `provider_cost_avoided_micro`. Results from different models can be reused (there is no model filter). Example: `examples/08_semantic_cache_demo.py`.
 
 ## Videos Resource
 
@@ -185,6 +220,7 @@ result = await client.videos.generate(
 - `provider` (str): Provider name
 - `cost` (float): Cost in USD
 - `cache_hit` (bool): Whether the result was served from cache
+- `provider_cost_avoided_micro` (int | None): When `cache_hit` is true, the provider cost avoided, in micro-USD (1e-6 USD). Omitted on cache miss.
 - `latency_ms` (int | None): Request latency in milliseconds
 - `created_at` (datetime): Creation timestamp
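The micro-USD fields documented above are integers in units of 1e-6 USD. A minimal conversion sketch for client code (the helper name is ours, not part of the SDK):

```python
from typing import Optional


def micro_usd_to_usd(micro: Optional[int]) -> float:
    """Convert a micro-USD amount (1e-6 USD) to USD; treat None (cache miss) as 0."""
    return (micro or 0) / 1_000_000


# A cache hit that avoided a $0.003 provider call reports 3000 micro-USD.
print(micro_usd_to_usd(3000))  # 0.003
print(micro_usd_to_usd(None))  # 0.0 (field omitted on cache miss)
```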

examples/07_cache_demo.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+"""Cache demo: two identical image requests — the second should be a cache hit.
+
+Run against the live API. The first request fills the cache; the second returns
+from cache (cache_hit=True, lower latency).
+
+VISGATE_API_KEY=vg-... python examples/07_cache_demo.py
+"""
+from __future__ import annotations
+
+from _common import create_client
+
+
+def main() -> int:
+    prompt = "a red apple on a wooden table, studio lighting"
+    model = "fal-ai/flux/schnell"
+    width, height = 1024, 1024
+
+    with create_client() as client:
+        # 1) First request — cache miss
+        r1 = client.images.generate(
+            model=model,
+            prompt=prompt,
+            width=width,
+            height=height,
+            num_images=1,
+        )
+        print(f"Request 1: cache_hit={r1.cache_hit}, latency_ms={r1.latency_ms}, cost={r1.cost}")
+
+        # 2) Second request — same params, expect a cache hit
+        r2 = client.images.generate(
+            model=model,
+            prompt=prompt,
+            width=width,
+            height=height,
+            num_images=1,
+        )
+        print(
+            f"Request 2: cache_hit={r2.cache_hit}, latency_ms={r2.latency_ms}, cost={r2.cost}, "
+            f"provider_cost_avoided_micro={r2.provider_cost_avoided_micro}"
+        )
+
+        if r2.cache_hit and r2.latency_ms is not None and r1.latency_ms is not None:
+            if r2.latency_ms < r1.latency_ms:
+                print("OK: Second request was faster (cache hit).")
+            else:
+                print("OK: Second request was a cache hit (latency may vary).")
+        elif r2.cache_hit:
+            print("OK: Second request was a cache hit.")
+        else:
+            print("Note: Second request was not a cache hit (TTL or key may differ).")
+
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(main())
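The exact-cache behavior this demo relies on can be pictured as hashing the request parameters, plus the organization ID, into a deterministic key. This is only an illustration of the idea; the real key derivation happens server-side and is not specified in this commit:

```python
import hashlib
import json


def exact_cache_key(org_id: str, model: str, prompt: str, width: int, height: int) -> str:
    """Illustrative org-scoped exact-cache key: identical inputs yield identical keys."""
    # Canonical JSON (sorted keys) so field ordering cannot change the hash.
    payload = json.dumps(
        {"org": org_id, "model": model, "prompt": prompt, "w": width, "h": height},
        sort_keys=True,
    )
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()


k1 = exact_cache_key("org-1", "fal-ai/flux/schnell", "a red apple", 1024, 1024)
k2 = exact_cache_key("org-1", "fal-ai/flux/schnell", "a red apple", 1024, 1024)
k3 = exact_cache_key("org-2", "fal-ai/flux/schnell", "a red apple", 1024, 1024)
print(k1 == k2)  # True: identical request -> same key -> cache hit
print(k1 == k3)  # False: org-scoped keys differ across organizations
```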

examples/08_semantic_cache_demo.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+"""Semantic cache demo: similar (not identical) prompts — the second may be a cache hit.
+
+The API uses Vertex AI embeddings and Firestore to match semantically similar
+prompts. When a match is found above the similarity threshold, the result is
+returned from cache without calling the provider, reducing cost significantly.
+
+Run against the live API. The first request fills the cache; the second uses
+different wording with the same meaning — it may return cache_hit=True if the
+API has semantic search enabled and embeddings are available.
+
+VISGATE_API_KEY=vg-... python examples/08_semantic_cache_demo.py
+"""
+from __future__ import annotations
+
+from _common import create_client
+
+
+def main() -> int:
+    prompt1 = "a red apple on a wooden table, studio lighting"
+    prompt2 = "red apple on wooden table with studio lights"
+    model = "fal-ai/flux/schnell"
+    width, height = 1024, 1024
+
+    with create_client() as client:
+        # 1) First request — cache miss, result and embedding stored
+        r1 = client.images.generate(
+            model=model,
+            prompt=prompt1,
+            width=width,
+            height=height,
+            num_images=1,
+            include_steps=True,
+        )
+        print(f"Request 1 (exact): cache_hit={r1.cache_hit}, latency_ms={r1.latency_ms}, cost={r1.cost}")
+
+        # 2) Second request — semantically similar prompt; may hit the semantic cache
+        r2 = client.images.generate(
+            model=model,
+            prompt=prompt2,
+            width=width,
+            height=height,
+            num_images=1,
+            include_steps=True,
+        )
+        print(
+            f"Request 2 (similar): cache_hit={r2.cache_hit}, latency_ms={r2.latency_ms}, cost={r2.cost}, "
+            f"provider_cost_avoided_micro={r2.provider_cost_avoided_micro}"
+        )
+
+        if r2.cache_hit:
+            print("OK: Second request was a cache hit (semantic match). Provider cost avoided.")
+        else:
+            print("Note: Second request was not a cache hit (semantic search may need embeddings/Vertex AI enabled).")
+
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(main())
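The semantic match this demo hopes for reduces to comparing prompt embeddings against a similarity threshold. A toy illustration with hand-made vectors (the real service uses Vertex AI embeddings; the threshold value here is invented for the sketch):

```python
import math


def cosine_similarity(a: list, b: list) -> float:
    """Cosine similarity between two equal-length embedding vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)


THRESHOLD = 0.9  # illustrative only; the API's actual threshold is not documented in this commit

cached = [0.8, 0.6, 0.0]       # embedding stored when prompt1 was first generated
incoming = [0.79, 0.61, 0.02]  # embedding of the reworded prompt2

sim = cosine_similarity(cached, incoming)
if sim >= THRESHOLD:
    print(f"semantic cache hit (similarity={sim:.3f}); provider call skipped")
else:
    print(f"cache miss (similarity={sim:.3f}); provider called")
```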

examples/README.md

Lines changed: 8 additions & 0 deletions
@@ -29,6 +29,8 @@ export VISGATE_RUNWAY_API_KEY="..."
 | `04_videos_all_providers.py` | Video generation | Yes |
 | `05_usage_history_verify.py` | Usage, logs, dashboard | Yes |
 | `06_provider_balances.py` | Provider balance and limits | Yes |
+| `07_cache_demo.py` | Exact cache: two identical image requests, second = cache hit | Yes |
+| `08_semantic_cache_demo.py` | Semantic cache: similar prompt, cache hit, lower cost (Vertex AI + Firestore) | Yes |
 
 ## Run All
 
@@ -37,3 +39,9 @@ VISGATE_API_KEY=vg-... python examples/run_all_capabilities.py
 ```
 
 The first two steps run without an API key. `VISGATE_API_KEY` is required from step 3 onward.
+
+## Testing cache
+
+- **Exact cache:** Same model + prompt + size. The second request returns from cache; the response includes `cache_hit=True` and `provider_cost_avoided_micro` (cost saved, in micro-USD). Run `python examples/07_cache_demo.py`.
+- **Semantic cache:** Similar but different wording; the API matches via Vertex AI embeddings + Firestore. The second request may return `cache_hit=True` and `provider_cost_avoided_micro`. Run `python examples/08_semantic_cache_demo.py`.
+- The cache is org-scoped by default; see the API docs for `VISGATE_CACHE_SCOPE` and TTL.

src/visgate_sdk/resources/images.py

Lines changed: 23 additions & 2 deletions
@@ -23,8 +23,12 @@ class ImageResult:
         provider: Provider name (e.g. ``"fal"``).
         cost: Cost in USD.
         cache_hit: Whether the result was served from cache.
+        provider_cost_avoided_micro: When cache_hit is True, provider cost avoided in micro-USD (1e-6 USD).
         latency_ms: Server-side latency in milliseconds.
         created_at: Timestamp of the request.
+        output_storage: Host/domain where the output is stored (e.g. a provider CDN). Present when the API returns it.
+        output_size_bytes: Size of the primary output in bytes, when available.
+        steps: Per-step timing/metadata (cache, provider, storage). Present when include_steps=True.
     """
 
     id: str
@@ -33,8 +37,12 @@ class ImageResult:
     provider: str
     cost: float
     cache_hit: bool = False
+    provider_cost_avoided_micro: Optional[int] = None
     latency_ms: Optional[int] = None
     created_at: Optional[datetime] = None
+    output_storage: Optional[str] = None
+    output_size_bytes: Optional[int] = None
+    steps: Optional[List[Dict[str, Any]]] = None
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> ImageResult:
@@ -45,8 +53,12 @@ def from_dict(cls, data: Dict[str, Any]) -> ImageResult:
             provider=data["provider"],
             cost=data.get("cost", 0.0),
             cache_hit=data.get("cache_hit", False),
+            provider_cost_avoided_micro=data.get("provider_cost_avoided_micro"),
             latency_ms=data.get("latency_ms"),
             created_at=parse_datetime(data.get("created_at")),
+            output_storage=data.get("output_storage"),
+            output_size_bytes=data.get("output_size_bytes"),
+            steps=data.get("steps"),
         )
 
     def __repr__(self) -> str:
@@ -73,6 +85,7 @@ def generate(
         num_images: int = 1,
         seed: Optional[int] = None,
         params: Optional[Dict[str, Any]] = None,
+        include_steps: bool = False,
     ) -> ImageResult:
         """Generate image(s).
@@ -85,6 +98,7 @@ def generate(
             num_images: Number of images to generate. Defaults to 1.
             seed: Random seed for reproducibility.
             params: Additional model-specific parameters.
+            include_steps: If True, the response includes step timing (cache/provider/storage) in result.steps.
 
         Returns:
             ImageResult with generated image URLs and metadata.
@@ -103,7 +117,10 @@ def generate(
         if params:
             payload.update(params)
 
-        data = self._client._request("POST", "/images/generate", json=payload)
+        query_params = {"include_steps": str(include_steps).lower()} if include_steps else None
+        data = self._client._request(
+            "POST", "/images/generate", json=payload, params=query_params
+        )
         return ImageResult.from_dict(data)
 
 
@@ -124,6 +141,7 @@ async def generate(
         num_images: int = 1,
         seed: Optional[int] = None,
         params: Optional[Dict[str, Any]] = None,
+        include_steps: bool = False,
     ) -> ImageResult:
         """Generate image(s). See :meth:`Images.generate` for details."""
         payload: Dict[str, Any] = {
@@ -140,5 +158,8 @@ async def generate(
         if params:
             payload.update(params)
 
-        data = await self._client._request("POST", "/images/generate", json=payload)
+        query_params = {"include_steps": str(include_steps).lower()} if include_steps else None
+        data = await self._client._request(
+            "POST", "/images/generate", json=payload, params=query_params
+        )
         return ImageResult.from_dict(data)
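The `data.get(...)` pattern in `from_dict` is what keeps the new fields backward compatible: a response from an older server simply lacks the keys and the fields fall back to their defaults. A standalone mirror of that pattern for the two cache fields (the class here is ours, for illustration only):

```python
from dataclasses import dataclass
from typing import Any, Dict, Optional


@dataclass
class CacheInfo:
    """Toy mirror of the new cache fields, showing the tolerant-parsing pattern."""

    cache_hit: bool = False
    provider_cost_avoided_micro: Optional[int] = None

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "CacheInfo":
        # .get() with defaults: missing keys (old servers, cache misses) never raise.
        return cls(
            cache_hit=data.get("cache_hit", False),
            provider_cost_avoided_micro=data.get("provider_cost_avoided_micro"),
        )


hit = CacheInfo.from_dict({"cache_hit": True, "provider_cost_avoided_micro": 2500})
miss = CacheInfo.from_dict({})  # field omitted on cache miss
print(hit.provider_cost_avoided_micro)   # 2500
print(miss.provider_cost_avoided_micro)  # None
```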

src/visgate_sdk/resources/videos.py

Lines changed: 3 additions & 0 deletions
@@ -23,6 +23,7 @@ class VideoResult:
         provider: Provider name (e.g. ``"runway"``).
         cost: Cost in USD.
         cache_hit: Whether the result was served from cache.
+        provider_cost_avoided_micro: When cache_hit is True, provider cost avoided in micro-USD (1e-6 USD).
         latency_ms: Server-side latency in milliseconds.
         created_at: Timestamp of the request.
     """
@@ -33,6 +34,7 @@ class VideoResult:
     provider: str
     cost: float
     cache_hit: bool = False
+    provider_cost_avoided_micro: Optional[int] = None
     latency_ms: Optional[int] = None
     created_at: Optional[datetime] = None
 
@@ -45,6 +47,7 @@ def from_dict(cls, data: Dict[str, Any]) -> VideoResult:
             provider=data["provider"],
             cost=data.get("cost", 0.0),
             cache_hit=data.get("cache_hit", False),
+            provider_cost_avoided_micro=data.get("provider_cost_avoided_micro"),
             latency_ms=data.get("latency_ms"),
             created_at=parse_datetime(data.get("created_at")),
         )
