6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to the `fipsagents` package will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/).

## [0.14.1] - 2026-04-27

### Fixed

- **Cost tracking now records token usage in production** — `LLMClient.call_model_stream_raw` now sets `stream_options={"include_usage": True}` by default when streaming, so vLLM and other OpenAI-compatible servers emit the terminal usage chunk that `OpenAIChatServer._persist_cost_data` relies on. Without it, `StreamMetrics.prompt_tokens` / `completion_tokens` stayed `None` and the session's `cost_data` accumulators never advanced past `{}`. This surfaced during the cluster smoke test for [#116](https://github.com/fips-agents/agent-template/issues/116); fixes [#118](https://github.com/fips-agents/agent-template/issues/118). Callers can opt out by passing `stream_options={"include_usage": False}` (or supplying a different value) — the default uses `setdefault` semantics.

## [0.14.0] - 2026-04-27

### Added
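To make the changelog entry above concrete, here is a minimal caller-side sketch of the new default and the opt-out path. It is not part of this PR; the import path and model name are assumptions, while the `call_model_stream_raw` signature and the `stream_options` override mirror the tests added below.

```python
# Hedged sketch only: illustrates the setdefault behaviour described in 0.14.1.
# The import path and model name are assumptions, not taken from this diff.
import asyncio

from fipsagents.baseagent.llm import LLMClient, LLMConfig


async def main() -> None:
    client = LLMClient(LLMConfig(name="my-model"))
    messages = [{"role": "user", "content": "hi"}]

    # Default: the client injects stream_options={"include_usage": True} via
    # setdefault, so the final chunk carries prompt/completion token counts.
    async for chunk in client.call_model_stream_raw(messages):
        ...

    # Opt out (or supply a different value): a caller-provided stream_options
    # wins, because setdefault never overwrites an existing key.
    async for chunk in client.call_model_stream_raw(
        messages, stream_options={"include_usage": False}
    ):
        ...


asyncio.run(main())
```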
2 changes: 1 addition & 1 deletion packages/fipsagents/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "fipsagents"
version = "0.14.0"
version = "0.14.1"
description = "Production-ready AI agent framework for FIPS/OpenShift environments"
readme = "README.md"
license = {file = "LICENSE"}
4 changes: 4 additions & 0 deletions packages/fipsagents/src/fipsagents/baseagent/llm.py
@@ -312,6 +312,10 @@ async def call_model_stream_raw(
call_kwargs = self._base_kwargs(**kwargs)
call_kwargs["messages"] = messages
call_kwargs["stream"] = True
# Opt into vLLM/OpenAI usage chunk on the terminal stream event so
# the server's cost-tracking accumulator sees prompt/completion
# tokens. setdefault keeps caller-supplied stream_options intact.
call_kwargs.setdefault("stream_options", {"include_usage": True})
if tools is not None:
call_kwargs["tools"] = tools
try:
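For context on what the injected option actually unlocks, below is a sketch of the terminal usage chunk as emitted by vLLM and other OpenAI-compatible servers when `include_usage` is requested. It calls the `openai` SDK directly rather than anything in this package, and the `base_url` and model name are placeholders.

```python
# Sketch of the raw OpenAI-compatible streaming behaviour this change relies on.
# Endpoint and model are placeholders; this is not code from the repository.
import asyncio

from openai import AsyncOpenAI


async def main() -> None:
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="unused")
    stream = await client.chat.completions.create(
        model="my-model",
        messages=[{"role": "user", "content": "hi"}],
        stream=True,
        stream_options={"include_usage": True},
    )
    async for chunk in stream:
        if chunk.usage is not None:
            # Terminal chunk: empty choices, populated usage. These counts are
            # what a server-side cost accumulator reads after the stream ends.
            print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens)


asyncio.run(main())
```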
56 changes: 56 additions & 0 deletions packages/fipsagents/tests/test_llm.py
@@ -359,3 +359,59 @@ def always_fails(resp: ModelResponse) -> str:
validator_fn=always_fails,
max_retries=2,
)


# ---------------------------------------------------------------------------
# LLMClient.call_model_stream_raw — stream_options.include_usage default
# ---------------------------------------------------------------------------


class TestLLMClientCallModelStreamRaw:
@pytest.mark.asyncio
async def test_sets_include_usage_by_default(self):
"""Streaming calls must request the terminal usage chunk so the
server-layer cost-tracking accumulator sees prompt/completion tokens.
Regression for #118.
"""
config = LLMConfig(name="test-model")

async def empty_stream():
if False:
yield # type: ignore[unreachable]

with patch("fipsagents.baseagent.llm.AsyncOpenAI") as mock_cls:
mock_client = mock_cls.return_value
mock_create = AsyncMock(return_value=empty_stream())
mock_client.chat.completions.create = mock_create
client = LLMClient(config)
async for _ in client.call_model_stream_raw(
[{"role": "user", "content": "hi"}],
):
pass

kwargs = mock_create.call_args.kwargs
assert kwargs["stream"] is True
assert kwargs["stream_options"] == {"include_usage": True}

@pytest.mark.asyncio
async def test_caller_can_override_stream_options(self):
"""Callers passing stream_options explicitly win over the default."""
config = LLMConfig(name="test-model")

async def empty_stream():
if False:
yield # type: ignore[unreachable]

with patch("fipsagents.baseagent.llm.AsyncOpenAI") as mock_cls:
mock_client = mock_cls.return_value
mock_create = AsyncMock(return_value=empty_stream())
mock_client.chat.completions.create = mock_create
client = LLMClient(config)
async for _ in client.call_model_stream_raw(
[{"role": "user", "content": "hi"}],
stream_options={"include_usage": False},
):
pass

kwargs = mock_create.call_args.kwargs
assert kwargs["stream_options"] == {"include_usage": False}