From d1810cee76542e46fb2df845842d12200c22eb21 Mon Sep 17 00:00:00 2001
From: rdwj
Date: Mon, 27 Apr 2026 21:55:51 -0500
Subject: [PATCH] fix(llm): set stream_options.include_usage so cost tracking works

call_model_stream_raw now defaults stream_options to
{"include_usage": True}. Without it, vLLM (and any OpenAI-compatible
server) never emits a terminal usage chunk on streaming responses, so
StreamMetrics.prompt_tokens / completion_tokens stay None and
OpenAIChatServer._persist_cost_data returns early -- the cost_data
accumulators stayed empty in production despite a fully green
unit-test suite. Surfaced during the cluster smoke test for #116.

Uses setdefault semantics, so callers can opt out by passing
stream_options={"include_usage": False}.

Bumps fipsagents to 0.14.1. Closes #118.

Assisted-by: Claude Code (Opus 4.7)
---
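Reviewer note (kept out of the commit message): a minimal sketch of the
opt-out path that the second new test exercises. The import path, model
name, and message payload below are assumptions lifted from that test, not
a documented public API.

    from fipsagents.baseagent.llm import LLMClient, LLMConfig

    async def stream_without_usage() -> None:
        # An explicit stream_options value wins because the client only
        # calls setdefault("stream_options", ...); no terminal usage chunk
        # is requested, so token counts are not recorded for this call.
        client = LLMClient(LLMConfig(name="test-model"))
        async for chunk in client.call_model_stream_raw(
            [{"role": "user", "content": "hi"}],
            stream_options={"include_usage": False},
        ):
            print(chunk)

With the default ({"include_usage": True}), vLLM and other OpenAI-compatible
servers append one final chunk whose usage payload carries prompt_tokens and
completion_tokens, which is what the cost tracking reads.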
 CHANGELOG.md                          |  6 ++
 packages/fipsagents/pyproject.toml    |  2 +-
 .../src/fipsagents/baseagent/llm.py   |  4 ++
 packages/fipsagents/tests/test_llm.py | 56 +++++++++++++++++++
 4 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b7876c4..3ebb564 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to the `fipsagents` package will be documented in this
 file. The format is based on [Keep a Changelog](https://keepachangelog.com/).
 
+## [0.14.1] - 2026-04-27
+
+### Fixed
+
+- **Cost tracking now records token usage in production** — `LLMClient.call_model_stream_raw` now sets `stream_options={"include_usage": True}` by default when streaming, so vLLM and other OpenAI-compat servers emit the terminal usage chunk that `OpenAIChatServer._persist_cost_data` relies on. Without this, `StreamMetrics.prompt_tokens` / `completion_tokens` stayed `None` and `cost_data` accumulators on the session never advanced past `{}`. Surfaced during the cluster smoke for [#116](https://github.com/fips-agents/agent-template/issues/116); fixes [#118](https://github.com/fips-agents/agent-template/issues/118). Callers can opt out by passing `stream_options={"include_usage": False}` (or supplying a different value) — the default uses `setdefault` semantics.
+
 ## [0.14.0] - 2026-04-27
 
 ### Added
 
diff --git a/packages/fipsagents/pyproject.toml b/packages/fipsagents/pyproject.toml
index 7cf2c27..a53ceff 100644
--- a/packages/fipsagents/pyproject.toml
+++ b/packages/fipsagents/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "fipsagents"
-version = "0.14.0"
+version = "0.14.1"
 description = "Production-ready AI agent framework for FIPS/OpenShift environments"
 readme = "README.md"
 license = {file = "LICENSE"}
diff --git a/packages/fipsagents/src/fipsagents/baseagent/llm.py b/packages/fipsagents/src/fipsagents/baseagent/llm.py
index c335108..6cefb18 100644
--- a/packages/fipsagents/src/fipsagents/baseagent/llm.py
+++ b/packages/fipsagents/src/fipsagents/baseagent/llm.py
@@ -312,6 +312,10 @@ async def call_model_stream_raw(
         call_kwargs = self._base_kwargs(**kwargs)
         call_kwargs["messages"] = messages
         call_kwargs["stream"] = True
+        # Opt into vLLM/OpenAI usage chunk on the terminal stream event so
+        # the server's cost-tracking accumulator sees prompt/completion
+        # tokens. setdefault keeps caller-supplied stream_options intact.
+        call_kwargs.setdefault("stream_options", {"include_usage": True})
         if tools is not None:
             call_kwargs["tools"] = tools
         try:
diff --git a/packages/fipsagents/tests/test_llm.py b/packages/fipsagents/tests/test_llm.py
index a35b42e..25c9052 100644
--- a/packages/fipsagents/tests/test_llm.py
+++ b/packages/fipsagents/tests/test_llm.py
@@ -359,3 +359,59 @@ def always_fails(resp: ModelResponse) -> str:
             validator_fn=always_fails,
             max_retries=2,
         )
+
+
+# ---------------------------------------------------------------------------
+# LLMClient.call_model_stream_raw — stream_options.include_usage default
+# ---------------------------------------------------------------------------
+
+
+class TestLLMClientCallModelStreamRaw:
+    @pytest.mark.asyncio
+    async def test_sets_include_usage_by_default(self):
+        """Streaming calls must request the terminal usage chunk so the
+        server-layer cost-tracking accumulator sees prompt/completion tokens.
+        Regression for #118.
+        """
+        config = LLMConfig(name="test-model")
+
+        async def empty_stream():
+            if False:
+                yield  # type: ignore[unreachable]
+
+        with patch("fipsagents.baseagent.llm.AsyncOpenAI") as mock_cls:
+            mock_client = mock_cls.return_value
+            mock_create = AsyncMock(return_value=empty_stream())
+            mock_client.chat.completions.create = mock_create
+            client = LLMClient(config)
+            async for _ in client.call_model_stream_raw(
+                [{"role": "user", "content": "hi"}],
+            ):
+                pass
+
+        kwargs = mock_create.call_args.kwargs
+        assert kwargs["stream"] is True
+        assert kwargs["stream_options"] == {"include_usage": True}
+
+    @pytest.mark.asyncio
+    async def test_caller_can_override_stream_options(self):
+        """Callers passing stream_options explicitly win over the default."""
+        config = LLMConfig(name="test-model")
+
+        async def empty_stream():
+            if False:
+                yield  # type: ignore[unreachable]
+
+        with patch("fipsagents.baseagent.llm.AsyncOpenAI") as mock_cls:
+            mock_client = mock_cls.return_value
+            mock_create = AsyncMock(return_value=empty_stream())
+            mock_client.chat.completions.create = mock_create
+            client = LLMClient(config)
+            async for _ in client.call_model_stream_raw(
+                [{"role": "user", "content": "hi"}],
+                stream_options={"include_usage": False},
+            ):
+                pass
+
+        kwargs = mock_create.call_args.kwargs
+        assert kwargs["stream_options"] == {"include_usage": False}