From d1810cee76542e46fb2df845842d12200c22eb21 Mon Sep 17 00:00:00 2001
From: rdwj
Date: Mon, 27 Apr 2026 21:55:51 -0500
Subject: [PATCH] fix(llm): set stream_options.include_usage so cost tracking works

call_model_stream_raw now defaults stream_options to
{"include_usage": True}. Without it, vLLM (and any OpenAI-compatible
server) never emits a terminal usage chunk on streaming responses, so
StreamMetrics.prompt_tokens / completion_tokens stay None and
OpenAIChatServer._persist_cost_data returns early -- the cost_data
accumulators stayed empty in production despite a fully green
unit-test suite. Surfaced during the cluster smoke test for #116.

Uses setdefault semantics, so callers can opt out by passing
stream_options={"include_usage": False}.

Bumps fipsagents to 0.14.1. Closes #118.

Assisted-by: Claude Code (Opus 4.7)
---
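Reviewer note (kept out of the commit message): a minimal sketch of the
opt-out path that the second new test exercises. The import path, model
name, and message payload below are assumptions lifted from that test, not
a documented public API.

    from fipsagents.baseagent.llm import LLMClient, LLMConfig

    async def stream_without_usage() -> None:
        # An explicit stream_options value wins because the client only
        # calls setdefault("stream_options", ...); no terminal usage chunk
        # is requested, so token counts are not recorded for this call.
        client = LLMClient(LLMConfig(name="test-model"))
        async for chunk in client.call_model_stream_raw(
            [{"role": "user", "content": "hi"}],
            stream_options={"include_usage": False},
        ):
            print(chunk)

With the default ({"include_usage": True}), vLLM and other OpenAI-compatible
servers append one final chunk whose usage payload carries prompt_tokens and
completion_tokens, which is what the cost tracking reads.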
 CHANGELOG.md                          |  6 ++
 packages/fipsagents/pyproject.toml    |  2 +-
 .../src/fipsagents/baseagent/llm.py   |  4 ++
 packages/fipsagents/tests/test_llm.py | 56 +++++++++++++++++++
 4 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b7876c4..3ebb564 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to the `fipsagents` package will be documented in this
 file. The format is based on [Keep a Changelog](https://keepachangelog.com/).
 
+## [0.14.1] - 2026-04-27
+
+### Fixed
+
+- **Cost tracking now records token usage in production** — `LLMClient.call_model_stream_raw` now sets `stream_options={"include_usage": True}` by default when streaming, so vLLM and other OpenAI-compat servers emit the terminal usage chunk that `OpenAIChatServer._persist_cost_data` relies on. Without this, `StreamMetrics.prompt_tokens` / `completion_tokens` stayed `None` and `cost_data` accumulators on the session never advanced past `{}`. Surfaced during the cluster smoke for [#116](https://github.com/fips-agents/agent-template/issues/116); fixes [#118](https://github.com/fips-agents/agent-template/issues/118). Callers can opt out by passing `stream_options={"include_usage": False}` (or supplying a different value) — the default uses `setdefault` semantics.
+
 ## [0.14.0] - 2026-04-27
 
 ### Added
 
diff --git a/packages/fipsagents/pyproject.toml b/packages/fipsagents/pyproject.toml
index 7cf2c27..a53ceff 100644
--- a/packages/fipsagents/pyproject.toml
+++ b/packages/fipsagents/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "fipsagents"
-version = "0.14.0"
+version = "0.14.1"
 description = "Production-ready AI agent framework for FIPS/OpenShift environments"
 readme = "README.md"
 license = {file = "LICENSE"}
diff --git a/packages/fipsagents/src/fipsagents/baseagent/llm.py b/packages/fipsagents/src/fipsagents/baseagent/llm.py
index c335108..6cefb18 100644
--- a/packages/fipsagents/src/fipsagents/baseagent/llm.py
+++ b/packages/fipsagents/src/fipsagents/baseagent/llm.py
@@ -312,6 +312,10 @@ async def call_model_stream_raw(
         call_kwargs = self._base_kwargs(**kwargs)
         call_kwargs["messages"] = messages
         call_kwargs["stream"] = True
+        # Opt into vLLM/OpenAI usage chunk on the terminal stream event so
+        # the server's cost-tracking accumulator sees prompt/completion
+        # tokens. setdefault keeps caller-supplied stream_options intact.
+        call_kwargs.setdefault("stream_options", {"include_usage": True})
         if tools is not None:
             call_kwargs["tools"] = tools
         try:
diff --git a/packages/fipsagents/tests/test_llm.py b/packages/fipsagents/tests/test_llm.py
index a35b42e..25c9052 100644
--- a/packages/fipsagents/tests/test_llm.py
+++ b/packages/fipsagents/tests/test_llm.py
@@ -359,3 +359,59 @@ def always_fails(resp: ModelResponse) -> str:
             validator_fn=always_fails,
             max_retries=2,
         )
+
+
+# ---------------------------------------------------------------------------
+# LLMClient.call_model_stream_raw — stream_options.include_usage default
+# ---------------------------------------------------------------------------
+
+
+class TestLLMClientCallModelStreamRaw:
+    @pytest.mark.asyncio
+    async def test_sets_include_usage_by_default(self):
+        """Streaming calls must request the terminal usage chunk so the
+        server-layer cost-tracking accumulator sees prompt/completion tokens.
+        Regression for #118.
+        """
+        config = LLMConfig(name="test-model")
+
+        async def empty_stream():
+            if False:
+                yield  # type: ignore[unreachable]
+
+        with patch("fipsagents.baseagent.llm.AsyncOpenAI") as mock_cls:
+            mock_client = mock_cls.return_value
+            mock_create = AsyncMock(return_value=empty_stream())
+            mock_client.chat.completions.create = mock_create
+            client = LLMClient(config)
+            async for _ in client.call_model_stream_raw(
+                [{"role": "user", "content": "hi"}],
+            ):
+                pass
+
+        kwargs = mock_create.call_args.kwargs
+        assert kwargs["stream"] is True
+        assert kwargs["stream_options"] == {"include_usage": True}
+
+    @pytest.mark.asyncio
+    async def test_caller_can_override_stream_options(self):
+        """Callers passing stream_options explicitly win over the default."""
+        config = LLMConfig(name="test-model")
+
+        async def empty_stream():
+            if False:
+                yield  # type: ignore[unreachable]
+
+        with patch("fipsagents.baseagent.llm.AsyncOpenAI") as mock_cls:
+            mock_client = mock_cls.return_value
+            mock_create = AsyncMock(return_value=empty_stream())
+            mock_client.chat.completions.create = mock_create
+            client = LLMClient(config)
+            async for _ in client.call_model_stream_raw(
+                [{"role": "user", "content": "hi"}],
+                stream_options={"include_usage": False},
+            ):
+                pass
+
+        kwargs = mock_create.call_args.kwargs
+        assert kwargs["stream_options"] == {"include_usage": False}