6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to the `fipsagents` package will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/).

## [0.14.1] - 2026-04-27

### Fixed

- **Cost tracking now records token usage in production** — `LLMClient.call_model_stream_raw` now sets `stream_options={"include_usage": True}` by default when streaming, so vLLM and other OpenAI-compatible servers emit the terminal usage chunk that `OpenAIChatServer._persist_cost_data` relies on. Without it, `StreamMetrics.prompt_tokens` / `completion_tokens` stayed `None` and the session's `cost_data` accumulators never advanced past `{}`. This surfaced during the cluster smoke test for [#116](https://github.com/fips-agents/agent-template/issues/116); fixes [#118](https://github.com/fips-agents/agent-template/issues/118). Callers can opt out by passing `stream_options={"include_usage": False}` (or supplying a different value) — the default uses `setdefault` semantics.

## [0.14.0] - 2026-04-27

### Added
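To make the changelog entry above concrete, here is a minimal caller-side sketch of the new default and the opt-out path. It is not part of this PR; the import path and model name are assumptions, while the `call_model_stream_raw` signature and the `stream_options` override mirror the tests added below.

```python
# Hedged sketch only: illustrates the setdefault behaviour described in 0.14.1.
# The import path and model name are assumptions, not taken from this diff.
import asyncio

from fipsagents.baseagent.llm import LLMClient, LLMConfig


async def main() -> None:
    client = LLMClient(LLMConfig(name="my-model"))
    messages = [{"role": "user", "content": "hi"}]

    # Default: the client injects stream_options={"include_usage": True} via
    # setdefault, so the final chunk carries prompt/completion token counts.
    async for chunk in client.call_model_stream_raw(messages):
        ...

    # Opt out (or supply a different value): a caller-provided stream_options
    # wins, because setdefault never overwrites an existing key.
    async for chunk in client.call_model_stream_raw(
        messages, stream_options={"include_usage": False}
    ):
        ...


asyncio.run(main())
```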
2 changes: 1 addition & 1 deletion packages/fipsagents/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "fipsagents"
version = "0.14.0"
version = "0.14.1"
description = "Production-ready AI agent framework for FIPS/OpenShift environments"
readme = "README.md"
license = {file = "LICENSE"}
4 changes: 4 additions & 0 deletions packages/fipsagents/src/fipsagents/baseagent/llm.py
@@ -312,6 +312,10 @@ async def call_model_stream_raw(
call_kwargs = self._base_kwargs(**kwargs)
call_kwargs["messages"] = messages
call_kwargs["stream"] = True
# Opt into vLLM/OpenAI usage chunk on the terminal stream event so
# the server's cost-tracking accumulator sees prompt/completion
# tokens. setdefault keeps caller-supplied stream_options intact.
call_kwargs.setdefault("stream_options", {"include_usage": True})
if tools is not None:
call_kwargs["tools"] = tools
try:
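For context on what the injected option actually unlocks, below is a sketch of the terminal usage chunk as emitted by vLLM and other OpenAI-compatible servers when `include_usage` is requested. It calls the `openai` SDK directly rather than anything in this package, and the `base_url` and model name are placeholders.

```python
# Sketch of the raw OpenAI-compatible streaming behaviour this change relies on.
# Endpoint and model are placeholders; this is not code from the repository.
import asyncio

from openai import AsyncOpenAI


async def main() -> None:
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="unused")
    stream = await client.chat.completions.create(
        model="my-model",
        messages=[{"role": "user", "content": "hi"}],
        stream=True,
        stream_options={"include_usage": True},
    )
    async for chunk in stream:
        if chunk.usage is not None:
            # Terminal chunk: empty choices, populated usage. These counts are
            # what a server-side cost accumulator reads after the stream ends.
            print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens)


asyncio.run(main())
```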
56 changes: 56 additions & 0 deletions packages/fipsagents/tests/test_llm.py
@@ -359,3 +359,59 @@ def always_fails(resp: ModelResponse) -> str:
validator_fn=always_fails,
max_retries=2,
)


# ---------------------------------------------------------------------------
# LLMClient.call_model_stream_raw — stream_options.include_usage default
# ---------------------------------------------------------------------------


class TestLLMClientCallModelStreamRaw:
@pytest.mark.asyncio
async def test_sets_include_usage_by_default(self):
"""Streaming calls must request the terminal usage chunk so the
server-layer cost-tracking accumulator sees prompt/completion tokens.
Regression for #118.
"""
config = LLMConfig(name="test-model")

async def empty_stream():
if False:
yield # type: ignore[unreachable]

with patch("fipsagents.baseagent.llm.AsyncOpenAI") as mock_cls:
mock_client = mock_cls.return_value
mock_create = AsyncMock(return_value=empty_stream())
mock_client.chat.completions.create = mock_create
client = LLMClient(config)
async for _ in client.call_model_stream_raw(
[{"role": "user", "content": "hi"}],
):
pass

kwargs = mock_create.call_args.kwargs
assert kwargs["stream"] is True
assert kwargs["stream_options"] == {"include_usage": True}

@pytest.mark.asyncio
async def test_caller_can_override_stream_options(self):
"""Callers passing stream_options explicitly win over the default."""
config = LLMConfig(name="test-model")

async def empty_stream():
if False:
yield # type: ignore[unreachable]

with patch("fipsagents.baseagent.llm.AsyncOpenAI") as mock_cls:
mock_client = mock_cls.return_value
mock_create = AsyncMock(return_value=empty_stream())
mock_client.chat.completions.create = mock_create
client = LLMClient(config)
async for _ in client.call_model_stream_raw(
[{"role": "user", "content": "hi"}],
stream_options={"include_usage": False},
):
pass

kwargs = mock_create.call_args.kwargs
assert kwargs["stream_options"] == {"include_usage": False}