Skip to content

Commit 6a4c564

Browse files
authored
Merge pull request #883 from microsasa/fix/880-token-value-equality-8342e9917f56ad22
fix: extract shared parse_token_int and add value-equality tests (#880)
2 parents d04b63d + aac155e commit 6a4c564

3 files changed

Lines changed: 95 additions & 42 deletions

File tree

src/copilot_usage/models.py

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
"session_sort_key",
4141
"shutdown_output_tokens",
4242
"total_output_tokens",
43+
"parse_token_int",
4344
]
4445

4546
# ---------------------------------------------------------------------------
@@ -208,6 +209,36 @@ class ToolRequest(BaseModel):
208209
type: str = ""
209210

210211

212+
def parse_token_int(raw: object) -> int | None:
213+
"""Parse a raw ``outputTokens`` value into a positive ``int``, or ``None``.
214+
215+
Centralises the token-validation rules shared by
216+
:meth:`AssistantMessageData._sanitize_non_numeric_tokens` (Pydantic
217+
boundary) and :func:`~copilot_usage.parser._extract_output_tokens`
218+
(parser fast path).
219+
220+
Rules:
221+
222+
- ``bool`` / ``str`` → ``None`` (invalid, not coerced)
223+
- non-whole ``float`` → ``None``
224+
- zero or negative ``int`` / ``float`` → ``None``
225+
- positive whole-number ``float`` → coerced to ``int``
226+
- positive ``int`` → returned as-is
227+
- any other type → ``None``
228+
"""
229+
if isinstance(raw, (bool, str)):
230+
return None
231+
if isinstance(raw, float):
232+
if not raw.is_integer():
233+
return None
234+
tokens = int(raw)
235+
elif isinstance(raw, int):
236+
tokens = raw
237+
else:
238+
return None
239+
return tokens if tokens > 0 else None
240+
241+
211242
class AssistantMessageData(BaseModel):
212243
"""Payload for ``assistant.message`` events."""
213244

@@ -221,25 +252,20 @@ class AssistantMessageData(BaseModel):
221252
def _sanitize_non_numeric_tokens(cls, v: object) -> object:
222253
"""Map non-positive, non-numeric, and non-whole-float token counts to ``0``.
223254
224-
JSON ``true``/``false``, numeric strings like ``"100"``,
225-
non-positive numeric values, and non-integer floats (e.g. ``1.5``)
226-
are not valid token counts. Returning ``0`` preserves parsing of
227-
the rest of the assistant message payload while preventing these
228-
values from being lax-coerced into token counts.
229-
230-
This aligns with ``_extract_output_tokens`` in the parser fast path:
231-
both paths agree that only positive whole-number values contribute
232-
tokens.
255+
Delegates to :func:`parse_token_int` for the actual validation
256+
logic. ``None`` (JSON ``null``) and types the helper recognises
257+
(``bool``, ``str``, ``int``, ``float``) are mapped to ``0`` when
258+
they don't represent a positive whole-number count, so that
259+
Pydantic's downstream ``int`` coercion always succeeds. Unknown
260+
types are passed through so that Pydantic can raise its own
261+
``ValidationError``.
233262
"""
234-
if isinstance(v, (bool, str)):
235-
return 0
236-
if isinstance(v, float):
237-
if not v.is_integer() or v <= 0:
238-
return 0
239-
return int(v)
240-
if isinstance(v, int) and v <= 0:
263+
if v is None:
241264
return 0
242-
return v
265+
if not isinstance(v, (bool, str, int, float)):
266+
return v
267+
result = parse_token_int(v)
268+
return result if result is not None else 0
243269

244270
reasoningText: str | None = None
245271
reasoningOpaque: str | None = None

src/copilot_usage/parser.py

Lines changed: 8 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
TokenUsage,
3737
add_to_model_metrics,
3838
copy_model_metrics,
39+
parse_token_int,
3940
session_sort_key,
4041
)
4142

@@ -275,35 +276,17 @@ def _read_config_model(config_path: Path | None = None) -> str | None:
275276
def _extract_output_tokens(ev: SessionEvent) -> int | None:
276277
"""Extract ``outputTokens`` from an ``assistant.message`` event via direct dict access.
277278
278-
Mirrors the domain intent of :class:`AssistantMessageData`'s
279-
``_sanitize_non_numeric_tokens`` field-validator: only positive
280-
whole-number values contribute tokens. Both paths agree on whether a
281-
value contributes tokens; the representation differs for
282-
non-contributing values — this function returns ``None``, whereas the
283-
Pydantic model stores ``0``.
284-
285-
Specifically:
286-
287-
- ``bool`` / ``str`` → ``None`` (invalid, not coerced)
288-
- zero or negative ``int`` / ``float`` → ``None`` (non-positive)
289-
- whole-number positive ``float`` → coerced to ``int``
290-
- non-whole ``float`` / other non-numeric → ``None``
279+
Delegates to :func:`~copilot_usage.models.parse_token_int` for the
280+
actual validation logic, ensuring that the fast path and the Pydantic
281+
model path agree on whether a value contributes to token totals. For
282+
contributing values, they also agree on the numeric token count;
283+
non-contributing values may be represented differently here as ``None``
284+
and in the model as ``0``.
291285
292286
Callers are responsible for verifying ``ev.type`` before calling; this
293287
function reads only the ``outputTokens`` key from the event data dict.
294288
"""
295-
raw = ev.data.get("outputTokens")
296-
if raw is None or isinstance(raw, (bool, str)):
297-
return None
298-
if isinstance(raw, float):
299-
if not raw.is_integer():
300-
return None
301-
tokens = int(raw)
302-
elif isinstance(raw, int):
303-
tokens = raw
304-
else:
305-
return None
306-
return tokens if tokens > 0 else None
289+
return parse_token_int(ev.data.get("outputTokens"))
307290

308291

309292
def _full_scandir_discovery(

tests/copilot_usage/test_parser.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4808,6 +4808,7 @@ def test_missing_key_no_pydantic(self) -> None:
48084808
# ---------------------------------------------------------------------------
48094809

48104810
_EQUIVALENCE_CASES: list[tuple[str, object]] = [
4811+
("none_null", None),
48114812
("bool_true", True),
48124813
("bool_false", False),
48134814
("str_numeric", "100"),
@@ -4850,6 +4851,49 @@ def test_positive_contribution_agreement(
48504851
)
48514852

48524853

4854+
_VALUE_EQUALITY_CASES: list[tuple[str, object, int]] = [
4855+
("positive_int_1", 1, 1),
4856+
("positive_int_42", 42, 42),
4857+
("whole_float_1234", 1234.0, 1234),
4858+
("whole_float_1", 1.0, 1),
4859+
]
4860+
4861+
4862+
class TestExtractOutputTokensValueEquality:
4863+
"""When both paths agree a value contributes, the numeric counts must be identical."""
4864+
4865+
@pytest.mark.parametrize(
4866+
("label", "raw_value", "expected"),
4867+
_VALUE_EQUALITY_CASES,
4868+
ids=[c[0] for c in _VALUE_EQUALITY_CASES],
4869+
)
4870+
def test_token_value_agreement(
4871+
self, label: str, raw_value: object, expected: int
4872+
) -> None:
4873+
"""For contributing values, fast-path and model-path produce the same integer count."""
4874+
_ = label
4875+
fast_path_result = _extract_output_tokens(_make_assistant_event(raw_value))
4876+
model = AssistantMessageData.model_validate({"outputTokens": raw_value})
4877+
4878+
assert fast_path_result is not None, (
4879+
f"Expected fast path to contribute for {raw_value!r}"
4880+
)
4881+
assert model.outputTokens > 0, f"Expected model to contribute for {raw_value!r}"
4882+
assert fast_path_result == expected, (
4883+
f"Fast-path count mismatch for {raw_value!r}: "
4884+
f"got {fast_path_result}, expected {expected}"
4885+
)
4886+
assert model.outputTokens == expected, (
4887+
f"Model count mismatch for {raw_value!r}: "
4888+
f"got {model.outputTokens}, expected {expected}"
4889+
)
4890+
assert fast_path_result == model.outputTokens, (
4891+
f"Value divergence for {raw_value!r}: "
4892+
f"_extract_output_tokens → {fast_path_result}, "
4893+
f"AssistantMessageData.outputTokens → {model.outputTokens}"
4894+
)
4895+
4896+
48534897
class TestExtractOutputTokensIntegration:
48544898
"""Integration tests exercising _extract_output_tokens through parse → build_session_summary."""
48554899

0 commit comments

Comments
 (0)