From 931e7e45b6c6e79f6356090ec42cae0e53d69944 Mon Sep 17 00:00:00 2001 From: xodn348 Date: Wed, 6 May 2026 09:08:28 +0000 Subject: [PATCH] fix(tokenizers): raise ValueError instead of TypeError when Content-Length is absent --- src/cohere/manually_maintained/tokenizers.py | 5 ++- tests/test_tokenizer_config_size.py | 47 ++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 tests/test_tokenizer_config_size.py diff --git a/src/cohere/manually_maintained/tokenizers.py b/src/cohere/manually_maintained/tokenizers.py index c2681c62f..2cee825f3 100644 --- a/src/cohere/manually_maintained/tokenizers.py +++ b/src/cohere/manually_maintained/tokenizers.py @@ -99,4 +99,7 @@ def _get_tokenizer_config_size(tokenizer_url: str) -> float: if size: break - return round(int(typing.cast(int, size)) / 1024 / 1024, 2) + if size is None: + raise ValueError("Content-Length unavailable (server may use chunked transfer encoding)") + + return round(int(size) / 1024 / 1024, 2) diff --git a/tests/test_tokenizer_config_size.py b/tests/test_tokenizer_config_size.py new file mode 100644 index 000000000..c103d9973 --- /dev/null +++ b/tests/test_tokenizer_config_size.py @@ -0,0 +1,47 @@ +import sys +import types +import unittest +from unittest.mock import MagicMock, patch + +# Stub out the `tokenizers` C-extension so the module can be imported in CI +# without the native library present. 
+try:
+    import tokenizers  # noqa: F401  # prefer the real package whenever it is installed
+except ImportError:  # only fake the module when the real one is genuinely missing
+    tokenizers_stub = types.ModuleType("tokenizers")
+    tokenizers_stub.Tokenizer = object  # type: ignore[attr-defined]
+    sys.modules["tokenizers"] = tokenizers_stub
+
+from cohere.manually_maintained.tokenizers import _get_tokenizer_config_size
+
+
+class TestGetTokenizerConfigSize(unittest.TestCase):
+    """Unit tests for ``_get_tokenizer_config_size`` with ``requests.head`` mocked out."""
+
+    def _make_head_response(self, headers: dict) -> MagicMock:  # fake response exposing only ``headers``
+        resp = MagicMock()
+        resp.headers = headers
+        return resp
+
+    def test_content_length_header(self) -> None:
+        with patch("requests.head", return_value=self._make_head_response({"Content-Length": "2097152"})):
+            size = _get_tokenizer_config_size("https://example.com/tokenizer.json")
+        self.assertAlmostEqual(size, 2.0)  # 2097152 bytes / 1024 / 1024
+
+    def test_goog_stored_content_length_header(self) -> None:
+        with patch("requests.head", return_value=self._make_head_response({"x-goog-stored-content-length": "1048576"})):
+            size = _get_tokenizer_config_size("https://example.com/tokenizer.json")
+        self.assertAlmostEqual(size, 1.0)  # 1048576 bytes / 1024 / 1024
+
+    def test_goog_header_takes_priority_over_content_length(self) -> None:
+        headers = {"x-goog-stored-content-length": "1048576", "Content-Length": "2097152"}
+        with patch("requests.head", return_value=self._make_head_response(headers)):
+            size = _get_tokenizer_config_size("https://example.com/tokenizer.json")
+        self.assertAlmostEqual(size, 1.0)  # the x-goog value (1 MiB) is expected, not Content-Length
+
+    def test_raises_value_error_when_no_size_header(self) -> None:
+        """Chunked-transfer responses omit Content-Length; must raise ValueError, not TypeError."""
+        with patch("requests.head", return_value=self._make_head_response({})):
+            with self.assertRaises(ValueError) as ctx:
+                _get_tokenizer_config_size("https://example.com/tokenizer.json")
+            self.assertIn("Content-Length unavailable", str(ctx.exception))