From d0b2ac26d231d3dd533d0f93a9584c38516689d6 Mon Sep 17 00:00:00 2001
From: voorhs
Date: Sun, 10 May 2026 15:43:14 +0300
Subject: [PATCH 1/5] implement tiktoken encoding fallback for unknown
 embedding model ids

---
 src/autointent/_wrappers/embedder/openai.py | 27 ++++++++++++++++++---
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/src/autointent/_wrappers/embedder/openai.py b/src/autointent/_wrappers/embedder/openai.py
index 5cb41145..97af725d 100644
--- a/src/autointent/_wrappers/embedder/openai.py
+++ b/src/autointent/_wrappers/embedder/openai.py
@@ -24,6 +24,7 @@
 import numpy
 import numpy.typing as npt
 import openai
+from tiktoken import Encoding
 from typing_extensions import NotRequired
 
 from autointent.configs import TaskTypeEnum
@@ -31,6 +32,27 @@
 
 logger = logging.getLogger(__name__)
 
+# Third-party embedding model ids (e.g. OpenRouter) are unknown to tiktoken; use a conservative encoding
+# only for counting tokens when splitting batches.
+_FALLBACK_TIKTOKEN_ENCODING = "cl100k_base"
+
+
+def _tiktoken_encoding_for_embedding_model(model_name: str) -> Encoding:
+    """Resolve tiktoken encoding for batch sizing; fall back for unknown provider model ids."""
+    require("tiktoken", "openai")
+    import tiktoken
+
+    try:
+        return tiktoken.encoding_for_model(model_name)
+    except KeyError:
+        logger.warning(
+            "tiktoken has no mapping for embedding model %r; using %r for token counting "
+            "(per-request batch limits are approximate).",
+            model_name,
+            _FALLBACK_TIKTOKEN_ENCODING,
+        )
+        return tiktoken.get_encoding(_FALLBACK_TIKTOKEN_ENCODING)
+
 
 class EmbeddingsCreateKwargs(TypedDict):
     input: list[str]
@@ -325,10 +347,7 @@ def _batch_strings_by_token_budget(
     if max_tokens_per_batch is None:
         return [texts[i : i + max_strings_per_batch] for i in range(0, len(texts), max_strings_per_batch)]
 
-    require("tiktoken", "openai")
-    import tiktoken
-
-    encoding = tiktoken.encoding_for_model(model_name)
+    encoding = _tiktoken_encoding_for_embedding_model(model_name)
     batches: list[list[str]] = []
     current_batch: list[str] = []
     current_tokens = 0

From c2f4339608a64715e807ac833bd7b18c27fa945f Mon Sep 17 00:00:00 2001
From: voorhs
Date: Sun, 10 May 2026 15:43:18 +0300
Subject: [PATCH 2/5] add test for unknown-model fallback encoding

---
 tests/embedder/test_openai_token_batching.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/embedder/test_openai_token_batching.py b/tests/embedder/test_openai_token_batching.py
index 88105f51..fded49b9 100644
--- a/tests/embedder/test_openai_token_batching.py
+++ b/tests/embedder/test_openai_token_batching.py
@@ -23,6 +23,17 @@ def test_batch_strings_none_max_tokens_uses_batch_size_only() -> None:
     assert batches == [["a", "b", "c"], ["d", "e", "f"]]
 
 
+def test_batch_strings_unknown_model_uses_fallback_encoding() -> None:
+    """Third-party embedding ids (e.g. OpenRouter) are not in tiktoken's model map."""
+    batches = _batch_strings_by_token_budget(
+        ["hello", "world"],
+        model_name="qwen/qwen3-embedding-8b",
+        max_strings_per_batch=10,
+        max_tokens_per_batch=100,
+    )
+    assert batches == [["hello", "world"]]
+
+
 def test_batch_strings_respects_token_budget() -> None:
     encoding = tiktoken.encoding_for_model("text-embedding-3-small")
     batches = _batch_strings_by_token_budget(
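
Note on PATCH 1/5 and 2/5: tiktoken.encoding_for_model raises KeyError for any
model id outside its built-in model map, which is exactly what happens with
third-party ids routed through providers like OpenRouter. A minimal standalone
sketch of the behavior the fallback handles (not part of the series; assumes
tiktoken is installed):

import tiktoken

try:
    encoding = tiktoken.encoding_for_model("qwen/qwen3-embedding-8b")
except KeyError:
    # Same path the new helper takes: fall back to a conservative encoding
    # that is used only for counting tokens when splitting batches.
    encoding = tiktoken.get_encoding("cl100k_base")

print(len(encoding.encode("hello world")))  # 2 tokens under cl100k_base

The counts are only approximate for third-party models, which is why the helper
logs a warning rather than failing the request.
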
From 46ff7e3999e3c41cf98807d738a02c7c279c4a9d Mon Sep 17 00:00:00 2001
From: voorhs
Date: Sun, 10 May 2026 19:12:59 +0300
Subject: [PATCH 3/5] propagate error message from OpenAI API

---
 src/autointent/_wrappers/embedder/openai.py  | 42 +++++++++++++++++++-
 tests/embedder/test_openai_token_batching.py | 18 +++++++++
 2 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/src/autointent/_wrappers/embedder/openai.py b/src/autointent/_wrappers/embedder/openai.py
index 97af725d..9eada4f0 100644
--- a/src/autointent/_wrappers/embedder/openai.py
+++ b/src/autointent/_wrappers/embedder/openai.py
@@ -35,6 +35,44 @@
 # Third-party embedding model ids (e.g. OpenRouter) are unknown to tiktoken; use a conservative encoding
 # only for counting tokens when splitting batches.
 _FALLBACK_TIKTOKEN_ENCODING = "cl100k_base"
+_ERROR_DETAIL_LIMIT = 2000
+
+
+def _compact_error_detail(value: object) -> str:
+    """Render provider error details without letting huge bodies flood logs/results."""
+    if isinstance(value, (dict, list, tuple)):
+        try:
+            text = json.dumps(value, ensure_ascii=False)
+        except TypeError:
+            text = repr(value)
+    else:
+        text = str(value)
+
+    if len(text) <= _ERROR_DETAIL_LIMIT:
+        return text
+    return f"{text[:_ERROR_DETAIL_LIMIT]}..."
+
+
+def _openai_api_error_message(exc: BaseException, *, batch_size: int) -> str:
+    """Build a RuntimeError message that preserves useful OpenAI/provider details."""
+    details = [f"{exc.__class__.__name__}: {_compact_error_detail(exc)}"]
+
+    for attr in ("status_code", "code", "type", "body"):
+        value = getattr(exc, attr, None)
+        if value is not None:
+            details.append(f"{attr}={_compact_error_detail(value)}")
+
+    response = getattr(exc, "response", None)
+    if response is not None:
+        status_code = getattr(response, "status_code", None)
+        if status_code is not None:
+            details.append(f"response_status_code={status_code}")
+
+        response_text = getattr(response, "text", None)
+        if response_text:
+            details.append(f"response_text={_compact_error_detail(response_text)}")
+
+    return f"Error calling OpenAI API (batch_size={batch_size}): {'; '.join(details)}"
 
 
 def _tiktoken_encoding_for_embedding_model(model_name: str) -> Encoding:
@@ -230,7 +268,7 @@ def _process_embeddings_sync(self, utterances: list[str]) -> npt.NDArray[np.floa
                 all_embeddings.extend(batch_embeddings)
 
         except Exception as e:
-            msg = "Error calling OpenAI API"
+            msg = _openai_api_error_message(e, batch_size=len(batch))
             logger.exception(msg)
             raise RuntimeError(msg) from e
 
@@ -275,7 +313,7 @@ async def _process_batch_async(self, batch: list[str]) -> list[list[float]]:
             response = await client.embeddings.create(**kwargs)
             return [data.embedding for data in response.data]
         except Exception as e:
-            msg = f"Error calling OpenAI API for batch: {e}"
+            msg = _openai_api_error_message(e, batch_size=len(batch))
             logger.exception(msg)
             raise RuntimeError(msg) from e
 
diff --git a/tests/embedder/test_openai_token_batching.py b/tests/embedder/test_openai_token_batching.py
index fded49b9..afe6d560 100644
--- a/tests/embedder/test_openai_token_batching.py
+++ b/tests/embedder/test_openai_token_batching.py
@@ -8,6 +8,7 @@
 from autointent._wrappers.embedder.openai import (  # noqa: E402
     OpenaiEmbeddingBackend,
     _batch_strings_by_token_budget,
+    _openai_api_error_message,
 )
 from autointent.configs import OpenaiEmbeddingConfig  # noqa: E402
 
@@ -72,3 +73,20 @@ def test_embedding_request_batches_on_backend() -> None:
     backend = OpenaiEmbeddingBackend(config)
     batches = backend._embedding_request_batches(["hello"] * 12)
     assert sum(len(b) for b in batches) == 12
+
+
+def test_openai_api_error_message_preserves_provider_details() -> None:
+    class ProviderError(Exception):
+        def __init__(self, message: str) -> None:
+            super().__init__(message)
+            self.status_code = 400
+            self.code = "context_length_exceeded"
+            self.body = {"error": {"message": "input is too long"}}
+
+    message = _openai_api_error_message(ProviderError("No embedding data received"), batch_size=3)
+
+    assert "Error calling OpenAI API (batch_size=3)" in message
+    assert "ProviderError: No embedding data received" in message
+    assert "status_code=400" in message
+    assert "code=context_length_exceeded" in message
+    assert "input is too long" in message
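
Note on PATCH 3/5: _openai_api_error_message only probes attributes via getattr
with a None default, so it works for SDK exceptions and plain exceptions alike.
A sketch of the message it builds (ProviderError is a hypothetical stand-in for
whatever the SDK raises; its attribute names match the helper's probes):

from autointent._wrappers.embedder.openai import _openai_api_error_message

class ProviderError(Exception):
    status_code = 400
    code = "context_length_exceeded"
    body = {"error": {"message": "input is too long"}}

msg = _openai_api_error_message(ProviderError("request rejected"), batch_size=3)
print(msg)
# Error calling OpenAI API (batch_size=3): ProviderError: request rejected;
# status_code=400; code=context_length_exceeded;
# body={"error": {"message": "input is too long"}}
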
From 70dbb9ee5a95fbd02c915ed99892fd3c52aafbe1 Mon Sep 17 00:00:00 2001
From: voorhs
Date: Mon, 11 May 2026 12:02:57 +0300
Subject: [PATCH 4/5] change default for openai max tokens in batch

---
 src/autointent/configs/_embedder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/autointent/configs/_embedder.py b/src/autointent/configs/_embedder.py
index 26891173..0e7440e2 100644
--- a/src/autointent/configs/_embedder.py
+++ b/src/autointent/configs/_embedder.py
@@ -90,7 +90,7 @@ class OpenaiEmbeddingConfig(BaseEmbedderConfig):
     model_name: str = Field("text-embedding-3-small", description="Name of the OpenAI embedding model.")
     batch_size: int = Field(100, description="Batch size for API requests.")
     max_tokens_in_batch: PositiveInt | None = Field(
-        None,
+        200_000,
         description=(
             "When set, cap each embeddings API call by the summed tiktoken length of inputs "
             "(using the encoding for `model_name`). Requests are also limited to at most "

From 2c807d9491e704d4423f1bfbcc0494c9e9c9614a Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 11 May 2026 09:03:58 +0000
Subject: [PATCH 5/5] Update optimizer_config.schema.json

---
 docs/optimizer_config.schema.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/optimizer_config.schema.json b/docs/optimizer_config.schema.json
index a556c72b..6f325f62 100644
--- a/docs/optimizer_config.schema.json
+++ b/docs/optimizer_config.schema.json
@@ -720,7 +720,7 @@
             "type": "null"
           }
         ],
-        "default": null,
+        "default": 200000,
         "description": "When set, cap each embeddings API call by the summed tiktoken length of inputs (using the encoding for `model_name`). Requests are also limited to at most `batch_size` strings. Use values around 200000 to avoid OpenAI `max_tokens_per_request` errors on long texts. Requires `tiktoken` (installed with `autointent[openai]`).",
         "title": "Max Tokens In Batch"
       },
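
Note on PATCH 4/5 and 5/5: flipping the default from null to 200000 turns the
token-budget cap on out of the box; the schema update is just the generated
mirror of that change. A sketch of what downstream code now sees (assumes
OpenaiEmbeddingConfig has no other required fields; credentials, if any, would
come from the environment):

from autointent.configs import OpenaiEmbeddingConfig

config = OpenaiEmbeddingConfig()
assert config.max_tokens_in_batch == 200_000  # was None before this series
assert config.batch_size == 100  # per-request string cap, unchanged

# Passing None explicitly restores the old size-only batching.
unbounded = OpenaiEmbeddingConfig(max_tokens_in_batch=None)
assert unbounded.max_tokens_in_batch is None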