From 5c80731a63ddee9e6906c713fae6e3d8c8722086 Mon Sep 17 00:00:00 2001 From: Aleksandr Kovalko Date: Tue, 20 Jan 2026 23:20:43 +0700 Subject: [PATCH 1/2] Zero token counts for empty transcripts --- README.md | 3 ++- database/models.py | 4 ++++ requirements.txt | 1 + schedulers/transcription.py | 20 +++++++++++++++++--- utils/tokens.py | 31 +++++++++++++++++++++++++++++++ 5 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 utils/tokens.py diff --git a/README.md b/README.md index a8c0df7..c3ecc3e 100644 --- a/README.md +++ b/README.md @@ -158,6 +158,7 @@ CREATE TABLE IF NOT EXISTS transcription_history ( price_rub DECIMAL(10,2), result_s3_path TEXT, result_json TEXT, + llm_tokens_by_model JSON, operation_id VARCHAR(128), message_id INTEGER, chat_id BIGINT, @@ -268,4 +269,4 @@ In that case install libmysqlclient-dev: `sudo apt install libmysqlclient-dev` o [1]: https://t.me/ClearTranscriptBot [2]: https://cloud.yandex.com/docs/speechkit/ -[3]: https://core.telegram.org/bots/api \ No newline at end of file +[3]: https://core.telegram.org/bots/api diff --git a/database/models.py b/database/models.py index 1f7a2c7..67c0911 100644 --- a/database/models.py +++ b/database/models.py @@ -5,6 +5,7 @@ DateTime, ForeignKey, Integer, + JSON, Numeric, String, Text, @@ -59,6 +60,9 @@ class TranscriptionHistory(Base): # Raw recognition result returned by SpeechKit result_json = Column(Text, nullable=True) + # Token counts for transcribed text by model + llm_tokens_by_model = Column(JSON, nullable=True) + # Duration of the audio in seconds duration_seconds = Column(Integer, nullable=True) diff --git a/requirements.txt b/requirements.txt index 100ebf0..bd33c81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ boto3 pytz requests httpx +tiktoken # Monitoring / error reporting sentry_sdk diff --git a/schedulers/transcription.py b/schedulers/transcription.py index d41ab6f..01b86bf 100644 --- a/schedulers/transcription.py +++ b/schedulers/transcription.py @@ -13,6 +13,7 @@ from utils.speechkit import fetch_transcription_result, parse_text, format_duration from utils.tg import safe_edit_message_text from utils.s3 import upload_file +from utils.tokens import tokens_by_model EDIT_INTERVAL_SEC = 5 # не редактировать чаще, чем раз в 5 сек @@ -85,7 +86,10 @@ async def check_running_tasks(context: ContextTypes.DEFAULT_TYPE) -> None: ) continue - text = parse_text(result) + raw_text = parse_text(result) + token_counts = tokens_by_model(raw_text) + + text = raw_text if not text.strip(): text = "(речь в записи отсутствует или слишком неразборчива для распознавания)" @@ -117,13 +121,23 @@ async def check_running_tasks(context: ContextTypes.DEFAULT_TYPE) -> None: try: await context.bot.send_document(chat_id=task.telegram_id, document=path.open("rb")) - update_transcription(task.id, status="completed", result_s3_path=s3_uri) + update_transcription( + task.id, + status="completed", + result_s3_path=s3_uri, + llm_tokens_by_model=token_counts, + ) except Exception as e: logging.error(f"Failed to send result for task {task.id}: {e}") if os.getenv("ENABLE_SENTRY") == "1": sentry_sdk.capture_exception(e) - update_transcription(task.id, status="failed", result_s3_path=s3_uri) + update_transcription( + task.id, + status="failed", + result_s3_path=s3_uri, + llm_tokens_by_model=token_counts, + ) await safe_edit_message_text( context.bot, diff --git a/utils/tokens.py b/utils/tokens.py new file mode 100644 index 0000000..8bc1805 --- /dev/null +++ b/utils/tokens.py @@ -0,0 +1,31 @@ +"""Utilities for counting LLM tokens.""" +import tiktoken + + +LLM_TOKEN_MODELS = [ + "gpt-5.2", + "gpt-5.1", + "gpt-5-mini", + "gpt-5-nano", +] + +DEFAULT_ENCODING = "o200k_base" + + +def count_tokens(text: str, model: str = LLM_TOKEN_MODELS[0]) -> int: + """Count tokens in *text* using tiktoken encoding for *model*.""" + try: + encoding = tiktoken.encoding_for_model(model) + except KeyError: + encoding = tiktoken.get_encoding(DEFAULT_ENCODING) + return len(encoding.encode(text)) + + +def tokens_by_model(text: str) -> dict[str, int]: + """Return token counts for *text* across supported models.""" + if not text.strip(): + return {model: 0 for model in LLM_TOKEN_MODELS} + return { + model: count_tokens(text, model=model) + for model in LLM_TOKEN_MODELS + } From 0d272de4d1ffc3ee1f37f5f56e7326547a020faa Mon Sep 17 00:00:00 2001 From: Aleksandr Kovalko Date: Tue, 20 Jan 2026 23:38:53 +0700 Subject: [PATCH 2/2] fixup! Zero token counts for empty transcripts --- README.md | 30 +++++++++++++++--------------- schedulers/transcription.py | 7 +++---- utils/tokens.py | 10 ++++++---- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index c3ecc3e..fc45759 100644 --- a/README.md +++ b/README.md @@ -150,20 +150,20 @@ CREATE TABLE IF NOT EXISTS users ( -- History of transcription requests made by users CREATE TABLE IF NOT EXISTS transcription_history ( - id INTEGER PRIMARY KEY AUTO_INCREMENT, - telegram_id BIGINT NOT NULL REFERENCES users(telegram_id), - status VARCHAR(32) NOT NULL, - audio_s3_path TEXT NOT NULL, - duration_seconds INTEGER, - price_rub DECIMAL(10,2), - result_s3_path TEXT, - result_json TEXT, + id INTEGER PRIMARY KEY AUTO_INCREMENT, + telegram_id BIGINT NOT NULL REFERENCES users(telegram_id), + status VARCHAR(32) NOT NULL, + audio_s3_path TEXT NOT NULL, + duration_seconds INTEGER, + price_rub DECIMAL(10,2), + result_s3_path TEXT, + result_json TEXT, llm_tokens_by_model JSON, - operation_id VARCHAR(128), - message_id INTEGER, - chat_id BIGINT, - started_at TIMESTAMP, - finished_at TIMESTAMP + operation_id VARCHAR(128), + message_id INTEGER, + chat_id BIGINT, + started_at TIMESTAMP, + finished_at TIMESTAMP ); -- Index to speed up lookups by user @@ -264,8 +264,8 @@ In that case install libmysqlclient-dev: `sudo apt install libmysqlclient-dev` o ## References -- [Yandex Cloud SpeechKit docs][2] -- [Telegram Bot API][3] +- [Yandex Cloud SpeechKit docs][2] +- [Telegram Bot API][3] [1]: https://t.me/ClearTranscriptBot [2]: https://cloud.yandex.com/docs/speechkit/ diff --git a/schedulers/transcription.py b/schedulers/transcription.py index 01b86bf..03ad705 100644 --- a/schedulers/transcription.py +++ b/schedulers/transcription.py @@ -86,11 +86,10 @@ async def check_running_tasks(context: ContextTypes.DEFAULT_TYPE) -> None: ) continue - raw_text = parse_text(result) - token_counts = tokens_by_model(raw_text) + text = parse_text(result).strip() + token_counts = tokens_by_model(text) - text = raw_text - if not text.strip(): + if not text: text = "(речь в записи отсутствует или слишком неразборчива для распознавания)" source_stem = Path(task.audio_s3_path).stem diff --git a/utils/tokens.py b/utils/tokens.py index 8bc1805..f2b6043 100644 --- a/utils/tokens.py +++ b/utils/tokens.py @@ -1,6 +1,8 @@ """Utilities for counting LLM tokens.""" import tiktoken +from typing import Optional + LLM_TOKEN_MODELS = [ "gpt-5.2", @@ -12,16 +14,16 @@ DEFAULT_ENCODING = "o200k_base" -def count_tokens(text: str, model: str = LLM_TOKEN_MODELS[0]) -> int: +def count_tokens(text: str, model: str = LLM_TOKEN_MODELS[0]) -> Optional[int]: """Count tokens in *text* using tiktoken encoding for *model*.""" try: encoding = tiktoken.encoding_for_model(model) + return len(encoding.encode(text)) except KeyError: - encoding = tiktoken.get_encoding(DEFAULT_ENCODING) - return len(encoding.encode(text)) + return None -def tokens_by_model(text: str) -> dict[str, int]: +def tokens_by_model(text: str) -> dict[str, Optional[int]]: """Return token counts for *text* across supported models.""" if not text.strip(): return {model: 0 for model in LLM_TOKEN_MODELS}