From 5c80731a63ddee9e6906c713fae6e3d8c8722086 Mon Sep 17 00:00:00 2001
From: Aleksandr Kovalko <gistrec@gmail.com>
Date: Tue, 20 Jan 2026 23:20:43 +0700
Subject: [PATCH 1/2] Zero token counts for empty transcripts

---
 README.md                   |  3 ++-
 database/models.py          |  4 ++++
 requirements.txt            |  1 +
 schedulers/transcription.py | 20 +++++++++++++++++---
 utils/tokens.py             | 31 +++++++++++++++++++++++++++++++
 5 files changed, 55 insertions(+), 4 deletions(-)
 create mode 100644 utils/tokens.py

diff --git a/README.md b/README.md
index a8c0df7..c3ecc3e 100644
--- a/README.md
+++ b/README.md
@@ -158,6 +158,7 @@ CREATE TABLE IF NOT EXISTS transcription_history (
     price_rub        DECIMAL(10,2),
     result_s3_path   TEXT,
     result_json      TEXT,
+    llm_tokens_by_model   JSON,
     operation_id     VARCHAR(128),
     message_id       INTEGER,
     chat_id          BIGINT,
@@ -268,4 +269,4 @@ In that case install libmysqlclient-dev: `sudo apt install libmysqlclient-dev` o
 
 [1]: https://t.me/ClearTranscriptBot
 [2]: https://cloud.yandex.com/docs/speechkit/
-[3]: https://core.telegram.org/bots/api
\ No newline at end of file
+[3]: https://core.telegram.org/bots/api
diff --git a/database/models.py b/database/models.py
index 1f7a2c7..67c0911 100644
--- a/database/models.py
+++ b/database/models.py
@@ -5,6 +5,7 @@
     DateTime,
     ForeignKey,
     Integer,
+    JSON,
     Numeric,
     String,
     Text,
@@ -59,6 +60,9 @@ class TranscriptionHistory(Base):
     # Raw recognition result returned by SpeechKit
     result_json = Column(Text, nullable=True)
 
+    # LLM token counts of the transcript, as a JSON object keyed by model name
+    llm_tokens_by_model = Column(JSON, nullable=True)
+
     # Duration of the audio in seconds
     duration_seconds = Column(Integer, nullable=True)
 
diff --git a/requirements.txt b/requirements.txt
index 100ebf0..bd33c81 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ boto3
 pytz
 requests
 httpx
+tiktoken
 
 # Monitoring / error reporting
 sentry_sdk
diff --git a/schedulers/transcription.py b/schedulers/transcription.py
index d41ab6f..01b86bf 100644
--- a/schedulers/transcription.py
+++ b/schedulers/transcription.py
@@ -13,6 +13,7 @@
 from utils.speechkit import fetch_transcription_result, parse_text, format_duration
 from utils.tg import safe_edit_message_text
 from utils.s3 import upload_file
+from utils.tokens import tokens_by_model
 
 
 EDIT_INTERVAL_SEC = 5  # не редактировать чаще, чем раз в 5 сек
@@ -85,7 +86,10 @@ async def check_running_tasks(context: ContextTypes.DEFAULT_TYPE) -> None:
             )
             continue
 
-        text = parse_text(result)
+        raw_text = parse_text(result)
+        token_counts = tokens_by_model(raw_text)
+
+        text = raw_text
         if not text.strip():
             text = "(речь в записи отсутствует или слишком неразборчива для распознавания)"
 
@@ -117,13 +121,23 @@ async def check_running_tasks(context: ContextTypes.DEFAULT_TYPE) -> None:
 
         try:
             await context.bot.send_document(chat_id=task.telegram_id, document=path.open("rb"))
-            update_transcription(task.id, status="completed", result_s3_path=s3_uri)
+            update_transcription(
+                task.id,
+                status="completed",
+                result_s3_path=s3_uri,
+                llm_tokens_by_model=token_counts,
+            )
         except Exception as e:
             logging.error(f"Failed to send result for task {task.id}: {e}")
             if os.getenv("ENABLE_SENTRY") == "1":
                 sentry_sdk.capture_exception(e)
 
-            update_transcription(task.id, status="failed", result_s3_path=s3_uri)
+            update_transcription(
+                task.id,
+                status="failed",
+                result_s3_path=s3_uri,
+                llm_tokens_by_model=token_counts,
+            )
 
             await safe_edit_message_text(
                 context.bot,
diff --git a/utils/tokens.py b/utils/tokens.py
new file mode 100644
index 0000000..8bc1805
--- /dev/null
+++ b/utils/tokens.py
@@ -0,0 +1,31 @@
+"""Utilities for counting LLM tokens."""
+import tiktoken
+
+
+LLM_TOKEN_MODELS = [
+    "gpt-5.2",
+    "gpt-5.1",
+    "gpt-5-mini",
+    "gpt-5-nano",
+]
+
+DEFAULT_ENCODING = "o200k_base"
+
+
+def count_tokens(text: str, model: str = LLM_TOKEN_MODELS[0]) -> int:
+    """Count tokens in *text* using tiktoken encoding for *model*."""
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        encoding = tiktoken.get_encoding(DEFAULT_ENCODING)
+    return len(encoding.encode(text))
+
+
+def tokens_by_model(text: str) -> dict[str, int]:
+    """Return token counts for *text* across supported models."""
+    if not text.strip():
+        return {model: 0 for model in LLM_TOKEN_MODELS}
+    return {
+        model: count_tokens(text, model=model)
+        for model in LLM_TOKEN_MODELS
+    }

From 0d272de4d1ffc3ee1f37f5f56e7326547a020faa Mon Sep 17 00:00:00 2001
From: Aleksandr Kovalko <gistrec@gmail.com>
Date: Tue, 20 Jan 2026 23:38:53 +0700
Subject: [PATCH 2/2] fixup! Zero token counts for empty transcripts

---
 README.md                   | 30 +++++++++++++++---------------
 schedulers/transcription.py |  7 +++----
 utils/tokens.py             | 10 ++++++----
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index c3ecc3e..fc45759 100644
--- a/README.md
+++ b/README.md
@@ -150,20 +150,20 @@ CREATE TABLE IF NOT EXISTS users (
 
 -- History of transcription requests made by users
 CREATE TABLE IF NOT EXISTS transcription_history (
-    id               INTEGER         PRIMARY KEY AUTO_INCREMENT,
-    telegram_id      BIGINT          NOT NULL REFERENCES users(telegram_id),
-    status           VARCHAR(32)     NOT NULL,
-    audio_s3_path    TEXT            NOT NULL,
-    duration_seconds INTEGER,
-    price_rub        DECIMAL(10,2),
-    result_s3_path   TEXT,
-    result_json      TEXT,
+    id                    INTEGER         PRIMARY KEY AUTO_INCREMENT,
+    telegram_id           BIGINT          NOT NULL REFERENCES users(telegram_id),
+    status                VARCHAR(32)     NOT NULL,
+    audio_s3_path         TEXT            NOT NULL,
+    duration_seconds      INTEGER,
+    price_rub             DECIMAL(10,2),
+    result_s3_path        TEXT,
+    result_json           TEXT,
     llm_tokens_by_model   JSON,
-    operation_id     VARCHAR(128),
-    message_id       INTEGER,
-    chat_id          BIGINT,
-    started_at       TIMESTAMP,
-    finished_at      TIMESTAMP
+    operation_id          VARCHAR(128),
+    message_id            INTEGER,
+    chat_id               BIGINT,
+    started_at            TIMESTAMP,
+    finished_at           TIMESTAMP
 );
 
 -- Index to speed up lookups by user
@@ -264,8 +264,8 @@ In that case install libmysqlclient-dev: `sudo apt install libmysqlclient-dev` o
 
 ## References
 
-- [Yandex Cloud SpeechKit docs][2]  
-- [Telegram Bot API][3]  
+- [Yandex Cloud SpeechKit docs][2]
+- [Telegram Bot API][3]
 
 [1]: https://t.me/ClearTranscriptBot
 [2]: https://cloud.yandex.com/docs/speechkit/
diff --git a/schedulers/transcription.py b/schedulers/transcription.py
index 01b86bf..03ad705 100644
--- a/schedulers/transcription.py
+++ b/schedulers/transcription.py
@@ -86,11 +86,10 @@ async def check_running_tasks(context: ContextTypes.DEFAULT_TYPE) -> None:
             )
             continue
 
-        raw_text = parse_text(result)
-        token_counts = tokens_by_model(raw_text)
+        text = parse_text(result).strip()
+        token_counts = tokens_by_model(text)
 
-        text = raw_text
-        if not text.strip():
+        if not text:
             text = "(речь в записи отсутствует или слишком неразборчива для распознавания)"
 
         source_stem = Path(task.audio_s3_path).stem
diff --git a/utils/tokens.py b/utils/tokens.py
index 8bc1805..f2b6043 100644
--- a/utils/tokens.py
+++ b/utils/tokens.py
@@ -1,6 +1,8 @@
 """Utilities for counting LLM tokens."""
 import tiktoken
 
+from typing import Optional
+
 
 LLM_TOKEN_MODELS = [
     "gpt-5.2",
@@ -12,16 +14,16 @@
 DEFAULT_ENCODING = "o200k_base"
 
 
-def count_tokens(text: str, model: str = LLM_TOKEN_MODELS[0]) -> int:
+def count_tokens(text: str, model: str = LLM_TOKEN_MODELS[0]) -> Optional[int]:
     """Count tokens in *text* using tiktoken encoding for *model*."""
     try:
         encoding = tiktoken.encoding_for_model(model)
+        return len(encoding.encode(text))
     except KeyError:
-        encoding = tiktoken.get_encoding(DEFAULT_ENCODING)
-    return len(encoding.encode(text))
+        return None
 
 
-def tokens_by_model(text: str) -> dict[str, int]:
+def tokens_by_model(text: str) -> dict[str, Optional[int]]:
     """Return token counts for *text* across supported models."""
     if not text.strip():
         return {model: 0 for model in LLM_TOKEN_MODELS}