gistrec · gistrec · Jan 20, 2026 · Jan 20, 2026
diff --git a/README.md b/README.md
@@ -150,19 +150,20 @@ CREATE TABLE IF NOT EXISTS users (
 
 -- History of transcription requests made by users
 CREATE TABLE IF NOT EXISTS transcription_history (
-    id               INTEGER         PRIMARY KEY AUTO_INCREMENT,
-    telegram_id      BIGINT          NOT NULL REFERENCES users(telegram_id),
-    status           VARCHAR(32)     NOT NULL,
-    audio_s3_path    TEXT            NOT NULL,
-    duration_seconds INTEGER,
-    price_rub        DECIMAL(10,2),
-    result_s3_path   TEXT,
-    result_json      TEXT,
-    operation_id     VARCHAR(128),
-    message_id       INTEGER,
-    chat_id          BIGINT,
-    started_at       TIMESTAMP,
-    finished_at      TIMESTAMP
+    id                    INTEGER         PRIMARY KEY AUTO_INCREMENT,
+    telegram_id           BIGINT          NOT NULL REFERENCES users(telegram_id),
+    status                VARCHAR(32)     NOT NULL,
+    audio_s3_path         TEXT            NOT NULL,
+    duration_seconds      INTEGER,
+    price_rub             DECIMAL(10,2),
+    result_s3_path        TEXT,
+    result_json           TEXT,
+    llm_tokens_by_model   JSON,
+    operation_id          VARCHAR(128),
+    message_id            INTEGER,
+    chat_id               BIGINT,
+    started_at            TIMESTAMP,
+    finished_at           TIMESTAMP
 );
 
 -- Index to speed up lookups by user
@@ -263,9 +264,9 @@ In that case install libmysqlclient-dev: `sudo apt install libmysqlclient-dev` o
 
 ## References
 
-- [Yandex Cloud SpeechKit docs][2]  
-- [Telegram Bot API][3]  
+- [Yandex Cloud SpeechKit docs][2]
+- [Telegram Bot API][3]
 
 [1]: https://t.me/ClearTranscriptBot
 [2]: https://cloud.yandex.com/docs/speechkit/
-[3]: https://core.telegram.org/bots/api
+[3]: https://core.telegram.org/bots/api
diff --git a/database/models.py b/database/models.py
@@ -5,6 +5,7 @@
     DateTime,
     ForeignKey,
     Integer,
+    JSON,
     Numeric,
     String,
     Text,
@@ -59,6 +60,9 @@ class TranscriptionHistory(Base):
     # Raw recognition result returned by SpeechKit
     result_json = Column(Text, nullable=True)
 
+    # Token counts for transcribed text by model
+    llm_tokens_by_model = Column(JSON, nullable=True)
+
     # Duration of the audio in seconds
     duration_seconds = Column(Integer, nullable=True)
 

diff --git a/requirements.txt b/requirements.txt
@@ -14,6 +14,7 @@ boto3
 pytz
 requests
 httpx
+tiktoken
 
 # Monitoring / error reporting
 sentry_sdk
diff --git a/schedulers/transcription.py b/schedulers/transcription.py
@@ -13,6 +13,7 @@
 from utils.speechkit import fetch_transcription_result, parse_text, format_duration
 from utils.tg import safe_edit_message_text
 from utils.s3 import upload_file
+from utils.tokens import tokens_by_model
 
 
 EDIT_INTERVAL_SEC = 5  # не редактировать чаще, чем раз в 5 сек
@@ -85,8 +86,10 @@ async def check_running_tasks(context: ContextTypes.DEFAULT_TYPE) -> None:
             )
             continue
 
-        text = parse_text(result)
-        if not text.strip():
+        text = parse_text(result).strip()
+        token_counts = tokens_by_model(text)
+
+        if not text:
             text = "(речь в записи отсутствует или слишком неразборчива для распознавания)"
 
         source_stem = Path(task.audio_s3_path).stem
@@ -117,13 +120,23 @@ async def check_running_tasks(context: ContextTypes.DEFAULT_TYPE) -> None:
 
         try:
             await context.bot.send_document(chat_id=task.telegram_id, document=path.open("rb"))
-            update_transcription(task.id, status="completed", result_s3_path=s3_uri)
+            update_transcription(
+                task.id,
+                status="completed",
+                result_s3_path=s3_uri,
+                llm_tokens_by_model=token_counts,
+            )
         except Exception as e:
             logging.error(f"Failed to send result for task {task.id}: {e}")
             if os.getenv("ENABLE_SENTRY") == "1":
                 sentry_sdk.capture_exception(e)
 
-            update_transcription(task.id, status="failed", result_s3_path=s3_uri)
+            update_transcription(
+                task.id,
+                status="failed",
+                result_s3_path=s3_uri,
+                llm_tokens_by_model=token_counts,
+            )
 
             await safe_edit_message_text(
                 context.bot,

diff --git a/utils/tokens.py b/utils/tokens.py
@@ -0,0 +1,56 @@
+"""Utilities for counting LLM tokens."""
+from typing import Optional
+
+import tiktoken
+
+
+# Models whose token counts are reported by tokens_by_model().
+LLM_TOKEN_MODELS = [
+    "gpt-5.2",
+    "gpt-5.1",
+    "gpt-5-mini",
+    "gpt-5-nano",
+]
+
+# Encoding used when tiktoken has no mapping for a requested model name.
+DEFAULT_ENCODING = "o200k_base"
+
+
+def _resolve_encoding(model: str) -> Optional[tiktoken.Encoding]:
+    """Return the encoding for *model*, or DEFAULT_ENCODING if it is unmapped."""
+    try:
+        return tiktoken.encoding_for_model(model)
+    except KeyError:
+        # tiktoken has no mapping for this model name; use the default.
+        try:
+            return tiktoken.get_encoding(DEFAULT_ENCODING)
+        except ValueError:
+            # The installed tiktoken does not know DEFAULT_ENCODING either.
+            return None
+
+
+def count_tokens(text: str, model: str = LLM_TOKEN_MODELS[0]) -> Optional[int]:
+    """Count tokens in *text* using tiktoken encoding for *model*.
+
+    Falls back to DEFAULT_ENCODING when tiktoken has no mapping for *model*.
+    None is returned only when neither encoding can be resolved.
+    """
+    encoding = _resolve_encoding(model)
+    if encoding is None:
+        return None
+    # disallowed_special=() counts text that looks like a special token
+    # (e.g. "<|endoftext|>") as ordinary text instead of raising ValueError.
+    return len(encoding.encode(text, disallowed_special=()))
+
+
+def tokens_by_model(text: str) -> dict[str, Optional[int]]:
+    """Return token counts for *text* across supported models.
+
+    Empty or whitespace-only text maps every model to 0 without tokenizing.
+    """
+    if not text.strip():
+        return {model: 0 for model in LLM_TOKEN_MODELS}
+    return {
+        model: count_tokens(text, model=model)
+        for model in LLM_TOKEN_MODELS
+    }