diff --git a/README.md b/README.md index a8c0df7..fc45759 100644 --- a/README.md +++ b/README.md @@ -150,19 +150,20 @@ CREATE TABLE IF NOT EXISTS users ( -- History of transcription requests made by users CREATE TABLE IF NOT EXISTS transcription_history ( - id INTEGER PRIMARY KEY AUTO_INCREMENT, - telegram_id BIGINT NOT NULL REFERENCES users(telegram_id), - status VARCHAR(32) NOT NULL, - audio_s3_path TEXT NOT NULL, - duration_seconds INTEGER, - price_rub DECIMAL(10,2), - result_s3_path TEXT, - result_json TEXT, - operation_id VARCHAR(128), - message_id INTEGER, - chat_id BIGINT, - started_at TIMESTAMP, - finished_at TIMESTAMP + id INTEGER PRIMARY KEY AUTO_INCREMENT, + telegram_id BIGINT NOT NULL REFERENCES users(telegram_id), + status VARCHAR(32) NOT NULL, + audio_s3_path TEXT NOT NULL, + duration_seconds INTEGER, + price_rub DECIMAL(10,2), + result_s3_path TEXT, + result_json TEXT, + llm_tokens_by_model JSON, + operation_id VARCHAR(128), + message_id INTEGER, + chat_id BIGINT, + started_at TIMESTAMP, + finished_at TIMESTAMP ); -- Index to speed up lookups by user @@ -263,9 +264,9 @@ In that case install libmysqlclient-dev: `sudo apt install libmysqlclient-dev` o ## References -- [Yandex Cloud SpeechKit docs][2] -- [Telegram Bot API][3] +- [Yandex Cloud SpeechKit docs][2] +- [Telegram Bot API][3] [1]: https://t.me/ClearTranscriptBot [2]: https://cloud.yandex.com/docs/speechkit/ -[3]: https://core.telegram.org/bots/api \ No newline at end of file +[3]: https://core.telegram.org/bots/api diff --git a/database/models.py b/database/models.py index 1f7a2c7..67c0911 100644 --- a/database/models.py +++ b/database/models.py @@ -5,6 +5,7 @@ DateTime, ForeignKey, Integer, + JSON, Numeric, String, Text, @@ -59,6 +60,9 @@ class TranscriptionHistory(Base): # Raw recognition result returned by SpeechKit result_json = Column(Text, nullable=True) + # Token counts for transcribed text by model + llm_tokens_by_model = Column(JSON, nullable=True) + # Duration of the audio in seconds 
duration_seconds = Column(Integer, nullable=True) diff --git a/requirements.txt b/requirements.txt index 100ebf0..bd33c81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ boto3 pytz requests httpx +tiktoken # Monitoring / error reporting sentry_sdk diff --git a/schedulers/transcription.py b/schedulers/transcription.py index d41ab6f..03ad705 100644 --- a/schedulers/transcription.py +++ b/schedulers/transcription.py @@ -13,6 +13,7 @@ from utils.speechkit import fetch_transcription_result, parse_text, format_duration from utils.tg import safe_edit_message_text from utils.s3 import upload_file +from utils.tokens import tokens_by_model EDIT_INTERVAL_SEC = 5 # не редактировать чаще, чем раз в 5 сек @@ -85,8 +86,10 @@ async def check_running_tasks(context: ContextTypes.DEFAULT_TYPE) -> None: ) continue - text = parse_text(result) - if not text.strip(): + text = parse_text(result).strip() + token_counts = tokens_by_model(text) + + if not text: text = "(речь в записи отсутствует или слишком неразборчива для распознавания)" source_stem = Path(task.audio_s3_path).stem @@ -117,13 +120,23 @@ async def check_running_tasks(context: ContextTypes.DEFAULT_TYPE) -> None: try: await context.bot.send_document(chat_id=task.telegram_id, document=path.open("rb")) - update_transcription(task.id, status="completed", result_s3_path=s3_uri) + update_transcription( + task.id, + status="completed", + result_s3_path=s3_uri, + llm_tokens_by_model=token_counts, + ) except Exception as e: logging.error(f"Failed to send result for task {task.id}: {e}") if os.getenv("ENABLE_SENTRY") == "1": sentry_sdk.capture_exception(e) - update_transcription(task.id, status="failed", result_s3_path=s3_uri) + update_transcription( + task.id, + status="failed", + result_s3_path=s3_uri, + llm_tokens_by_model=token_counts, + ) await safe_edit_message_text( context.bot, diff --git a/utils/tokens.py b/utils/tokens.py new file mode 100644 index 0000000..f2b6043 --- /dev/null +++ b/utils/tokens.py @@ 
-0,0 +1,40 @@
+"""Utilities for counting LLM tokens."""
+from typing import Optional
+
+import tiktoken
+
+# Models whose token counts we report; the first entry is the default.
+LLM_TOKEN_MODELS = [
+    "gpt-5.2",
+    "gpt-5.1",
+    "gpt-5-mini",
+    "gpt-5-nano",
+]
+
+# Fallback encoding for model names unknown to the installed tiktoken.
+DEFAULT_ENCODING = "o200k_base"
+
+
+def count_tokens(text: str, model: str = LLM_TOKEN_MODELS[0]) -> Optional[int]:
+    """Count tokens in *text* using the tiktoken encoding for *model*.
+
+    Unknown model names fall back to DEFAULT_ENCODING; ``None`` is
+    returned only when no usable encoding can be resolved at all.
+    """
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        # tiktoken's model registry lags behind new model releases;
+        # use the declared default encoding instead of giving up.
+        try:
+            encoding = tiktoken.get_encoding(DEFAULT_ENCODING)
+        except (KeyError, ValueError):
+            return None
+    return len(encoding.encode(text))
+
+
+def tokens_by_model(text: str) -> dict[str, Optional[int]]:
+    """Return token counts for *text* across all supported models."""
+    if not text.strip():
+        # Short-circuit: empty/whitespace-only text has zero tokens.
+        return {model: 0 for model in LLM_TOKEN_MODELS}
+    return {model: count_tokens(text, model=model) for model in LLM_TOKEN_MODELS}