-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspeech_to_text.py
More file actions
102 lines (91 loc) · 3.39 KB
/
speech_to_text.py
File metadata and controls
102 lines (91 loc) · 3.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
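# speech_to_text.py (Deluxe)
# Speech-to-text with a backend fallback chain: faster-whisper (preferred),
# then openai-whisper, then SpeechRecognition (PocketSphinx first when it is
# installed, otherwise / on failure the Google recognizer).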
from __future__ import annotations
import os, tempfile, subprocess
from pathlib import Path
from typing import List, Dict, Optional
# Optional STT backends are probed once, at import time. Each probe sets a
# module-level flag and, on success, leaves the imported name (WhisperModel,
# whisper, sr) bound for the transcription functions to use.
# `except Exception` is kept deliberately broad -- presumably so that a broken
# installation raising something other than ImportError is still treated as
# "backend not available" instead of breaking the import of this module.

# try faster-whisper
try:
    from faster_whisper import WhisperModel
except Exception:
    _HAS_FAST_WHISPER = False
else:
    _HAS_FAST_WHISPER = True

# try openai/whisper
try:
    import whisper
except Exception:
    _HAS_WHISPER = False
else:
    _HAS_WHISPER = True

# speech_recognition fallback
try:
    import speech_recognition as sr
except Exception:
    _HAS_SR = False
else:
    _HAS_SR = True

# pocketsphinx availability (only probed when speech_recognition imported,
# because it is only ever used through the speech_recognition recognizer)
_HAS_POCKETS = False
if _HAS_SR:
    try:
        import pocketsphinx  # type: ignore
    except Exception:
        _HAS_POCKETS = False
    else:
        _HAS_POCKETS = True
def _ensure_wav(in_path: str, out_wav: str, ffmpeg_bin: str = "ffmpeg") -> None:
    """Convert ``in_path`` to a WAV file at ``out_wav`` by running ffmpeg.

    The command resamples to 16000 Hz (``-ar 16000``), downmixes to one
    channel (``-ac 1``), drops any video stream (``-vn``) and overwrites an
    existing output file (``-y``).

    Args:
        in_path: Source media file.
        out_wav: Destination path of the WAV file.
        ffmpeg_bin: Name or path of the ffmpeg executable.

    Raises:
        RuntimeError: If ffmpeg cannot be started or exits with a non-zero
            status; ffmpeg's stderr is included in the message.
    """
    cmd = [ffmpeg_bin, "-y", "-i", str(in_path), "-ar", "16000", "-ac", "1", "-vn", "-f", "wav", str(out_wav)]
    try:
        proc = subprocess.run(
            cmd,
            # Do not let ffmpeg read (and swallow) the parent's stdin.
            stdin=subprocess.DEVNULL,
            # The result goes to out_wav; stdout is never inspected.
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
            # ffmpeg may echo bytes that are not valid in the locale encoding
            # (e.g. file names); never let decoding mask the real failure.
            errors="replace",
        )
    except OSError as exc:
        # Missing / non-executable binary: report it as a conversion failure
        # so it cannot be mistaken for a missing *input* file.
        raise RuntimeError(f"ffmpeg convert failed: could not run {ffmpeg_bin!r}: {exc}") from exc
    if proc.returncode != 0:
        raise RuntimeError("ffmpeg convert failed: " + proc.stderr)
def transcribe_file(path: str, model: str = "small", device: str = "cpu", ffmpeg_bin: str = "ffmpeg") -> str:
    """Transcribe one media file to text with the best available backend.

    Backends are tried in this order: faster-whisper, openai-whisper, then
    SpeechRecognition (PocketSphinx first when installed, then the Google
    recognizer). When a Whisper backend raises, the error is printed and the
    next backend is tried.

    NOTE: the Whisper model is loaded on every call; callers transcribing
    many files pay that cost once per file.
    NOTE(review): ``recognize_google`` is presumably an online recognizer, in
    which case the audio leaves the machine -- confirm this is acceptable.

    Args:
        path: Media file to transcribe (``str`` or path-like).
        model: Model name/size handed to the Whisper backends.
        device: Device handed to the Whisper backends.
        ffmpeg_bin: ffmpeg executable used to convert the input to WAV for
            the SpeechRecognition fallback.

    Returns:
        The transcript with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        RuntimeError: If no backend is installed, if every installed backend
            failed, or if the WAV conversion / Google recognition failed.
    """
    src = Path(path)
    if not src.exists():
        # f-string (not "+") so a Path argument does not raise TypeError here.
        raise FileNotFoundError(f"File not found: {path}")

    # Errors of backends that were tried, so the final message is truthful.
    failures = []

    # faster-whisper branch
    if _HAS_FAST_WHISPER:
        try:
            fw_model = WhisperModel(model, device=device, compute_type="float32")
            segments, _info = fw_model.transcribe(str(src))
            # Consume the segments inside the try so that errors raised while
            # iterating also trigger the fallback. Segment text normally
            # carries its own leading space (per faster-whisper output), so
            # strip each piece before joining to avoid doubled spaces.
            pieces = (seg.text.strip() for seg in segments)
            return " ".join(piece for piece in pieces if piece)
        except Exception as e:
            print(f"[stt] faster-whisper error, falling back: {e}")
            failures.append(f"faster-whisper: {e}")

    # openai-whisper branch
    if _HAS_WHISPER:
        try:
            # Honour the caller's device; it used to be silently ignored here.
            ow_model = whisper.load_model(model, device=device)
            res = ow_model.transcribe(str(src))
            return res.get("text", "").strip()
        except Exception as e:
            print(f"[stt] whisper error, falling back: {e}")
            failures.append(f"whisper: {e}")

    # speech_recognition fallback
    if _HAS_SR:
        recognizer = sr.Recognizer()
        # Reserve a temp file name; the file is closed here so ffmpeg can
        # overwrite it, and it is removed in the finally below.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_wav = tmp.name
        try:
            _ensure_wav(str(src), tmp_wav, ffmpeg_bin=ffmpeg_bin)
            with sr.AudioFile(tmp_wav) as af:
                audio = recognizer.record(af)
            if _HAS_POCKETS:
                try:
                    return recognizer.recognize_sphinx(audio).strip()
                except Exception as e:
                    print(f"[stt] pocketsphinx error: {e}")
            try:
                return recognizer.recognize_google(audio).strip()
            except Exception as e:
                raise RuntimeError("SpeechRecognition (Google) failed: " + str(e)) from e
        finally:
            # Best-effort cleanup; a failed delete must not hide the result.
            try:
                os.unlink(tmp_wav)
            except OSError:
                pass

    if failures:
        # Backends were installed but all of them raised: say so instead of
        # claiming nothing is installed.
        raise RuntimeError("All installed STT backends failed: " + "; ".join(failures))
    raise RuntimeError("No STT backend available. Install faster-whisper/whisper or speech_recognition (+pocketsphinx).")
def transcribe_batch(paths: List[str], model: str = "small", device: str = "cpu", ffmpeg_bin: str = "ffmpeg") -> Dict[str, str]:
    """Transcribe several files, never raising for an individual failure.

    Args:
        paths: Files to transcribe. A path listed more than once is only
            transcribed the first time it is seen.
        model: Forwarded to ``transcribe_file``.
        device: Forwarded to ``transcribe_file``.
        ffmpeg_bin: Forwarded to ``transcribe_file``.

    Returns:
        A dict mapping each path to its transcript. When transcription of a
        path raises, the value is the string ``"ERROR: <message>"`` instead,
        so callers must check for that prefix to detect failures.
    """
    results: Dict[str, str] = {}
    for p in paths:
        if p in results:
            # Already handled; do not run the (expensive) backend again.
            continue
        try:
            results[p] = transcribe_file(p, model=model, device=device, ffmpeg_bin=ffmpeg_bin)
        except Exception as e:
            # Per-file failures are recorded, not propagated, so one bad
            # input does not abort the rest of the batch.
            results[p] = f"ERROR: {e}"
    return results