-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmd_to_speech.py
More file actions
209 lines (167 loc) · 6.95 KB
/
md_to_speech.py
File metadata and controls
209 lines (167 loc) · 6.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env python3
import re
import sys
import os
from kokoro_onnx import Kokoro
import soundfile as sf
import numpy as np
def clean_markdown(text):
"""
Clean markdown text to make it suitable for TTS.
Removes problematic characters and formatting.
"""
# Remove FastAPI-specific embedded code references
text = re.sub(r'\{\*[^*]*\*\}', '', text)
# Remove markdown headers (# ## ###, etc.) but keep the text
text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
# Remove code blocks (```...```)
text = re.sub(r'```[\s\S]*?```', '', text)
# Remove inline code backticks but keep the content (`...` -> ...)
text = re.sub(r'`([^`]*)`', r'\1', text)
# Remove HTML-like tags and abbreviations
text = re.sub(r'<[^>]*>', '', text)
# Remove markdown links but keep the text [text](url) -> text
text = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', text)
# Remove markdown emphasis (* ** _ __)
text = re.sub(r'[*_]{1,2}([^*_]*)[*_]{1,2}', r'\1', text)
# Replace problematic characters
text = text.replace('\\', ' ')
text = text.replace('/', ' ')
text = text.replace('|', ' ')
text = text.replace('+', ' plus ')
text = text.replace('&', ' and ')
# Remove comment blocks (/// ... ///)
text = re.sub(r'///[\s\S]*?///', '', text)
# Remove markdown tables (lines with |)
lines = text.split('\n')
filtered_lines = []
for line in lines:
if '|' not in line or not line.strip().startswith('|'):
filtered_lines.append(line)
text = '\n'.join(filtered_lines)
# Remove excessive whitespace and empty lines
text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
text = re.sub(r'\s+', ' ', text)
# Remove remaining special characters that might cause issues
text = re.sub(r'[{}()\[\]<>]', '', text)
return text.strip()
def chunk_text(text, max_length=1000):
"""Split text into smaller chunks for TTS processing."""
sentences = text.replace('\n', ' ').split('.')
chunks = []
current_chunk = []
current_length = 0
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
sentence_length = len(sentence)
if current_length + sentence_length > max_length and current_chunk:
chunks.append('. '.join(current_chunk) + '.')
current_chunk = [sentence]
current_length = sentence_length
else:
current_chunk.append(sentence)
current_length += sentence_length
if current_chunk:
chunks.append('. '.join(current_chunk) + '.')
return chunks
def markdown_to_speech(file_path, output_path=None, voice='af_heart', lang_code='a'):
"""
Convert a markdown file to speech.
Args:
file_path: Path to the markdown file
output_path: Output audio file path (optional)
voice: Voice to use for TTS
lang_code: Language code for TTS (not used in kokoro-onnx)
"""
# Read the markdown file
try:
with open(file_path, 'r', encoding='utf-8') as f:
text = f.read()
except FileNotFoundError:
print(f"Error: File '{file_path}' not found.")
return False
except Exception as e:
print(f"Error reading file: {e}")
return False
# Clean the markdown
cleaned_text = clean_markdown(text)
if not cleaned_text.strip():
print("Warning: No text content found after cleaning.")
return False
print(f"Original text length: {len(text)} characters")
print(f"Cleaned text length: {len(cleaned_text)} characters")
print("\nFirst 200 characters of cleaned text:")
print(cleaned_text[:200] + "..." if len(cleaned_text) > 200 else cleaned_text)
print("\n" + "="*50)
# Initialize the TTS model
try:
# Look for model files in kokoro-tts directory
script_dir = os.path.dirname(os.path.abspath(__file__))
kokoro_dir = os.path.join(script_dir, 'kokoro-tts')
model_path = os.path.join(kokoro_dir, 'kokoro-v1.0.onnx')
voices_path = os.path.join(kokoro_dir, 'voices-v1.0.bin')
if not os.path.exists(model_path):
print(f"Error: Model file not found at {model_path}")
return False
if not os.path.exists(voices_path):
print(f"Error: Voices file not found at {voices_path}")
return False
kokoro = Kokoro(model_path, voices_path)
print(f"Initialized Kokoro TTS with voice: {voice}")
except Exception as e:
print(f"Error initializing TTS model: {e}")
return False
# Generate speech
try:
# Split text into chunks for better processing
chunks = chunk_text(cleaned_text, max_length=800)
print(f"Processing {len(chunks)} text chunks...")
audio_chunks = []
for i, chunk in enumerate(chunks):
print(f"Processing chunk {i+1}/{len(chunks)}")
try:
# Generate audio for this chunk
audio = kokoro.create(chunk, voice=voice, speed=1.0)
audio_chunks.append(audio)
except Exception as e:
print(f"Error processing chunk {i+1}: {e}")
continue
if not audio_chunks:
print("No audio chunks generated")
return False
# Concatenate all audio chunks
full_audio = np.concatenate(audio_chunks, axis=0)
# Determine output path
if output_path is None or output_path.strip() == "":
base_name = os.path.splitext(os.path.basename(file_path))[0]
output_path = f"{base_name}_speech.wav"
# Save the audio file
sf.write(output_path, full_audio, 24000)
print(f"\nGenerated audio file: {output_path}")
print(f"Total chunks processed: {len(audio_chunks)}")
print(f"Audio duration: {len(full_audio) / 24000:.2f} seconds")
return True
except Exception as e:
print(f"Error during TTS generation: {e}")
return False
def main():
if len(sys.argv) < 2:
print("Usage: python md_to_speech.py <markdown_file> [output_file] [voice] [lang_code]")
print("Example: python md_to_speech.py docs/tutorial.md tutorial_speech.wav af_heart a")
sys.exit(1)
file_path = sys.argv[1]
output_path = sys.argv[2] if len(sys.argv) > 2 and sys.argv[2].strip() else None
voice = sys.argv[3] if len(sys.argv) > 3 else 'af_heart'
lang_code = sys.argv[4] if len(sys.argv) > 4 else 'a'
print(f"Converting '{file_path}' to speech...")
print(f"Voice: {voice}, Language: {lang_code}")
success = markdown_to_speech(file_path, output_path, voice, lang_code)
if success:
print("Conversion completed successfully!")
else:
print("Conversion failed!")
sys.exit(1)
if __name__ == "__main__":
main()