-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGetYTSubs.py
More file actions
26 lines (21 loc) · 827 Bytes
/
GetYTSubs.py
File metadata and controls
26 lines (21 loc) · 827 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
"""Flatten a caption JSON file into one paragraph of plain text.

The input is expected to be a JSON object with an ``events`` list; each
event may carry a ``segs`` list whose items may carry a ``utf8`` string.
All non-blank ``utf8`` strings are joined with single spaces and written
to a text file.
"""

import json
import re

# Whitespace that directly precedes a punctuation mark; joining tokens with
# spaces otherwise produces "word ," instead of "word,".
_SPACE_BEFORE_PUNCT = re.compile(r'\s+([.,!?;:])')


def extract_text(data):
    """Return every ``utf8`` segment in *data* as one continuous paragraph.

    Args:
        data: Parsed JSON object. Events are read from ``data['events']``
            (missing key means no events); segments from each event's
            ``segs`` list (missing or null means no segments).

    Returns:
        The stripped, non-empty ``utf8`` values joined by single spaces,
        with whitespace before ``. , ! ? ; :`` removed. Empty string when
        there is nothing to extract.
    """
    words = []
    for event in data.get('events', []):
        # `or ()` also tolerates an explicit null "segs" value.
        for seg in event.get('segs') or ():
            if 'utf8' not in seg:
                continue
            token = seg['utf8'].strip()
            if token:  # skip segments that are only whitespace/newlines
                words.append(token)

    full_text = ' '.join(words)
    return _SPACE_BEFORE_PUNCT.sub(r'\1', full_text)


def main(input_path='fx.txt', output_path='extracted_text.txt'):
    """Read caption JSON from *input_path* and write its text to *output_path*.

    Both files are handled as UTF-8. The defaults match the file names the
    script has always used.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(extract_text(data))


if __name__ == '__main__':
    main()