-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathclean_vtt.py
More file actions
43 lines (37 loc) · 1.22 KB
/
clean_vtt.py
File metadata and controls
43 lines (37 loc) · 1.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# pip install webvtt-py
import argparse
import io
import json
import os
import shelve
import time
from pathlib import Path
import tiktoken
import webvtt
from openai import APITimeoutError, OpenAI
from rows.utils import CsvLazyDictWriter
from tqdm import tqdm
def vtt_clean(vtt_content, same_line=False):
    """Return the caption text of a WebVTT document with repeats removed.

    The content is parsed with ``webvtt.read_buffer``. Every caption's text
    is split into lines; blank lines and lines identical to the previously
    kept line are dropped. The "previous line" carries over from one caption
    to the next, so a line repeated across consecutive captions is emitted
    only once.

    Args:
        vtt_content: The full text of a WebVTT document.
        same_line: When false (the default), each kept line is emitted as
            ``"<start> <line>\\n"`` where ``<start>`` is the caption start
            time cut at the first ``"."``. When true, each kept line is
            emitted followed by a single space and no timestamp.

    Returns:
        The concatenation of all emitted pieces as one string.
    """
    pieces = []
    previous = None
    captions = webvtt.read_buffer(io.StringIO(vtt_content))
    for caption in captions:
        for raw in caption.text.strip().splitlines():
            text = raw.strip()
            if text and text != previous:
                if same_line:
                    pieces.append(f"{text} ")
                else:
                    # Keep only the part of the start time before the first ".".
                    timestamp = str(caption.start).split(".")[0]
                    pieces.append(f"{timestamp} {text}\n")
                previous = text
    return "".join(pieces)
def main(argv=None):
    """Clean every ``*.vtt`` file in one directory into another directory.

    For each ``*.vtt`` file directly inside ``input_path``, the file is read,
    passed through ``vtt_clean`` and the result is written to a file of the
    same name inside ``output_path``. Files whose output already exists are
    skipped, so an interrupted run can be resumed.

    Args:
        argv: Command-line arguments (``input_path output_path``). When
            ``None``, ``argparse`` falls back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_path", type=Path)
    parser.add_argument("output_path", type=Path)
    args = parser.parse_args(argv)

    # Without this, the first write fails when the output directory is missing.
    args.output_path.mkdir(parents=True, exist_ok=True)

    # A sorted list gives a deterministic processing order and lets tqdm
    # display a total, which it cannot do for a bare generator.
    for filename in tqdm(sorted(args.input_path.glob("*.vtt"))):
        new_filename = args.output_path / filename.name
        if new_filename.exists():
            continue
        # WebVTT files are UTF-8; "utf-8-sig" also accepts an optional BOM.
        # Relying on the platform default encoding can mis-decode the file.
        data = filename.read_text(encoding="utf-8-sig")
        result = vtt_clean(data)
        new_filename.write_text(result, encoding="utf-8")


# Guard the entry point so importing this module (e.g. to reuse vtt_clean)
# does not parse command-line arguments or touch the filesystem.
if __name__ == "__main__":
    main()