-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdraft_srt.py
More file actions
216 lines (175 loc) · 6.94 KB
/
draft_srt.py
File metadata and controls
216 lines (175 loc) · 6.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
import re
from collections import namedtuple
from datetime import timedelta
from typing import List, Generator
import srt
Srt = srt.Subtitle
TIME_OFFSET_INDEX=0
EN_INDEX=1
ZH_INDEX=2
PART_SRT_SEP_REGEX=re.compile(r'\s?(?<!\\)\|\s?')
COMMA_REGEX = re.compile(r'\s?(?<!\\),\s?') # 匹配 , ,不包含 '\,'
FIX_EN_SRT_REGEX = re.compile(r'(?<!\\)\{(.*?)(?<!\\)\}') # 匹配大括号内的内容,不包含 '\{' '\}'
FIX_SPACE_REGEX = re.compile(r'( +)(?=[ ])|( +)(?=[,.!?] )|( +)(?=[,.!?]\Z)') # 匹配多个空格或者断句标点之前的空格
FIX_COMMA_REGEX = re.compile(r'\\,') # 匹配 '\,'
FIX_BACKSLASH_REGEX = re.compile(r'(\\){2}') # 匹配 '\\'
TIME_OFFSET_REGEX = re.compile(r'^\d+,\d+$')
TimeOffset = namedtuple('TimeOffset', ['start', 'end'])
class DraftSrt(Srt):
@classmethod
def from_srt(cls, srt):
obj = cls(srt.index, srt.start, srt.end, srt.content, proprietary=srt.proprietary)
if not obj.check_time_offset():
raise ValueError(obj.to_srt())
if not obj.check_zh_text():
raise ValueError(obj.to_srt())
return obj
@property
def time_offset(self) -> List[str]:
time_offset_string_list = self.get_content_lines()[TIME_OFFSET_INDEX].split(' ')
return [self.to_time_offset(item) for item in time_offset_string_list]
@staticmethod
def to_time_offset(time_offset_string: str) -> TimeOffset:
if not TIME_OFFSET_REGEX.match(time_offset_string):
raise TimeOffsetParseException('TimeOffset parse error, string is {}'.format((time_offset_string)))
tmp = [int(_) for _ in time_offset_string.split(',')]
return TimeOffset(*tmp)
@property
def en_text(self) -> str or None:
if not self.has_en_text():
return None
return self.get_content_lines()[EN_INDEX]
@en_text.setter
def en_text(self, text: str):
content_lines = self.get_content_lines()
content_lines[EN_INDEX] = text
self.content = '\n'.join(content_lines)
@property
def zh_text(self) -> str or None:
if not self.has_zh_text():
return None
return self.get_content_lines()[ZH_INDEX]
@zh_text.setter
def zh_text(self, text: str):
content_lines = self.get_content_lines()
content_lines[ZH_INDEX] = text
self.content = '\n'.join(content_lines)
def get_content_lines(self) -> List[str]:
return self.content.split('\n')
def has_en_text(self) -> bool:
return len(self.get_content_lines()) >= EN_INDEX + 1
def has_zh_text(self) -> bool:
return len(self.get_content_lines()) == ZH_INDEX + 1
def check_time_offset(self) -> bool:
if not self.has_en_text():
return True
return len(self.time_offset) == len(self.en_text.split())
def check_zh_text(self) -> bool:
# 不包含英文字幕
if not self.has_en_text():
return True
# 包含英文字幕,不包含中文字幕
if not self.has_zh_text():
return True
# 既包含英文字幕,又包含中文字幕
return len(PART_SRT_SEP_REGEX.split(self.en_text)) == len(PART_SRT_SEP_REGEX.split(self.zh_text))
def time_offset_to_timedelta(self, t):
return self.start + timedelta(milliseconds=t)
def get_part_srts(self) -> List[Srt or None]:
"""
abc d| efg|
abc d
efg
None
"""
if not self.has_en_text():
return []
part_srt_en_texts = PART_SRT_SEP_REGEX.split(self.en_text)
zh_text = self.zh_text
if zh_text is None:
zh_text = ''
part_srt_zh_texts = PART_SRT_SEP_REGEX.split(zh_text)
res = []
part_srt_start_index = 0
for i in range(len(part_srt_en_texts)):
item = part_srt_en_texts[i]
length = len(item.split())
if item != '':
if self.has_zh_text():
content = '\n'.join([part_srt_en_texts[i], part_srt_zh_texts[i]])
else:
content = '\n'.join([part_srt_en_texts[i]])
res.append(
Srt(index=0,
start=self.time_offset_to_timedelta(self.time_offset[part_srt_start_index].start),
end=self.time_offset_to_timedelta(self.time_offset[part_srt_start_index+length-1].end),
content=content)
)
part_srt_start_index += length
else:
res.append(None)
return res
@staticmethod
def to_final_srts(draft_srts: List['DraftSrt'], langs: List[str] = ['en', 'zh']) -> Generator[Srt, None, None]:
tmp = []
for draft_srt in draft_srts:
part_srts = draft_srt.get_part_srts()
length = len(part_srts)
for i in range(length):
tmp.append(part_srts[i])
if i != length - 1:
yield __class__.merge_part_srts(tmp, langs)
tmp = []
yield __class__.merge_part_srts(tmp, langs)
@staticmethod
def merge_part_srts(part_srts: List[Srt], langs: List[str]=['en', 'zh']) -> Srt:
part_srts = [_ for _ in part_srts if _ is not None] # 非空
if not part_srts:
raise ValueError('some final srt is blank')
length = len(part_srts)
start = part_srts[0].start
end = part_srts[length - 1].end
if 'en' in langs:
en_text = ' '.join([_.content.split('\n')[0] for _ in part_srts])
en_text = __class__.fix_en_text(en_text)
if 'zh' in langs:
zh_text = ''.join([_.content.split('\n')[1] for _ in part_srts if len(_.content.split('\n')) > 1])
l = locals()
text = '\n'.join([
l['{}_text'.format(lang)]
for lang in langs
])
return Srt(
index=0,
start=start,
end=end,
content=text
)
@staticmethod
def fix_en_text(text: str) -> str:
"""
处理英文字幕中的 {word1,word2} {word - - -} {}
"""
def f(m):
tmp = m.group(1)
# '{}' -> ''
# '{word1,word2}' -> 'word1 word2'
# '{word - -}' -> 'word'
# '{ - -}' -> ''
# '{word1,word2 - -}' -> 'word1 word2'
# '{word1\,word2 - -}' -> 'word1,word2'
# '{word1\,,word2 - -}' -> 'word1, word2'
res = ' '.join(
re.split(
COMMA_REGEX,
re.split(' ', tmp)[0]
)
)
return res
text1 = FIX_EN_SRT_REGEX.sub(f, text) # 处理大括号内容
text2 = FIX_SPACE_REGEX.sub('', text1).strip() # 处理不合适的空格
text3 = FIX_COMMA_REGEX.sub(',', text2)
text4 = FIX_BACKSLASH_REGEX.sub(r'\\', text3)
return text4
class TimeOffsetParseException(Exception):
pass