-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsegment.py
More file actions
71 lines (60 loc) · 2.24 KB
/
segment.py
File metadata and controls
71 lines (60 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"Copied from jun wei script"
import jieba
import argparse
import os
from multiprocessing import Pool, cpu_count
# Absolute directory that every resource path below is joined onto.
BASE_DIR = '/home/b06901141/video-subtitle-generator--b06901141/'

# Stopword entries: one per line of the file, with only newline characters
# stripped from the ends (other whitespace is kept as part of the entry).
with open(os.path.join(BASE_DIR, 'material/jieba_dict/stopwords.txt'), 'r', encoding='utf-8') as stopword_file:
    stopword_set = {line.strip('\n') for line in stopword_file}

# Punctuation entries, loaded the same way; the segmentation functions drop
# any token that appears in this set.
with open(os.path.join(BASE_DIR, 'material/jieba_dict/punctuations.txt'), 'r', encoding='utf-8') as punctuation_file:
    punctuation_set = {line.strip('\n') for line in punctuation_file}
def cut(sentence):
    """Segment one line of text into upper-cased, space-separated tokens.

    Newline characters are stripped from both ends of ``sentence`` and the
    text is upper-cased before it is handed to ``jieba.cut`` (with
    ``cut_all=False``). Space characters inside each token are removed,
    tokens listed in ``punctuation_set`` are discarded, and the remaining
    tokens are joined with a space.

    Returns:
        The joined tokens followed by a single ``'\\n'``.
    """
    normalized = sentence.strip('\n').upper()
    compacted = (
        token.replace(" ", "")
        for token in jieba.cut(normalized, cut_all=False)
    )
    # NOTE(review): a token made only of spaces compacts to "" and is still
    # kept unless "" itself is in punctuation_set, so the result can contain
    # consecutive spaces.
    kept = [token for token in compacted if token not in punctuation_set]
    return " ".join(kept) + '\n'
def cutHungyi(sentence):
    """Segment one line of text into tokens separated by single spaces.

    Every newline character is removed from ``sentence`` (case is left
    untouched) before it is handed to ``jieba.cut`` (with ``cut_all=False``).
    Tokens listed in ``punctuation_set`` are discarded and all remaining
    whitespace is collapsed, so the result never has leading, trailing or
    repeated spaces.

    Returns:
        The joined tokens followed by a single ``'\\n'``.
    """
    flattened = sentence.replace('\n', "")
    pieces = []
    for token in jieba.cut(flattened, cut_all=False):
        if token in punctuation_set:
            continue
        # split() yields nothing for whitespace-only tokens and breaks up a
        # token that contains whitespace, which is what collapsing the
        # whitespace of the concatenated output amounts to.
        pieces.extend(token.split())
    return " ".join(pieces) + '\n'
def _init_segmenter():
    """Point jieba at the custom dictionary file under ``BASE_DIR``."""
    jieba.set_dictionary(os.path.join(BASE_DIR,
                                      'material/jieba_dict/dict.txt.big'))


def main():
    """Segment every line of ``--input`` and write the result to ``--output``.

    Command-line arguments:
        --input   Path of the UTF-8 text file to read, one sentence per line.
        --output  Path of the UTF-8 text file to write (overwritten).
        --njobs   Number of worker processes in the pool (default 1).
        --hungyi  ``store_false`` flag: when it is absent the lines are
                  processed by ``cutHungyi``; passing it switches to ``cut``.

    The output file is only opened after all lines have been segmented.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, required=True,
                        help='UTF-8 text file to segment, one sentence per line')
    parser.add_argument('--output', type=str, required=True,
                        help='UTF-8 text file the segmented lines are written to')
    parser.add_argument('--njobs', type=int, default=1,
                        help='number of worker processes (default: 1)')
    # NOTE(review): store_false makes args.hungyi True by default, so giving
    # --hungyi on the command line *disables* cutHungyi. This looks inverted
    # but is kept as-is so existing invocations keep their behaviour.
    parser.add_argument('--hungyi', action='store_false',
                        help='pass this flag to use cut() instead of the '
                             'default cutHungyi()')
    args = parser.parse_args()

    # jieba custom setting.
    _init_segmenter()

    print("Start cuting text from %s..." % args.input)
    segment = cutHungyi if args.hungyi else cut
    with open(args.input, 'r', encoding='utf-8') as content:
        # The initializer repeats the dictionary setup inside every worker:
        # under the spawn/forkserver start methods workers do not inherit the
        # parent's jieba state. The context manager makes sure the pool is
        # torn down even when map() raises.
        with Pool(processes=args.njobs, initializer=_init_segmenter) as pool:
            output_data = pool.map(segment, content)

    with open(args.output, 'w', encoding='utf-8') as fout:
        fout.writelines(output_data)
    print("Done")
# Run the command-line entry point only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()