-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvn_accent_type.py
More file actions
165 lines (144 loc) · 5.74 KB
/
vn_accent_type.py
File metadata and controls
165 lines (144 loc) · 5.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# -*- coding: utf-8 -*-
import regex as re
# Vowel table: one row per base vowel. Columns 0-5 hold that vowel with each
# tone mark (column 0 is the unmarked form); the last column is the Telex key
# sequence used to type the base vowel.
bang_nguyen_am = [
    ['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
    ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
    ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
    ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
    ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
    ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
    ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
    ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
    ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
    ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
    ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
    ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y'],
]

# Telex key appended for each tone, indexed by the tone column of the table
# above (index 0 means "no tone", hence the empty string).
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

# Reverse lookup: every vowel character -> (row, tone column) in the table.
# The trailing Telex column is skipped so key sequences are never treated
# as vowels.
nguyen_am_to_ids = {
    vowel: (row_id, tone_id)
    for row_id, row in enumerate(bang_nguyen_am)
    for tone_id, vowel in enumerate(row[:-1])
}
def vn_word_to_telex_type(word):
    """
    Convert one Vietnamese word to the keystrokes typed for it in Telex.

    Every vowel found in the vowel table is replaced by the Telex sequence
    stored in the last column of its row. The key for the tone is appended
    once, at the end of the word; when several vowels carry a tone the last
    one wins. Characters that are not in the vowel table are copied as-is.

    :param word: the word to convert.
    :return: the Telex form of the word.
    """
    tone = 0
    pieces = []
    for letter in word:
        position = nguyen_am_to_ids.get(letter)
        if position is None:
            # Not a known vowel: keep the character untouched.
            pieces.append(letter)
            continue
        row, column = position
        if column:
            tone = column
        pieces.append(bang_nguyen_am[row][-1])
    # The tone key goes after the whole word (empty string when no tone).
    pieces.append(bang_ky_tu_dau[tone])
    return ''.join(pieces)
def vn_sentence_to_telex_type(sentence):
    """
    Convert an accented Vietnamese sentence to its Telex typing form.

    :param sentence: the text to convert; it is split on whitespace.
    :return: the converted words joined with single spaces.
    """
    return ' '.join(vn_word_to_telex_type(token) for token in sentence.split())
"""
End section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey
"""
"""
Start section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý
Xem tại đây: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF
"""
def chuan_hoa_dau_tu_tieng_viet(word):
    """
    Move the tone mark of a single word to its old-style position.

    Steps, in order:

    1. A word rejected by ``is_valid_vietnam_word`` is returned unchanged.
    2. Every vowel is stripped of its tone; the last tone seen is remembered.
       A ``u`` right after ``q`` or an ``i`` right after ``g`` marks the word
       as starting with "qu"/"gi", and the vowel at position 1 of such a word
       is not counted as a vowel.
    3. With fewer than two counted vowels: a word without "qu"/"gi" is
       returned exactly as received; otherwise the tone goes on the
       character at position 2 when it is a vowel, else on position 1.
    4. Otherwise, if a counted vowel is ``ê`` or ``ơ``, the first such vowel
       receives the tone.
    5. Otherwise the tone goes on the first vowel when there are exactly two
       vowels and the second one ends the word, and on the second vowel in
       every other case.

    :param word: the word to normalize.
    :return: the word with its tone mark repositioned.
    """
    if not is_valid_vietnam_word(word):
        return word

    # Vowel row -> (preceding consonant, bare vowel) for the "qu"/"gi" check.
    cluster_by_row = {9: ('q', 'u'), 5: ('g', 'i')}

    letters = list(word)
    tone = 0
    vowel_positions = []
    has_qu_or_gi = False
    for pos, letter in enumerate(letters):
        row, column = nguyen_am_to_ids.get(letter, (-1, -1))
        if row == -1:
            continue
        if row in cluster_by_row and pos != 0:
            lead, bare = cluster_by_row[row]
            if letters[pos - 1] == lead:
                letters[pos] = bare
                has_qu_or_gi = True
        if column != 0:
            # Remember the tone and leave the bare vowel in place for now.
            tone = column
            letters[pos] = bang_nguyen_am[row][0]
        if has_qu_or_gi and pos == 1:
            # The u of "qu" / i of "gi" does not count as a vowel.
            continue
        vowel_positions.append(pos)

    if len(vowel_positions) < 2:
        if not has_qu_or_gi:
            # Nothing to move: hand back the input untouched.
            return word
        if len(letters) == 2:
            row, _ = nguyen_am_to_ids.get(letters[1])
            letters[1] = bang_nguyen_am[row][tone]
        else:
            row, _ = nguyen_am_to_ids.get(letters[2], (-1, -1))
            if row != -1:
                letters[2] = bang_nguyen_am[row][tone]
            else:
                # No vowel follows, so the tone stays on the i / u itself.
                row = 5 if letters[1] == 'i' else 9
                letters[1] = bang_nguyen_am[row][tone]
        return ''.join(letters)

    # ê and ơ take priority over every other vowel.
    for pos in vowel_positions:
        row, _ = nguyen_am_to_ids[letters[pos]]
        if row in (4, 8):
            letters[pos] = bang_nguyen_am[row][tone]
            return ''.join(letters)

    ends_with_vowel_pair = (
        len(vowel_positions) == 2
        and vowel_positions[-1] == len(letters) - 1
    )
    target = vowel_positions[0] if ends_with_vowel_pair else vowel_positions[1]
    row, _ = nguyen_am_to_ids[letters[target]]
    letters[target] = bang_nguyen_am[row][tone]
    return ''.join(letters)
def is_valid_vietnam_word(word):
    """
    Tell whether all vowels of the word sit next to each other.

    Only characters present in the vowel lookup table are considered. A word
    with no vowel, or a single vowel, is accepted.

    :param word: the word to inspect.
    :return: False as soon as two vowels are separated by another character,
        True otherwise.
    """
    previous = None
    for position, letter in enumerate(word):
        if letter not in nguyen_am_to_ids:
            continue
        if previous is not None and position - previous != 1:
            return False
        previous = position
    return True
def chuan_hoa_dau_cau_tieng_viet(sentence):
    """
    Normalize the tone-mark placement of a Vietnamese sentence to the old style.

    The sentence is lower-cased and split on whitespace. For each word, the
    leading and trailing punctuation is set aside, the remaining core (letters,
    optionally with inner dots) is passed to ``chuan_hoa_dau_tu_tieng_viet``,
    and the three parts are glued back together. Words without such a core
    (numbers, bare punctuation, words containing other symbols) are kept
    unchanged.

    :param sentence: the text to normalize.
    :return: the lower-cased, normalized words joined with single spaces.
    """
    # `re` is the `regex` package imported at the top of the file; it is
    # required for the Unicode property classes \p{P} and \p{L}.
    # Groups: leading punctuation, word core, trailing punctuation.
    word_pattern = re.compile(r'(^\p{P}*)([\p{L}.]*\p{L}+)(\p{P}*$)')

    words = sentence.lower().split()
    for index, word in enumerate(words):
        match = word_pattern.match(word)
        if match is None:
            # Nothing to normalize; leave the token exactly as it is.
            continue
        prefix, core, suffix = match.groups()
        words[index] = prefix + chuan_hoa_dau_tu_tieng_viet(core) + suffix
    return ' '.join(words)
if __name__ == '__main__':
    # Small demo: the new-style "hoà" should come out as old-style "hòa".
    sample = 'anh hoà, đang làm.. gì'
    print(chuan_hoa_dau_cau_tieng_viet(sample))
    # Normalized sentence: anh hòa, đang làm.. gì