-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathmy_tokenizer.py
More file actions
31 lines (24 loc) · 886 Bytes
/
my_tokenizer.py
File metadata and controls
31 lines (24 loc) · 886 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# -*- coding: utf-8 -*-
# Module setup: create a single shared MeCab tagger used by mecab_tokens().
import MeCab
# Alternative dictionary (NEologd); uncomment to use it instead of the default:
# path = '-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd'
path = ''  # empty string -> MeCab uses its default system dictionary
# Example of adding a user dictionary with -u (machine-specific path):
# -u /mnt/2bddf92b-47f9-4809-95a5-b91e7f25af27/myData/GitHub/bookdown_textmining/data/motohiro.dic
tagger = MeCab.Tagger(path)
def mecab_tokens(text, pos=('名詞', '形容詞', '動詞')):
    """Tokenize Japanese *text* with MeCab and return selected terms.

    All whitespace is removed from the input before parsing. For each
    token, the dictionary (base) form is used when MeCab provides one;
    otherwise the surface form is kept.

    Args:
        text: Input string to tokenize.
        pos: Part-of-speech tags (first feature field) to keep.
            An empty sequence keeps every token. Defaults to nouns,
            adjectives and verbs. (A tuple, not a list, so the default
            is immutable — avoids the shared-mutable-default pitfall.)

    Returns:
        list[str]: Extracted terms in order of appearance.
    """
    # Collapse all whitespace so MeCab parses one contiguous string.
    text = ''.join(text.split())
    node = tagger.parseToNode(text)
    word_list = []
    while node:
        # BOS/EOS sentinel nodes have an empty surface — skip them.
        if node.surface != '':
            elem = node.feature.split(',')
            # elem[6] is the lemma; '*' (or a short feature string, as some
            # dictionaries emit for unknown words) means no lemma is
            # available, so fall back to the surface form.
            if len(elem) > 6 and elem[6] != '*':
                term = elem[6]
            else:
                term = node.surface
            if not pos or elem[0] in pos:
                word_list.append(term)
        node = node.next
    return word_list
if __name__ == '__main__':
    # Smoke test: tokenize a sample sentence and show the extracted terms.
    print(mecab_tokens("今日の午後は八宝菜を食べました。"))