-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlemmatizer.py
More file actions
36 lines (29 loc) · 1021 Bytes
/
lemmatizer.py
File metadata and controls
36 lines (29 loc) · 1021 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
# Convert a POS tag produced by the tagger into a WordNet-compatible POS.
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant matching *treebank_tag*.

    The tag is classified by its first letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag
    (including the empty string) is treated as a noun.
    """
    # Ordered (tag prefix, WordNet attribute name) pairs. The attribute is
    # looked up on ``wordnet`` only for the pair that actually matches.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, pos_name in prefix_table:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, pos_name)
    # No known prefix matched: fall back to noun.
    return wordnet.NOUN
# Shared WordNet lemmatizer instance used by process_content().
lemmatizer = WordNetLemmatizer()

# Sample input: the verb "play" in several inflected forms.
example_text = "I like to play. I played yesterday. I am playing right now. He plays soccer"

# Sentence-level split of the sample text; process_content() iterates over it.
tokenized = sent_tokenize(example_text)
def process_content():
    """Print the lemma of every token in the module-level ``tokenized`` list.

    Each sentence in ``tokenized`` is split into words and POS-tagged. Every
    word is then lemmatized using the WordNet POS that ``get_wordnet_pos``
    derives from its tag, and the lemma is printed on its own line.

    Any ``Exception`` raised during processing is caught and its message is
    printed instead of propagating; processing stops at the first error.
    """
    try:
        for sentence in tokenized:
            words = nltk.word_tokenize(sentence)
            # pos_tag returns (word, tag) pairs in token order, so unpack the
            # pairs directly rather than indexing back into the tagged list
            # with a loop counter that shadows the sentence variable.
            for word, tag in nltk.pos_tag(words):
                print(lemmatizer.lemmatize(word, get_wordnet_pos(tag)))
    except Exception as e:
        # Deliberate best-effort: report the failure instead of crashing.
        print(str(e))
# Run the demo at module level: print the lemma of each token in example_text.
process_content()