-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstopwords.py
More file actions
32 lines (25 loc) · 1.01 KB
/
stopwords.py
File metadata and controls
32 lines (25 loc) · 1.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
"""
Filtering stopwords
"""
# pylint: disable=C0103
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from samples import sample, sample_ct
# Build one stopword set per language from the NLTK stopwords corpus, then
# print each set so the vocabularies can be inspected.
en_sw, pt_sw, es_sw, fr_sw = (
    set(stopwords.words(language))
    for language in ("english", "portuguese", "spanish", "french")
)
print(f"en stopwords: {en_sw}\n")
print(f"pt stopwords: {pt_sw}\n")
print(f"es stopwords: {es_sw}\n")
print(f"fr stopwords: {fr_sw}\n")
# Show the default sample text without stopwords: lowercase it, tokenize it,
# and keep only the tokens that are absent from the English stopword set.
words = [
    token
    for token in word_tokenize(sample.lower())
    if token not in en_sw
]
print(f"Sample(default) with no stopwords: {words}\n")
# Contractions: apply the same stopword filter to the sample that contains
# contractions. Tokenization leaves contraction fragments (such as 's and
# n't) that are not in the stopwords list, so they survive this first pass.
words = [word for word in word_tokenize(sample_ct.lower()) if word not in en_sw]
print(f"Sample(contractions) with no stopwords: {words}\n")
# Workaround for the leftover fragments: keep only purely alphabetic tokens.
# str.isalpha() is already False for any token containing an apostrophe, so a
# separate "'" check is redundant. Note that this filter also discards
# punctuation and numeric tokens, not just contraction fragments.
words = [word for word in words if word.isalpha()]
print(f"Contractions revised: {words}\n")