-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcollocations.py
More file actions
28 lines (22 loc) · 1.18 KB
/
collocations.py
File metadata and controls
28 lines (22 loc) · 1.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
"""
Collocations - Words that usually go together in sentences
"""
# pylint: disable=C0103
from nltk.corpus import webtext, stopwords
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
# Lower-cased tokens from the script of Monty Python and the Holy Grail.
holy_grail = [wd.lower() for wd in webtext.words('grail.txt')]
bc_finder = BigramCollocationFinder.from_words(holy_grail)

# Naive: rank every bigram by likelihood ratio, with no filtering at all.
collocations = bc_finder.nbest(BigramAssocMeasures.likelihood_ratio, 20)
print(f"Naive top 20 bigram collocations of the holy grail: {collocations}\n")

# Build the stop-word set exactly once. The word filter below is invoked by
# the finders many times; constructing the set inside the filter meant the
# stop-word list was re-fetched and re-hashed on every single call.
english_stopwords = frozenset(stopwords.words('english'))


def is_noise_word(wd):
    """Return True when ``wd`` should disqualify an n-gram candidate.

    A token counts as noise when it is shorter than three characters or
    when it appears in the English stop-word list.
    """
    return len(wd) < 3 or wd in english_stopwords


# Refined: same bigram finder, after filtering out the noise words.
bc_finder.apply_word_filter(is_noise_word)
collocations = bc_finder.nbest(BigramAssocMeasures.likelihood_ratio, 20)
print(f"Refined top 20 bigram collocations of the holy grail: {collocations}\n")

# Trigram collocations, using the same noise-word filter as the bigrams.
tc_finder = TrigramCollocationFinder.from_words(holy_grail)
tc_finder.apply_word_filter(is_noise_word)
trigrams = tc_finder.nbest(TrigramAssocMeasures.likelihood_ratio, 20)
print(f"Top 20 trigram collocations of the holy grail: {trigrams}\n")