-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_generator.py
More file actions
104 lines (87 loc) · 2.92 KB
/
text_generator.py
File metadata and controls
104 lines (87 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import progressbar
import random
import re
from collections import Counter
from nltk import trigrams
from nltk.tokenize import WhitespaceTokenizer
# Number of sentences to generate when the script is run as a program.
LINES = 100
# Matches a token whose final character is sentence-ending punctuation.
ENDING = re.compile(r'[.!?]$')
# (two-word head, tail) pairs built from the corpus;
# populated by generate_paragraph() before any generation happens.
_trigrams = []
def gen_model(start_token):
    """
    Builds a frequency model of possible continuations for a head token.

    :param start_token: two-word head to look up in the trigram list
    :return: list of (tail, count) pairs for that head, excluding bare
             sentence terminators
    """
    # Collect every tail whose head matches the requested token.
    matching_tails = (tail for head, tail in _trigrams if head == start_token)
    frequencies = Counter(matching_tails)
    # Exclude tails that are a lone '.', '!' or '?' token.  re.match anchors
    # at the start of the string, so with the '[.!?]$' pattern only
    # single-character punctuation tokens are rejected — words that merely
    # END in punctuation are kept (they are what terminates a sentence).
    return [(tail, count) for tail, count in frequencies.items()
            if not re.match(ENDING, tail)]
def gen_sentence(start_gen):
    """
    Generates a sentence starting from the next head yielded by start_gen.

    :param start_gen: generator that yields capitalized two-word heads
    :return: a string representing the generated sentence
    :raises ValueError: propagated from gen_next when a head has no tails
    """
    # Idiomatic next() instead of calling __next__() directly.
    start = next(start_gen)
    output = start.split()
    i = 1
    while True:
        start = output[i]
        # Stop once the sentence has at least 5 words and the current
        # word ends with sentence-terminating punctuation.
        if len(output) >= 5 and re.search(ENDING, start):
            break
        if re.match(ENDING, start):
            # Current token is a lone terminator: pull a fresh head.
            word = next(start_gen)
        else:
            word = gen_next(' '.join([output[i - 1], start]))
        output.append(word)
        i += 1
    return ' '.join(output)
def gen_next(token):
    """
    Picks the token that follows a two-word head, weighted by frequency.

    :param token: two-word head string
    :return: a string representing the chosen continuation token
    :raises ValueError: when the head has no recorded tails
    """
    model = gen_model(token)
    # Tuple-unpacking zip(*model) raises ValueError for an empty model;
    # generate_paragraph relies on that to retry with a different head.
    candidates, counts = zip(*model)
    # Weighted random draw over the observed continuations.
    return random.choices(candidates, weights=counts)[0]
def gen_start():
    """
    Infinite generator of sentence-starting heads.

    Yields heads whose first character is uppercase, chosen uniformly at
    random (with replacement) from the trigram model.
    """
    starters = [head for head, _ in _trigrams if head[0].isupper()]
    while True:
        yield random.choice(starters)
def generate_paragraph(lines=10):
    """
    Generates a paragraph of randomly generated sentences from a corpus.

    Repeatedly prompts for a corpus filename until one can be opened,
    builds the trigram model from its contents, then generates the
    requested number of sentences.

    :param lines: number of sentences to generate
    :return: the generated sentences, each followed by a newline
    """
    global _trigrams
    while True:
        print('Enter corpus filename:')
        filename = input()
        try:
            # Context manager ensures the file handle is closed
            # (the original implementation leaked it).
            with open(filename, "r", encoding='utf-8') as file:
                text = ' '.join(file.readlines()).replace('"', '')
        except FileNotFoundError:
            print('File does not exist!')
            continue
        words = WhitespaceTokenizer().tokenize(text)
        # Collapse each trigram into a (two-word head, tail) pair.
        _trigrams = [(' '.join((w1, w2)), w3)
                     for w1, w2, w3 in trigrams(words)]
        start_gen = gen_start()
        sentences = []
        for _ in progressbar.progressbar(range(lines)):
            # Some heads never generate tails, making gen_next raise
            # ValueError — retry until a sentence succeeds.
            while True:
                try:
                    sentences.append(gen_sentence(start_gen))
                except ValueError:
                    continue
                break
        # Join once instead of quadratic '+=' string accumulation.
        return ''.join(f'{s}\n' for s in sentences)
print(generate_paragraph(LINES))