story_corpus_processer.py
import nltk
from collections import Counter
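
# NOTE: nltk's tokenizers, POS tagger and NE chunker rely on data packages
# that are not shipped with the library itself. Depending on the installed
# NLTK version, a one-time setup along these lines may be needed:
#
#   nltk.download("punkt")
#   nltk.download("averaged_perceptron_tagger")
#   nltk.download("maxent_ne_chunker")
#   nltk.download("words")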
STORY_CORPUS = "story_corpus.txt"
STORY_SEP = "=="
SYSTEM_SEP = "===="
DATASET = "wikilarge"

def parse_data(file_loc):
    """ Reads in the corpus in file_loc, strips story and system
    separators and returns a list of lines, each containing one
    paragraph of story text.
    """
    print("reading in: " + file_loc)
    with open(file_loc) as f:
        lines = f.readlines()
    # remove story and system separators
    return [l for l in lines if not
            (l.startswith(STORY_SEP) or l.startswith(SYSTEM_SEP))]

def tokenize(lines):
    """ For each line in `lines` this method performs
    word tokenization and replaces brackets as well as
    quotation marks with unified symbols as employed by
    the appropriate training corpus.
    """
    print("Tokenizing " + str(len(lines)) + " lines...")
    tokenized_lines = []
    for l in lines:
        tokenized_lines.append(nltk.word_tokenize(l))
    return [" ".join(_fix_q_marks_and_brackets(l)) for l in tokenized_lines]
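
# Illustrative example (actual output depends on the tokenizer version and
# the active DATASET setting; shown for DATASET == "wikilarge"):
#   tokenize(["He said ``hi'' (quietly)."])
#   -> ["He said '' hi '' -LRB- quietly -RRB- ."]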

def save_data(lines, filename):
    """ Saves each element in `lines` into a file `filename`, separated by
    newlines.
    """
    with open(filename, "w") as f:
        for line in lines:
            f.write(line + "\n")

def split_in_sentences(lines):
    """ Performs sentence tokenization. """
    text = " ".join(lines)
    return nltk.tokenize.sent_tokenize(text)

def create_sent_pairs(lines):
    """ Takes a list of sentences and joins each pair of subsequent
    sentences in `lines` into a single string. Each sentence is
    contained in exactly one pair (no overlapping).
    """
    tup_lines = []
    for sen1, sen2 in _iterate_pairs(lines):
        tup_lines.append(sen1 + " " + sen2)
    return tup_lines
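
# Illustrative example (note the trailing space when the last sentence
# has no partner):
#   create_sent_pairs(["A.", "B.", "C."]) -> ["A. B.", "C. "]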

def ne_anonymization(lines):
    """ For each sentence in `lines`, this method identifies named entities
    and anonymizes them by replacing each occurrence with a placeholder of
    the form CATEGORY@NUM, where CATEGORY stands for the type of named
    entity, and NUM is the ordinal number of this entity within this
    sentence (NOT the whole corpus). Returns a list of sentences.
    """
    new_lines = []
    for sent in lines:
        new_sent = []
        tag_counter = Counter()
        tag_dict = {}
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                # named entity: key on the lowercased first token plus tag
                entity = chunk[0][0].lower()
                ner_tag = chunk.label()
                if ner_tag == "GPE":
                    ner_tag = "LOCATION"
                if entity + ner_tag not in tag_dict:
                    tag_counter[ner_tag] += 1
                    tag_dict[entity + ner_tag] = tag_counter[ner_tag]
                new_sent.append(ner_tag + "@" + str(tag_dict[entity + ner_tag]))
            else:
                # plain token: a (word, pos_tag) tuple
                new_sent.append(chunk[0])
                assert len(chunk) == 2
        new_lines.append(" ".join(new_sent))
    return new_lines
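
# Illustrative example (the exact categories depend on nltk's pretrained
# NE chunker, so real output may differ):
#   ne_anonymization(["John met Mary in Paris ."])
#   -> ["PERSON@1 met PERSON@2 in LOCATION@1 ."]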

def deunk(orig_loc, generated_loc, safe_loc):
    """ Removes <unk> words from text generated by a textual embellishment
    system by replacing them with the words at the same position in the
    file that was used as input to the embellishment system.
    Takes as input the location of a file generated by an embellishment
    system, the location of the file that was used as input for the
    embellishment system, as well as the location of an output file.
    """
    with open(orig_loc, "r") as f:
        orig_lines = f.readlines()
    with open(generated_loc, "r") as f:
        gen_lines = f.readlines()
    gen_lines_deunked = []
    for i, line in enumerate(gen_lines):
        gen_line_list = line.split(" ")
        orig_line_list = orig_lines[i].split(" ")
        for j, word in enumerate(gen_line_list):
            if word == "<unk>":
                gen_line_list[j] = orig_line_list[j]
        gen_lines_deunked.append(" ".join(gen_line_list))
    with open(safe_loc, "w") as f:
        f.writelines(gen_lines_deunked)
    count, same = 0, 0
    for i, line in enumerate(gen_lines_deunked):
        count += 1
        if line == orig_lines[i]:
            same += 1
    print("Correct reproduction: " + str(same) + "/" + str(count) + " = "
          + str(same / float(count)))
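
# Illustrative example: given a generated line "the <unk> sat ." and the
# corresponding input line "the cat sat .", the saved line becomes
# "the cat sat .". Replacement is purely position-based, so generated and
# input lines are assumed to be token-aligned.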

def _fix_q_marks_and_brackets(line):
    line = [l.replace("(", "-LRB-").replace(")", "-RRB-") for l in line]
    if DATASET == "docaligned":
        return line
    if DATASET == "wikilarge":
        # unify opening quotation marks (``) with closing ones ('')
        return [l.replace("``", "''") for l in line]
    return line

def _iterate_pairs(l):
    pairs = list(zip(l, l[1:]))[::2]
    if len(l) % 2 == 1:
        # if l has an uneven number of elements, zip ignores the
        # (unpaired) last one
        pairs.append((l[-1], ""))
    return pairs
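
# Illustrative example:
#   _iterate_pairs(["a", "b", "c"]) -> [("a", "b"), ("c", "")]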

if __name__ == "__main__":
    lines = parse_data(STORY_CORPUS)
    if DATASET == "wikilarge":
        lines = split_in_sentences(lines)
    lines = tokenize(lines)
    lines = create_sent_pairs(lines)
    lines = ne_anonymization(lines)
    save_data(lines, "story_corpus.pairs.ne.txt")