-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrainer.py
More file actions
66 lines (63 loc) · 2.38 KB
/
trainer.py
File metadata and controls
66 lines (63 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from collections import defaultdict
# Tree node
class TreeNode(object):
    """Node of a binary tree holding one symbol.

    ``val`` stores the value passed to the constructor; ``left`` and
    ``right`` are the child links and both start out as ``None``.
    """

    def __init__(self, s):
        self.val = s
        # A fresh node has no children until they are attached explicitly.
        self.left = self.right = None
# Parse a syntax tree from its bracketed notation into a tree-node representation
def parse(sentence=""):
    """Parse a bracketed syntax tree into linked ``TreeNode`` objects.

    The input looks like ``(S (NP (DT the) (NN boy)) (VP ...))``: every
    ``(`` is followed by a node label, then by either one bare word or by
    one or two bracketed children, and is closed by ``)``.  The first child
    of a node is stored in ``left`` and the second in ``right``.

    Args:
        sentence: The bracketed tree as a string.  Whitespace around
            brackets is optional.

    Returns:
        The root ``TreeNode``, or ``None`` when ``sentence`` contains no
        tokens.

    Raises:
        SyntaxError: If a node would get a third child, a bare word follows
            an existing first child, a bracket is missing its label, or the
            brackets are unbalanced.
    """
    spaced = sentence.replace("(", " ( ").replace(")", " ) ")
    # An iterator consumes tokens in O(1) each; list.pop(0) shifts the whole
    # list on every call and makes tokenising quadratic.
    tokens = iter(spaced.split())

    # Sentinel node: its ``left`` link ends up pointing at the real root.
    stack = [TreeNode("")]
    for token in tokens:
        if token == "(":
            label = next(tokens, None)
            # The token after "(" must be a label, not end-of-input or
            # another bracket.
            if label is None or label in ("(", ")"):
                raise SyntaxError("error sentence!")
            parent = stack[-1]
            child = TreeNode(label)
            if parent.left is None:
                parent.left = child
            elif parent.right is None:
                parent.right = child
            else:
                raise SyntaxError("error sentence!")
            stack.append(child)
        elif token == ")":
            # Only the sentinel is left: this ")" has no matching "(".
            if len(stack) == 1:
                raise SyntaxError("error sentence!")
            stack.pop()
        else:
            parent = stack[-1]
            if parent.left is not None:
                raise SyntaxError("error sentence!")
            parent.left = TreeNode(token)

    # Anything above the sentinel means a "(" was never closed; returning
    # from here would hand back an inner subtree instead of the root.
    if len(stack) != 1:
        raise SyntaxError("error sentence!")
    return stack[0].left
# Train the PCFG model
def train(outfile="", corpus=()):
    """Estimate PCFG rule probabilities from bracketed trees and save them.

    Every sentence in ``corpus`` is parsed with ``parse``.  Each node that
    has at least one child contributes one production ``LHS -> RHS``, where
    ``RHS`` is the left child's symbol, or the left and right children's
    symbols joined by a space.  A rule's probability is its count divided by
    the total number of productions sharing the same ``LHS``.

    The rules are written to ``outfile`` one per line as
    ``LHS # RHS # probability`` (probability with 8 decimals), ordered by
    descending probability and, for ties, by ascending ``LHS#RHS`` string.

    Args:
        outfile: Path of the file to write; it is overwritten.
        corpus: Iterable of bracketed tree strings.  Sentences that parse to
            no tree at all (no tokens) are skipped.

    Raises:
        SyntaxError: Propagated from ``parse`` for a malformed sentence.
    """
    # Keys are (lhs, rhs) tuples rather than "lhs#rhs" strings so that a
    # symbol which itself contains "#" cannot corrupt the lhs lookup.
    rule_counts = defaultdict(float)
    lhs_counts = defaultdict(float)

    for sentence in corpus:
        root = parse(sentence)
        if root is None:
            continue
        pending = [root]
        while pending:
            node = pending.pop()
            if node.left is None:
                # Leaf (or a node without a first child): no production.
                continue
            if node.right is None:
                rhs = node.left.val
            else:
                rhs = node.left.val + " " + node.right.val
                pending.append(node.right)
            # Descend into the left child for unary nodes as well, so that
            # chains such as S -> VP -> VB are counted all the way down.
            pending.append(node.left)
            rule_counts[(node.val, rhs)] += 1
            lhs_counts[node.val] += 1

    rules = [
        (lhs, rhs, count / lhs_counts[lhs])
        for (lhs, rhs), count in rule_counts.items()
    ]
    # Highest probability first; ties broken by the "lhs#rhs" string.
    rules.sort(key=lambda rule: (-rule[2], rule[0] + "#" + rule[1]))

    with open(outfile, "w") as f:
        for lhs, rhs, prob in rules:
            f.write("%s # %s # %.8f\n" % (lhs, rhs, prob))
if __name__ == '__main__':
    # Two sample parses of "... saw ... with a telescope": in the first the
    # PP attaches to the VP, in the second it attaches to the object NP.
    sample_corpus = [
        "(S(NP(DT the)(NN boy))(VP(VP(VBD saw)(NP(DT a)(NN girl)))(PP(IN with)(NP(DT a)(NN telescope)))))",
        "(S(NP(DT the)(NN girl))(VP(VBD saw)(NP(NP(DT a)(NN boy))(PP(IN with)(NP(DT a)(NN telescope))))))",
    ]
    train("model.txt", sample_corpus)