Python-POS-Tagger/HelveySean_Assignment_4.py at master · seanhelvey/Python-POS-Tagger · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
#! /usr/bin/env python

#Before running this file, make sure the paths to trainingFile and developmentFile are correct here.
#trainingFile: read line by line; on each line the first two whitespace-separated
#tokens are used as (word, tag). Lines with fewer than two tokens are treated as
#having an empty word and an empty tag.
#developmentFile: read line by line; the first token of each non-blank line is the
#word to tag, and a blank line ends the current sentence.
trainingFile = "/Users/seanhelvey/Desktop/HelveySean_Assignment_4/training.pos"
developmentFile = "/Users/seanhelvey/Desktop/HelveySean_Assignment_4/development.text"

import copy
import collections

class bigram(object):
    """One (prior word/tag, current word/tag) observation plus its counts.

    The count attributes are filled in by the caller; scoreCalc() then turns
    them into a transition probability, an emission probability and their
    product.
    """

    def __init__(self):
        #the previous token and the current token, each with its tag
        self.priorWord = ''
        self.priorTag = ''
        self.currentWord = ''
        self.currentTag = ''

        #(priorTag, currentTag) and (currentTag, currentWord) once assigned
        self.transition = ()
        self.emission = ()

        #raw counts used by scoreCalc()
        self.wordCount = 0
        self.tagCount = 0
        self.priorTagCount = 0
        self.transCount = 0
        self.emitCount = 0

        #tags the current word may take
        self.possTags = []

        #results of scoreCalc()
        self.transProb = 0
        self.emishProb = 0
        self.finalProb = 0

    def scoreCalc(self):
        """Compute transProb, emishProb and finalProb from the stored counts.

        transProb = transCount / priorTagCount
        emishProb = emitCount / tagCount
        finalProb = transProb * emishProb

        A zero denominator (no observations of the tag) yields a probability
        of 0.0 instead of raising ZeroDivisionError.
        """
        if self.priorTagCount:
            self.transProb = self.transCount/float(self.priorTagCount)
        else:
            self.transProb = 0.0

        if self.tagCount:
            self.emishProb = self.emitCount/float(self.tagCount)
        else:
            self.emishProb = 0.0

        self.finalProb = self.transProb * self.emishProb

class sentences(object):
    """A group of alternative tag sequences (sentence objects) for one input sentence."""

    def __init__(self):
        #the alternative sentence objects
        self.list = []

        #highest sentence score found so far; maintained by the caller
        self.sentsMax = 0

    def addWord(self,gram):
        """Append gram to the end of every sentence in the group.

        The same gram object is shared by all sentences (no copy is made).
        """
        for candidate in self.list:
            candidate.bigrams.append(gram)

    def addSentence(self):
        """Add a new, empty sentence to the group."""
        self.list.append(sentence())

    def replicate(self,gram):
        """Double the group: for each sentence add a copy whose last bigram is gram.

        Each existing sentence is deep-copied, so the bigrams in the copy are
        independent objects; the copy's final bigram is then replaced by gram.
        The originals are kept unchanged and the copies are placed after them.

        Raises IndexError if a sentence has no bigrams to replace.
        """
        alternatives = []
        for existing in self.list:
            clone = copy.deepcopy(existing)

            #swap the last bigram for the alternative reading
            clone.bigrams.pop()
            clone.bigrams.append(gram)

            alternatives.append(clone)

        self.list = self.list + alternatives

class sentence(object):
    """An ordered list of bigrams together with the product of their probabilities."""

    def __init__(self):
        self.bigrams = []
        self.score = 0
        self.max = 0

    def sentScore(self):
        """Set self.score to the product of finalProb over all bigrams.

        The first bigram's finalProb is taken as is; every later one is
        converted with float() before multiplying. With no bigrams the
        score is left unchanged.
        """
        grams = self.bigrams
        if not grams:
            return

        self.score = grams[0].finalProb
        for later in grams[1:]:
            self.score = self.score * float(later.finalProb)

#list of bigrams for aggregating data
theBigrams = []

#dictionaries for counting
wordDic = collections.defaultdict(int)    #word -> occurrences
tagDic = collections.defaultdict(int)     #tag -> occurrences
transDic = collections.defaultdict(int)   #(prior tag, tag) -> occurrences
emitDic = collections.defaultdict(int)    #(tag, word) -> occurrences
possDic1 = collections.defaultdict(list)  #word -> every tag seen with it
possDic2 = collections.defaultdict(int)   #word -> occurrences (incremented like wordDic)

#last pos -> most likely pos
lastDic = collections.defaultdict(str)

#variables to store for the next iteration
lastWord = ''
lastTag = ''
lastTagCount = 0

#FIRST PASS~~~~~~
#parsing input data
#aggregating dictionaries
#the with-block closes the training file once every line has been read
with open(trainingFile,'r') as file:
    for line in file:

        gram = bigram()
        thisLine = line.split()

        #assign PRIOR word and tag to gram
        gram.priorWord = lastWord
        gram.priorTag = lastTag
        gram.priorTagCount = lastTagCount

        #if the current line contains a word and a tag
        if len(thisLine) > 1:

            #assign CURRENT word and tag to gram
            gram.currentWord = thisLine[0]
            gram.currentTag = thisLine[1]

            #and add to dictionary -> list
            possDic1[gram.currentWord].append(gram.currentTag)

        else:
            #lines with fewer than two tokens count as an empty word with an empty tag
            gram.currentWord = ''
            gram.currentTag = ''

        #store transition & emission
        gram.transition = (gram.priorTag,gram.currentTag)
        gram.emission = (gram.currentTag,gram.currentWord)

        #increment dictionaries
        transDic[gram.transition] += 1
        emitDic[gram.emission] += 1
        wordDic[gram.currentWord] += 1
        tagDic[gram.currentTag] += 1
        possDic2[gram.currentWord] += 1

        #add the gram to our list
        theBigrams.append(gram)

        #set temp variables for next gram
        lastWord = gram.currentWord
        lastTag = gram.currentTag
        #tagCount has not been filled in yet, so this stays 0 during this pass;
        #priorTagCount is assigned again from tagDic in the counting pass
        lastTagCount = gram.tagCount


#Remove duplicate tags from each word's tag list in possDic1
for thingy in possDic1:
    possDic1[thingy]=list(set(possDic1[thingy]))

#lastDic maps tag -> the tag that most often follows it.
#Every known tag starts with "" so tags without a successor still have an entry.
for tag in tagDic:
    lastDic[tag] = ""

#All transitions out of the same tag share one denominator, so the most
#probable successor is simply the one with the highest raw count; comparing
#counts avoids float division and float equality tests.
#">=" lets a later transition win a tie.
bestTransCount = collections.defaultdict(int)
for trans in transDic:
    if transDic[trans] >= bestTransCount[trans[0]]:
        bestTransCount[trans[0]] = transDic[trans]

        #mapping tag -> tag + 1
        lastDic[trans[0]] = trans[1]

#(word, tag) -> the bigram object stored last for that pair
gramDic = collections.defaultdict(bigram)

#COUNTING~~~~~~~~~~~~~
#Copy the aggregated counts out of the dictionaries onto each bigram,
#score it, and index it by (word, tag)
for entry in theBigrams:
    curWord = entry.currentWord
    curTag = entry.currentTag

    entry.wordCount = wordDic[curWord]
    entry.tagCount = tagDic[curTag]
    entry.priorTagCount = tagDic[entry.priorTag]
    entry.transCount = transDic[entry.transition]

    #only take the emission count when the stored emission matches this word/tag
    if entry.emission[0] == curTag and entry.emission[1] == curWord:
        entry.emitCount = emitDic[entry.emission]

    entry.possTags = possDic1[curWord]

    entry.scoreCalc()

    #a later bigram with the same (word, tag) replaces an earlier one
    gramDic[(curWord,curTag)] = entry

#SECOND PASS~~~~~~~~
#Reads the development file one word per line and builds, for every input
#sentence, a "sentences" group holding the chosen bigram for each word.
#A blank line ends the current sentence.

sentsList = []      #one "sentences" group per input sentence
newSents = 1        #1 while no sentence is in progress
sentsListNum = 0    #number of groups appended to sentsList
theLastTag = ''     #tag chosen for the previous word ('' at a sentence start)

#the with-block closes the development file when the loop finishes
with open(developmentFile,'r') as file:
    for word in file:

        wordList = word.split()
        currentTag = ''

        if len(wordList) > 0:
            theWord = wordList[0]

            #.get() avoids inserting an empty entry for every unknown word
            tags = possDic1.get(theWord, [])

            if newSents == 1:
                newSents = 0
                sents = sentences()
                sents.addSentence()

            if len(tags) >= 1:

                #known word: keep the single most probable (word, tag) bigram;
                #">=" lets a later tag win a tie, and only one bigram is added
                #so the word appears exactly once in the output
                gram = None
                for tag in tags:
                    candidate = gramDic[(theWord,tag)]
                    if gram is None or candidate.finalProb >= gram.finalProb:
                        gram = candidate

            else:
                #unknown word: guess the tag that most often follows the previous tag
                gram = bigram()
                gram.currentWord = theWord
                gram.currentTag = lastDic[theLastTag]
                gram.finalProb = .0001
                gramDic[(gram.currentWord,gram.currentTag)] = gram

            #remember the tag actually chosen, for the next unknown word
            currentTag = gram.currentTag
            sents.addWord(gram)

        elif newSents == 0:
            #blank line closes the sentence in progress; leading or repeated
            #blank lines have nothing to close and are ignored
            newSents = 1
            sentsList.append(sents)
            sentsListNum = sentsListNum + 1

        theLastTag = currentTag

#keep the final sentence when the file does not end with a blank line
if newSents == 0:
    newSents = 1
    sentsList.append(sents)
    sentsListNum = sentsListNum + 1

#Find max
#Score every candidate sentence and remember the best score of each group
for sents in sentsList:
    for sent in sents.list:
        sent.sentScore()
        if sent.score > sents.sentsMax:
            sents.sentsMax = sent.score


#Write output
#For each group, write the first sentence whose score equals the group's best
#as "word<TAB>tag" lines followed by a blank line. One extra newline ends the file.
#The with-block closes out.txt so everything written is flushed to disk.
with open("out.txt","w") as outfile:
    for sents in sentsList:
        for sent in sents.list:
            if sent.score == sents.sentsMax:
                for gram in sent.bigrams:
                    outfile.write(str(gram.currentWord)+"\t"+str(gram.currentTag)+"\n")
                outfile.write("\n")

                #only the first best-scoring sentence is written
                break

    outfile.write("\n")