-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprepare.py
More file actions
117 lines (93 loc) · 2.8 KB
/
prepare.py
File metadata and controls
117 lines (93 loc) · 2.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
Description: pre-process a text file for further analysis.
Functions provided, listed as name(arguments) : result:
rmTags(filePath) : filePath.noTag
rmPunct(filePath) : filePath.noPunct
wordsFreq(filePath) : wordsVec, wordsCount
lenFreq(wordsVec) : lenVec
hotWords(wordsVec, n) : hotWordsVec
"""
import collections
import operator
import os
import re
import string
def rmTags(filePath):
    """
    Remove HTML tags from the given file and write the result to a new file.

    Every match of the pattern ``<[^>]+>`` is replaced by a single space.
    The file is processed as one string, so a tag that is split across
    several lines is removed as well.

    input: path to a text file (e.g. post.txt) with unwanted HTML tags
    output: path of the written file, filePath + ".noTag"
            (e.g. "post.txt.noTag"); an existing file at that path is
            overwritten
    """
    # Context managers close the handles even if reading or writing fails.
    with open(filePath, 'r') as src:
        html = src.read()

    opPath = filePath + ".noTag"
    tags = re.compile(r'<[^>]+>')
    # Mode 'w' truncates an existing output file, so it does not have to be
    # removed first.
    with open(opPath, 'w') as out:
        out.write(tags.sub(' ', html))
    return opPath
def rmPunct(filePath):
    """
    Remove punctuation from the given file and write the result to a new file.

    Hyphens are replaced by a space (so "well-known" becomes two words);
    every other character of ``string.punctuation`` is deleted.

    input: path to a text file
    output: path of the written file, filePath + ".noPunct"; an existing
            file at that path is overwritten
    """
    # One translation table: delete all punctuation, except that '-' turns
    # into a space.
    table = {ord(c): None for c in string.punctuation}
    table[ord('-')] = ' '

    # Context managers close the handles even if reading or writing fails.
    with open(filePath, 'r') as src:
        text = src.read()

    opPath = filePath + '.noPunct'
    # Mode 'w' truncates an existing output file, so it does not have to be
    # removed first.
    with open(opPath, 'w') as opFile:
        opFile.write(text.translate(table))
    return opPath
# No word-form detection: e.g. 'go' and 'went' are counted as two different words.
def wordsFreq(filePath):
    """
    Count how often each word occurs in the given file (e.g. post.txt).

    Words are the lower-cased, whitespace-separated tokens of the file.
    Tabs count as separators, so tab-separated words are not merged into
    one token.

    input: path to a text file
    output: 1. wordsVec: a list of (word, times appeared) tuples, sorted by
               descending count; words with equal counts keep the order in
               which they first appear in the file
            2. wordsCount: the total number of words in the file, an integer
    """
    counts = collections.Counter()
    # The context manager closes the file even if reading fails.
    with open(filePath) as src:
        for line in src:
            # split() without arguments splits on any run of whitespace and
            # never yields empty strings.
            counts.update(line.lower().split())

    wordsCount = sum(counts.values())
    # most_common() sorts by count (descending) with a stable sort, which
    # makes the order of ties deterministic.
    wordsVec = counts.most_common()
    return wordsVec, wordsCount
def lenFreq(wordsVec):
    """
    Count how many word occurrences there are for each word length.

    input: wordsVec, an iterable of (word, times appeared) pairs such as the
           first value returned by wordsFreq
    output: lenVec, a list in which element k is the total number of
            occurrences of words that are k characters long; it always has
            at least 20 entries (lengths 0-19) and grows when a longer word
            is present
    """
    lenVec = [0] * 20
    for word, count in wordsVec:
        length = len(word)
        # Grow the list on demand so that long words do not raise IndexError.
        if length >= len(lenVec):
            lenVec.extend([0] * (length - len(lenVec) + 1))
        lenVec[length] += count
    return lenVec
def hotWords(wordsVec, n):
    """
    Return the words of the first n entries of wordsVec.

    input: wordsVec, a sequence of (word, times appeared) entries, and n,
           the number of leading entries to take
    output: a list holding the first element of each of those entries, in
            the same order
    """
    leading = wordsVec[:n]
    return list(map(operator.itemgetter(0), leading))
def prepare(filePath, n=30):
    """
    Run the whole pipeline on one file and return its most frequent words.

    The file is passed through rmTags and then rmPunct, the words of the
    result are counted with wordsFreq, and hotWords picks the leading
    entries. The intermediate files written by rmTags and rmPunct are not
    removed here.

    input: filePath, path to the text file to process
           n, how many of the most frequent words to return (default 30)
    output: a list of up to n words, most frequent first
    """
    tagFree = rmTags(filePath)
    punctFree = rmPunct(tagFree)
    # The total word count returned by wordsFreq is not needed here.
    wordsVec, _ = wordsFreq(punctFree)
    return hotWords(wordsVec, n)