-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtrain.py
More file actions
70 lines (66 loc) · 2.09 KB
/
train.py
File metadata and controls
70 lines (66 loc) · 2.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import os
import string
# File names grouped by age/gender bucket; train() appends to these lists.
# Each key needs its own list object, hence the comprehension rather than
# dict.fromkeys (which would share one list between all keys).
files = {
    tag: []
    for tag in (
        'teen_f',
        'teen_m',
        'adult_f',
        'adult_m',
        'mature_f',
        'mature_m',
    )
}
# Translation table applied to every line: a hyphen becomes a space (so a
# hyphenated compound yields separate words) and every other punctuation
# character is deleted.
_PUNCT_TABLE = {ord(c): None for c in string.punctuation}
_PUNCT_TABLE[ord('-')] = ' '


def count(file_path, vocab):
    """Tally the words of one file into *vocab* and return the word total.

    The file is read as latin-1. Lines whose first character is ``<`` are
    skipped. On every other line hyphens are turned into spaces, remaining
    punctuation is removed, the text is lower-cased and then split on
    whitespace.

    Args:
        file_path: path of the text file to read.
        vocab: dict mapping word -> occurrence count; updated in place.
            Only words of at most 20 characters are recorded.

    Returns:
        The number of words seen in the file, including words longer than
        20 characters that were left out of *vocab*.
    """
    tot = 0
    # The context manager guarantees the handle is closed even on error.
    with open(file_path, encoding="latin-1") as f:
        for line in f:
            if line.startswith('<'):
                continue
            # split() with no argument discards the trailing newline and
            # collapses runs of spaces/tabs, so neither empty strings nor
            # words with a newline glued on are ever counted.
            for word in line.translate(_PUNCT_TABLE).lower().split():
                tot += 1
                if len(word) <= 20:
                    vocab[word] = vocab.get(word, 0) + 1
    return tot
def _tag_for(file_name):
    """Return the age/gender bucket for *file_name*, or None if unparsable.

    The second dot-separated field of the name is read as the gender and
    the third as the integer age. Ages below 20 map to ``teen``, below 30
    to ``adult``, anything else to ``mature``; a gender of ``female`` gives
    the ``_f`` suffix and any other value gives ``_m``.
    """
    parts = file_name.split('.')
    try:
        gender = parts[1]
        age = int(parts[2])
    except (IndexError, ValueError):
        return None
    if age < 20:
        group = 'teen'
    elif age < 30:
        group = 'adult'
    else:
        group = 'mature'
    return group + ('_f' if gender == 'female' else '_m')


def train(data_folder):
    """Build per-bucket word-frequency tables from the files in *data_folder*.

    Every file found under *data_folder* (recursively) is assigned to one
    of the buckets in the module-level ``files`` dict based on its name.
    For each bucket the words of all its files are counted, the total is
    printed, and ``<tag>.csv`` is written to the current working directory
    with one ``word,relative_frequency`` row per word that occurred more
    than once.

    Args:
        data_folder: directory containing the input files.
    """
    # Start from empty buckets so calling train() again does not count the
    # files of an earlier call a second time.
    for bucket in files.values():
        del bucket[:]

    # load data
    for root, _dir_list, file_list in os.walk(data_folder):
        for name in file_list:
            tag = _tag_for(name)
            if tag is None:
                # Stray files whose names do not carry gender/age fields
                # are reported and ignored instead of aborting the run.
                print('skipping {0}: unexpected file name'.format(
                    os.path.join(root, name)))
                continue
            # Keep the path relative to data_folder: for a file directly in
            # data_folder this is just its name, and for a file in a
            # sub-directory it retains the sub-directory so that it can
            # actually be opened later.
            rel_path = os.path.relpath(os.path.join(root, name), data_folder)
            files[tag].append(rel_path)

    # count words
    for tag, rel_paths in files.items():
        vocab = {}
        tot = 0
        for rel_path in rel_paths:
            tot += count(os.path.join(data_folder, rel_path), vocab)
        print('{0} has {1} words in total'.format(tag, tot))
        # write the dict into a csv file; words seen only once are dropped
        with open(tag + '.csv', 'w', encoding="utf-8") as outfile:
            for word, num in vocab.items():
                if num != 1:
                    outfile.write('{0},{1}\n'.format(word, num / tot))