-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_reddit_db.py
More file actions
executable file
·101 lines (87 loc) · 2.69 KB
/
parse_reddit_db.py
File metadata and controls
executable file
·101 lines (87 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import sqlite3
import requests
import enchant
import nltk.data
# US-English spell-check dictionary; is_english() uses it to score what
# fraction of a sentence's words are recognizably English.
english_enchant = enchant.Dict("en_US")
# Pre-trained Punkt model for splitting a comment body into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Reddit comment dump (May2015 table) stored on an external drive —
# NOTE(review): machine-specific path; adjust for your environment.
con = sqlite3.connect('/media/glen/Seagate Backup Plus Drive/redditcomments.sqlite')
def append_result(text, label):
    """
    Append a text/label pair to the objectivity.json dataset file.

    Parameters:
        text (str): the sentence being recorded.
        label (str): its classification label.

    The file holds one JSON list of {'text': ..., 'label': ...} dicts.
    If the file does not exist yet (first run), a fresh list is started
    instead of crashing with FileNotFoundError.
    """
    entry = {'text': text, 'label': label}
    try:
        with open('objectivity.json') as f:
            data = json.load(f)
    except FileNotFoundError:
        # First run: no dataset file yet — start a new list.
        data = []
    data.append(entry)
    with open('objectivity.json', 'w') as f:
        json.dump(data, f)
def text_in_dataset(text):
    """
    Return True if *text* is already recorded in objectivity.json.

    Returns False when the text is absent, and also when the dataset
    file does not exist yet (previously this raised FileNotFoundError).
    """
    try:
        with open('objectivity.json') as f:
            data = json.load(f)
    except FileNotFoundError:
        # No dataset yet, so nothing can be a duplicate.
        return False
    return any(entry['text'] == text for entry in data)
def is_english(text):
    """
    Return True if more than 70% of the distinct whitespace-separated
    words in *text* pass the en_US enchant spell check.

    Fixes over the original:
    - an empty or whitespace-only string now returns False instead of
      raising ZeroDivisionError;
    - str.split() (no argument) splits on all whitespace, so words
      separated by tabs/newlines are handled, not just single spaces;
    - the ratio uses explicit float division so the >.7 comparison is
      meaningful even under a Python 2 interpreter.
    """
    words = set(text.split())
    if not words:
        # No words at all — cannot be "mostly English".
        return False
    hits = sum(1 for word in words if english_enchant.check(word))
    return hits / float(len(words)) > .7
def check_text(text):
    """
    Score *text* with the local ensemble of objectivity services and
    map the mean score to a label.

    Returns:
        'not english'  - the text fails the is_english() check.
        'objective'    - mean ensemble score above .95.
        'subjective'   - mean ensemble score below .6.
        'error rate'   - score in between; the ensemble is uncertain.
    """
    if not is_english(text):
        return 'not english'
    service_ports = ['5000', '5001']  # local api ports for objectivity services
    objectivity_scores = []
    for service_port in service_ports:
        response = requests.post('http://0.0.0.0' + ':' + service_port,
                                 data={'text': text})
        objectivity_scores.append(json.loads(response.text)['objectivity'])
    mean_score = sum(objectivity_scores) / len(objectivity_scores)
    if mean_score > .95:
        return 'objective'
    if mean_score < .6:
        return 'subjective'
    return 'error rate'
with con:
    cur = con.cursor()
    cur.execute("SELECT * FROM May2015;")
    # Label up to 100000 comments from the dump.
    for i in range(100000):
        row = cur.fetchone()
        if row is None:
            # Table exhausted before 100000 rows — previously this
            # crashed with TypeError on row[17].
            break
        # Column 17 holds the comment body.
        # NOTE(review): magic index — confirm against the May2015 schema.
        chunk = row[17]
        sentences = tokenizer.tokenize(chunk)
        for text in sentences:
            if text_in_dataset(text):
                print('Text already in dataset, skipping.')
                continue
            print(text)
            label = check_text(text)
            if label == 'not english':
                print('Text is mostly not English')
            elif label == 'error rate':
                print('Ensemble score uncertain between objective and subjective')
            else:
                # Confident label — persist it.
                append_result(text, label)
                print(label)