-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_reddit_db.py
More file actions
executable file
·101 lines (87 loc) · 2.69 KB
/
parse_reddit_db.py
File metadata and controls
executable file
·101 lines (87 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import sqlite3
import requests
import enchant
import nltk.data
# US-English spell-check dictionary; is_english() uses it to score what
# fraction of a sentence's words are recognizably English.
english_enchant = enchant.Dict("en_US")
# Pre-trained Punkt model for splitting a comment body into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Reddit comment dump (May2015 table) stored on an external drive —
# NOTE(review): machine-specific path; adjust for your environment.
con = sqlite3.connect('/media/glen/Seagate Backup Plus Drive/redditcomments.sqlite')
def append_result(text, label):
    """
    Append a text/label pair to the objectivity.json dataset file.

    Parameters:
        text (str): the sentence being recorded.
        label (str): its classification label.

    The file holds one JSON list of {'text': ..., 'label': ...} dicts.
    If the file does not exist yet (first run), a fresh list is started
    instead of crashing with FileNotFoundError.
    """
    entry = {'text': text, 'label': label}
    try:
        with open('objectivity.json') as f:
            data = json.load(f)
    except FileNotFoundError:
        # First run: no dataset file yet — start a new list.
        data = []
    data.append(entry)
    with open('objectivity.json', 'w') as f:
        json.dump(data, f)
def text_in_dataset(text):
    """
    Return True if *text* is already recorded in objectivity.json.

    Returns False when the text is absent, and also when the dataset
    file does not exist yet (previously this raised FileNotFoundError).
    """
    try:
        with open('objectivity.json') as f:
            data = json.load(f)
    except FileNotFoundError:
        # No dataset yet, so nothing can be a duplicate.
        return False
    return any(entry['text'] == text for entry in data)
def is_english(text):
    """
    Return True if more than 70% of the distinct whitespace-separated
    words in *text* pass the en_US enchant spell check.

    Fixes over the original:
    - an empty or whitespace-only string now returns False instead of
      raising ZeroDivisionError;
    - str.split() (no argument) splits on all whitespace, so words
      separated by tabs/newlines are handled, not just single spaces;
    - the ratio uses explicit float division so the >.7 comparison is
      meaningful even under a Python 2 interpreter.
    """
    words = set(text.split())
    if not words:
        # No words at all — cannot be "mostly English".
        return False
    hits = sum(1 for word in words if english_enchant.check(word))
    return hits / float(len(words)) > .7
def check_text(text):
    """
    Score *text* with the local ensemble of objectivity services and
    map the mean score to a label.

    Returns:
        'not english'  - the text fails the is_english() check.
        'objective'    - mean ensemble score above .95.
        'subjective'   - mean ensemble score below .6.
        'error rate'   - score in between; the ensemble is uncertain.
    """
    if not is_english(text):
        return 'not english'
    service_ports = ['5000', '5001']  # local api ports for objectivity services
    objectivity_scores = []
    for service_port in service_ports:
        response = requests.post('http://0.0.0.0' + ':' + service_port,
                                 data={'text': text})
        objectivity_scores.append(json.loads(response.text)['objectivity'])
    mean_score = sum(objectivity_scores) / len(objectivity_scores)
    if mean_score > .95:
        return 'objective'
    if mean_score < .6:
        return 'subjective'
    return 'error rate'
with con:
    cur = con.cursor()
    cur.execute("SELECT * FROM May2015;")
    # Label up to 100000 comments from the dump.
    for i in range(100000):
        row = cur.fetchone()
        if row is None:
            # Table exhausted before 100000 rows — previously this
            # crashed with TypeError on row[17].
            break
        # Column 17 holds the comment body.
        # NOTE(review): magic index — confirm against the May2015 schema.
        chunk = row[17]
        sentences = tokenizer.tokenize(chunk)
        for text in sentences:
            if text_in_dataset(text):
                print('Text already in dataset, skipping.')
                continue
            print(text)
            label = check_text(text)
            if label == 'not english':
                print('Text is mostly not English')
            elif label == 'error rate':
                print('Ensemble score uncertain between objective and subjective')
            else:
                # Confident label — persist it.
                append_result(text, label)
                print(label)