## Source: Data_Proc/CleanTweets.py at master (VT-ATD/Data_Proc, GitHub)
## -*- coding: utf-8 -*-
##  CleanTweets.py
##
##  Takes in a file of twitter data, outputs an analogous file of processed
##  twitter data, along with a subset of the original data.
##
##  Requires file: `blacklist.txt` in the current working directory
##                  (it is opened by relative path).
##                  Blacklist must have one word per line.
##
##  Use:
##      >> python CleanTweets.py target_json_file
##
##  `target_json_file` requires one json per line, with the minimum field
##       {"body":"This is an original tweet, #datascience"....}
##
##

__author__ = "Peter Hauck"
__email__ = "phauck@vt.edu"


import sys
import os
import sys
import json
import string
import re
import operator
import nltk
from nltk.corpus import stopwords
import numpy as np
from operator import itemgetter
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# Module-level dict; nothing in the visible part of this file reads or writes it.
token_dict = {}
# Shared Porter stemmer instance; tokenize() passes it to stem_tokens().
stemmer = PorterStemmer()
#need this
def stem_tokens(tokens, stemmer):
    """Return a new list holding ``stemmer.stem(token)`` for each token, in order.

    tokens  -- iterable of tokens to stem.
    stemmer -- any object exposing a ``stem(token)`` method.
    """
    return [stemmer.stem(token) for token in tokens]

#need this
def tokenize(text):
    """Split *text* with ``nltk.word_tokenize`` and return the stemmed tokens.

    Stemming is delegated to ``stem_tokens`` using the module-level ``stemmer``.
    """
    return stem_tokens(nltk.word_tokenize(text), stemmer)

def get_tokens(text_list):
    """Lower-case and word-tokenize every string of *text_list* in place.

    Each element is overwritten with the token list produced by
    ``nltk.word_tokenize`` for its lower-cased text. The same list object
    that was passed in is returned.
    """
    for position, raw_text in enumerate(text_list):
        # Punctuation is not stripped; the tokenizer output is stored as-is.
        text_list[position] = nltk.word_tokenize(raw_text.lower())
    return text_list
#need this
def preprocess(sentence,blacklist):
    """Return *sentence* lower-cased with stop words and blacklisted words removed.

    sentence  -- text to clean.
    blacklist -- iterable of words to drop in addition to NLTK's English
                 stop words. Tokens are compared after lower-casing, while
                 blacklist entries are used as given, so entries containing
                 upper-case letters never match.

    The sentence is split into runs of word characters (``\\w+``), which
    discards punctuation. The surviving tokens are joined with single spaces.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(sentence.lower())
    # A set gives constant-time membership tests; the previous concatenated
    # list was scanned linearly once per token.
    excluded = set(stopwords.words('english'))
    excluded.update(blacklist)
    return " ".join(w for w in tokens if w not in excluded)

def is_json(myjson):
    """Return True if *myjson* can be parsed by ``json.loads``, else False.

    Any valid JSON document counts, including bare scalars such as ``"1"``.
    Non-string input (for example ``None``) is reported as False rather
    than raising.
    """
    try:
        json.loads(myjson)
    # ``except X, e`` is Python 2-only syntax; this form parses on 2.6+ and 3.
    # TypeError covers arguments that are not text at all.
    except (ValueError, TypeError):
        return False
    return True

def is_ascii(s):
    """Return True if *s* consists solely of ASCII characters, else False.

    Works for both byte strings and text strings. Byte strings are decoded
    and text strings are encoded; calling ``.decode`` on text fails on
    Python 3 and, on Python 2, raises ``UnicodeEncodeError`` for non-ASCII
    ``unicode`` input, which the previous ``except UnicodeDecodeError``
    did not catch.
    """
    try:
        if isinstance(s, bytes):
            s.decode('ascii')
        else:
            s.encode('ascii')
    # UnicodeError is the common base of the encode and decode errors.
    except UnicodeError:
        return False
    return True


def main():
    """Clean the tweet file named in ``sys.argv[1]`` into ``<input>_prcsd``.

    Reads ``blacklist.txt`` (one word per line) from the current working
    directory, then processes the input one line at a time:

    * lines that are not valid JSON, are not JSON objects, or have no
      ``body`` field are skipped;
    * ``body`` is run through ``preprocess``; if the result is pure ASCII a
      reduced record with ``prcsd_body``, ``body`` and (when present in the
      input) ``gnip`` is written as one JSON line to the output file;
    * otherwise the original tweet text is printed and nothing is written.

    Exits with a message if the command-line argument is missing or the
    blacklist cannot be read.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: python CleanTweets.py target_json_file")

    # The original ``try:`` had no ``except`` clause, which is a SyntaxError.
    # The file header documents the blacklist as required, so fail clearly.
    try:
        with open('blacklist.txt') as blacklist_file:
            blacklist = blacklist_file.read().splitlines()
    except IOError as err:
        sys.exit("Could not read blacklist.txt: %s" % err)

    fname_test_tweets = sys.argv[1]
    fname_out = fname_test_tweets + "_prcsd"

    # Both files are closed by the ``with`` statement; no explicit close().
    with open(fname_test_tweets) as tweets_in, open(fname_out, 'w') as tweets_out:
        for line in tweets_in:
            # Parse once instead of validating and then parsing again.
            try:
                j_content = json.loads(line)
            except ValueError:
                continue
            if not isinstance(j_content, dict) or 'body' not in j_content:
                continue

            tweet_message = j_content['body']
            preprcssd_tweet = preprocess(tweet_message, blacklist)

            if not is_ascii(preprcssd_tweet):
                # Single-argument print() behaves the same on Python 2 and 3.
                print('Non-English Tweet Passed though Filtration %s'
                      % (tweet_message,))
                continue

            less_content = {}
            less_content['prcsd_body'] = preprcssd_tweet
            less_content['body'] = tweet_message
            # Only ``body`` is documented as mandatory, so copy ``gnip``
            # only when the record has it instead of raising KeyError.
            if 'gnip' in j_content:
                less_content['gnip'] = j_content['gnip']
            tweets_out.write(json.dumps(less_content) + "\n")


# Run the cleaning pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()