# troll_data.py
import re
import string
from collections import Counter

import nltk
from nltk.corpus import stopwords

# The stopword list must be fetched once before use: nltk.download("stopwords")
# Possible analyses:
# - timeline of tweets
# - timeline of account creation
# - most frequent words in the Russian troll tweets overall
# - most frequent words before the election
# - most frequent words after the election
def count_per_year(dates):
    """Count tweets per year; return the totals for years after 2012, ascending."""
    count = {}
    for date in dates:
        if date[0]:
            # Dates look like "Mon Oct 10 18:00:00 +0000 2016"; the year is the last token.
            separated = date[0].split()
            year = int(separated[-1])
            count[year] = count.get(year, 0) + 1
    year_count = []
    for year, total in sorted(count.items()):
        if year > 2012:
            year_count.append(total)
    return year_count
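
# Hypothetical usage sketch (rows arrive as 1-tuples of Twitter-style date
# strings; the rows below are invented, not taken from the real dataset):
#   count_per_year([("Mon Oct 10 18:00:00 +0000 2016",),
#                   ("Tue Nov 08 09:30:00 +0000 2016",),
#                   ("Sat Jan 07 12:00:00 +0000 2017",)])
#   -> [2, 1]   # one total per post-2012 year, in ascending year order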
def count_per_month(only_months):
    """Count tweets per month; return one list of monthly counts per year after 2015."""
    years = {}
    # Nested dict mapping year -> {month -> tweet count}.
    for date in only_months:
        year, month = int(date[0]), int(date[1])
        month_dict = years.setdefault(year, {})
        month_dict[month] = month_dict.get(month, 0) + 1
    # Flatten into a list of per-month counts for each qualifying year.
    month_counts = []
    for year, month_dict in sorted(years.items()):
        if year > 2015:
            counts = []
            for month, count in sorted(month_dict.items()):
                counts.append(count)
            month_counts.append(counts)
    return month_counts
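
# Hypothetical usage sketch (made-up (year, month) string pairs, the shape
# this function expects):
#   count_per_month([("2016", "10"), ("2016", "10"), ("2016", "11")])
#   -> [[2, 1]]   # one inner list per post-2015 year, months in ascending order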
def get_words(all_text):
    """Extract lowercased alphabetic words, dropping stopwords and URL fragments."""
    only_words = []
    regex = re.compile('[^a-zA-Z]')
    stop = set(stopwords.words("english") + ["rt", "via", "amp"])
    punct_table = str.maketrans('', '', string.punctuation)
    for item in all_text:
        if item[0]:
            no_punct = item[0].translate(punct_table)
            for word in no_punct.split():
                word = regex.sub('', word).lower()
                if len(word) > 1 and word not in stop and "http" not in word:
                    only_words.append(word)
    c = Counter(only_words)
    top_100 = normalize_count(c)
    return top_100
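
# Hypothetical usage sketch (invented tweet rows, each a 1-tuple of text):
#   get_words([("RT @user: Vote now! https://t.co/abc",), ("vote vote vote",)])
# "rt" is stopped out, the URL fragment is dropped by the "http" check, and
# the surviving words ("vote", "user", ...) are counted and scaled.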
def normalize_count(c):
    """Min-max scale the 100 most common counts into roughly [10, 50] (e.g. word-cloud sizes)."""
    # Pad the maximum count so even the top word maps to a modest size.
    key_max = max(c.keys(), key=lambda k: c[k])
    old_max = c[key_max] + 10000
    old_min = 10
    top_100 = []
    for word, count in c.most_common(100):
        old = (count - old_min) / (old_max - old_min)
        new = ((50 - 10) * old) + 10
        top_100.append([word, new])
    return top_100
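
# Minimal smoke test, assuming the module is run directly. The rows below are
# invented stand-ins for the database/CSV rows the real pipeline would pass in.
if __name__ == "__main__":
    tweets = [
        ("Mon Oct 10 18:00:00 +0000 2016",),
        ("Tue Nov 08 09:30:00 +0000 2016",),
        ("Sat Jan 07 12:00:00 +0000 2017",),
    ]
    print(count_per_year(tweets))                                     # e.g. [2, 1]
    print(count_per_month([("2016", "10"), ("2016", "11"), ("2016", "11")]))
    print(get_words([("RT @user: vote now!",), ("vote vote",)])[:5])  # [word, size] pairs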