-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathinterestDatasetGenerator.py
More file actions
176 lines (150 loc) · 5.53 KB
/
interestDatasetGenerator.py
File metadata and controls
176 lines (150 loc) · 5.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 11 12:19:25 2015
@author: Waffleboy
"""
import csv
import random
import re
import time

import tweepy
#from accesstokenTwitter import *
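#Uncomment the import above if you have a config file (accesstokenTwitter.py)
#that provides accesstokenlist, a list of access token sets.
#Otherwise, build accesstokenlist manually below, one entry per key set, in the order
# Client ID / Client Secret / Access Token / Access Secret
#E.g.,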
#accesstokenlist=[]
#accesstokenlist.append(['clientid','clientsecret','accesstoken','accesssecret'])
def dicOfAccounts():
    """Return the category -> Twitter screen names mapping to harvest.

    Each key is an interest label and each value is the list of accounts
    whose tweets are collected under that label. The entries below are
    examples; edit the literal to add categories or accounts. A new dict
    is built on every call.
    """
    return {
        'News': ['cnn', 'bbc', 'nytimes'],
        'Politics': ['barackobama'],
    }
##Run one time only.
def makeCSV():
    """Create twitterInterests.csv containing only the header row.

    The file is opened in write mode, so any existing twitterInterests.csv
    in the working directory is truncated.
    """
    # newline='' is what the csv module expects for files it writes to;
    # without it every row gets an extra blank line on Windows.
    with open('twitterInterests.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Twitter Account", "text", "Interest"])
def verifyTwitterAccounts(dic):
    """Check that every account listed in *dic* can be looked up via the API.

    Args:
        dic: mapping of category name to a list of Twitter screen names.

    Returns:
        The string "No errors" when every lookup succeeded. Otherwise the
        failing screen names are printed and None is returned.

    Uses the first entry of the module-level ``accesstokenlist`` for
    authentication.
    """
    print('Beginning verification of twitter accounts')
    currentKey = accesstokenlist[0]
    auth = tweepy.auth.OAuthHandler(currentKey[0], currentKey[1])
    auth.set_access_token(currentKey[2], currentKey[3])
    api = tweepy.API(auth)

    errorlist = []
    for accounts in dic.values():
        for account in accounts:
            # One try per account, so a single bad name does not hide the
            # accounts that follow it in the same category.
            try:
                # Keyword form matches the user_timeline calls in this file.
                api.get_user(screen_name=account)
            except Exception:
                # Broad on purpose: the tweepy error class differs between
                # library versions. Unlike a bare except, Ctrl-C still works.
                errorlist.append(account)

    if errorlist:
        print('Error: The following users are private/do not exist :' + str(errorlist))
        return None
    return "No errors"
"""
PreCond: Takes in a dic of category:[twitternames]
Description: Writes a CSV of each account's latest 3.2k tweets in utf-8 format.
"""
def extractTweets(dic):
    """Download tweets for every account in *dic* and build the dataset.

    PreCond: ``dic`` maps a category name to a list of Twitter screen names,
    the module-level ``accesstokenlist`` holds at least one
    ``[client id, client secret, access token, access secret]`` entry, and
    twitterInterests.csv already starts with its header row (see makeCSV).

    Description: pages backwards through each account's timeline, 200 tweets
    per request, until the API returns an empty page. One
    ``[account, text, category]`` row per tweet is appended to
    twitterInterests.csv as UTF-8 text. When the remaining request quota of
    the current key runs out, the next key in ``accesstokenlist`` is tried;
    if that one is exhausted too, the function sleeps until the rate window
    should have reset. Afterwards the collected rows are re-read, links are
    removed from the text column, the rows are shuffled (header kept first)
    and the result is written to twitterInterestsProcessed.csv.

    An account whose download fails is reported and skipped; the remaining
    accounts are still processed.
    """
    raw_csv = 'twitterInterests.csv'
    processed_csv = 'twitterInterestsProcessed.csv'
    # Sleep target carried over from the original code; presumably the
    # 15-minute rate-limit window plus a few seconds of slack.
    window_seconds = 905
    link_pattern = re.compile(r'https?://\S+', re.IGNORECASE)

    def build_api(key_id):
        """Return a tweepy API client authenticated with key set *key_id*."""
        credentials = accesstokenlist[key_id]
        auth = tweepy.auth.OAuthHandler(credentials[0], credentials[1])
        auth.set_access_token(credentials[2], credentials[3])
        return tweepy.API(auth)

    currKeyID = 0
    api = build_api(currKeyID)
    rateID = 0  # requests left on the timeline endpoint for the current key
    timeStart = time.time()

    def change_key():
        """Rotate to the next key set, wrapping around at the end."""
        nonlocal currKeyID, api
        currKeyID = (currKeyID + 1) % len(accesstokenlist)
        api = build_api(currKeyID)

    def update_api_rate():
        """Refresh rateID from the API's own rate-limit report."""
        nonlocal rateID
        status = api.rate_limit_status()
        rateID = status['resources']['statuses']['/statuses/user_timeline']['remaining']

    def check_rate():
        """Make sure a request can be sent: rotate keys, then sleep if needed."""
        nonlocal timeStart
        if rateID > 1:
            return
        change_key()
        update_api_rate()
        if rateID > 1:
            return
        print('RateID Exhausted, sleeping for rate reset. Key: ' + str(currKeyID))
        elapsed = time.time() - timeStart
        # Clamp at zero: time.sleep() raises ValueError on a negative value,
        # which happened whenever more than window_seconds had already passed.
        time.sleep(max(0.0, window_seconds - elapsed))
        timeStart = time.time()
        update_api_rate()

    def fetch_timeline(screen_name):
        """Return all tweets the API serves for *screen_name*, newest first."""
        nonlocal rateID
        tweets = []
        batch = api.user_timeline(screen_name=screen_name, count=200)
        update_api_rate()
        # An empty first page (account without tweets) simply skips the loop.
        while batch:
            tweets.extend(batch)
            # Ask only for tweets strictly older than the oldest one seen.
            oldest = batch[-1].id - 1
            check_rate()
            rateID -= 1
            batch = api.user_timeline(screen_name=screen_name, count=200,
                                      max_id=oldest)
        return tweets

    for key, value in dic.items():
        print('Currently processing topic: ' + str(key))
        for account in value:
            print('Currently processing user :' + str(account))
            try:
                tweets = fetch_timeline(account)
            except Exception as e:
                # Broad on purpose: the tweepy error class differs between
                # library versions. Only this account is skipped.
                print(e, 'Error occurred while processing ' + str(key) + ' / '
                      + str(account) + '. Skipping!')
                print('Currently using key: ' + str(currKeyID))
                continue
            # Store the text itself. Encoding it to bytes first made the csv
            # module write the b'...' repr with escaped non-ASCII characters.
            outtweets = [[account, tweet.text, str(key)] for tweet in tweets]
            # Append to the same file that is read back below.
            with open(raw_csv, 'a', newline='', encoding='utf-8') as f:
                csv.writer(f).writerows(outtweets)

    print('Done with extraction, now removing links from CSV.')
    with open(raw_csv, 'r', newline='', encoding='utf-8') as f:
        # Drop blank rows (a header written without newline='' on Windows
        # leaves one) instead of blindly discarding the second row.
        rows = [row for row in csv.reader(f) if row]
    if not rows:
        print(raw_csv + ' is empty, nothing to process.')
        return

    header, records = rows[0], rows[1:]
    for record in records:
        if len(record) > 1:
            # Remove each whole link, whatever its length.
            record[1] = link_pattern.sub('', record[1])

    # Shuffle the CSV; a single shuffle already yields a uniform permutation.
    print('Shuffling CSV')
    random.shuffle(records)
    print('Shuffling completed! Writing to CSV.')
    with open(processed_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(records)
    print('twitterInterestsProcessed.csv is now ready for use')
if __name__ == '__main__':
    # Start from a fresh twitterInterests.csv that holds only the header row.
    makeCSV()
    # Harvest tweets for every configured account and build the dataset.
    extractTweets(dicOfAccounts())