-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGetStatuses.py
More file actions
129 lines (101 loc) · 4.05 KB
/
GetStatuses.py
File metadata and controls
129 lines (101 loc) · 4.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import tweepy
import json
import glob
import time
import csv
import sys
import os
# Python 2 hack: re-expose setdefaultencoding so utf-8 text survives csv writes.
reload(sys)
sys.setdefaultencoding('utf-8')

TWEETS_DIR = 'tweets'
FILE_NAME = 'statusess.csv'  # NOTE(review): probable typo for 'statuses.csv'; kept for compatibility
LIMIT = 2000  # max tweets to collect per user

# Twitter API credentials -- read from the environment, never hard-coded.
CONSUMER_KEY = os.getenv('CONSUMER_KEY')
CONSUMER_SECRET = os.getenv('CONSUMER_SECRET')
ACCESS_TOKEN = os.getenv('ACCESS_TOKEN_KEY')
ACCESS_TOKEN_SECRET = os.getenv('ACCESS_TOKEN_SECRET')

# authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)

users = []  # screen names, populated by get_user_names()

if not os.path.exists(TWEETS_DIR):
    os.makedirs(TWEETS_DIR)

# Write the CSV header only when the file is new or empty. The previous code
# appended the header on every run ('a+' mode), so reruns interleaved
# duplicate header rows into the middle of the data.
_csv_path = TWEETS_DIR + '/' + FILE_NAME
if not os.path.exists(_csv_path) or os.path.getsize(_csv_path) == 0:
    with open(_csv_path, 'a+') as f:
        writer = csv.writer(f)
        writer.writerow(
            ["id", "tweet_id", "created_at", "text", "favorite_count", "retweet_count", "phone", "sensitive", "hashtags",
             "no_hashtags", "mentions", "no_mentions", "no_urls", "no_media"])
def get_user_names():
    """Load each cached profile under twitter-users/*.json and append its
    screen_name to the module-level `users` list.

    Fix: the original used the Py2-only `file()` builtin and never closed
    the handle; a `with open(...)` block closes it deterministically.
    """
    for path in glob.glob('twitter-users/*.json'):
        with open(path) as fh:
            data = json.load(fh)
        users.append(data['screen_name'])
def get_all_tweets(screen_name):
    """Collect up to LIMIT recent tweets for `screen_name` into the
    module-level `tweet_list`, paging backwards with max_id.

    Twitter only allows access to a user's most recent ~3200 tweets via
    this method. Errors from the API are printed and abort this user.

    Fixes vs. the original:
    - the first batch of 20 tweets was stored in a throwaway local list
      (`recent`) and never reached `tweet_list`, silently dropping every
      user's most recent tweets; it is now appended directly.
    - the pagination stop-check inspected the cumulative `tweet_list`
      (never empty after the first batch, so the break was unreachable);
      it now inspects the page just fetched.
    - `user_count` now includes the initial batch when enforcing LIMIT.
    """
    # initial request for the most recent tweets
    try:
        new_tweets = api.user_timeline(screen_name=screen_name, count=20, tweet_mode='extended')
    except tweepy.TweepError as error:
        print(str(error))
        return
    if len(new_tweets) == 0:
        return

    # BUGFIX: keep the initial batch (the original discarded it)
    tweet_list.extend(new_tweets)
    user_count = len(new_tweets)
    # save the id of the oldest tweet less one
    oldest = new_tweets[-1].id - 1

    # keep grabbing tweets until there are none left or LIMIT is reached
    while len(new_tweets) > 0 and user_count < LIMIT:
        try:
            # subsequent requests use max_id to prevent duplicates
            new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest, tweet_mode='extended')
            user_count += len(new_tweets)
            tweet_list.extend(new_tweets)
            # BUGFIX: check the freshly fetched page, not the cumulative list
            if new_tweets:
                oldest = new_tweets[-1].id - 1
            else:
                break
        except tweepy.TweepError as error:
            print(str(error))
            return
def write_file():
    """Append the tweets currently in the module-level `tweet_list` to the
    CSV, one row per original (non-retweet) tweet.

    Rows are kept only for tweets that carry a `possibly_sensitive`
    attribute and are not retweets (neither flagged nor 'RT @' text).

    Fix: the original called len() directly on entities.get('hashtags')
    and entities.get('user_mentions'), which raises TypeError when the
    key is missing/None, while urls/media were guarded — now all four
    entity lists are normalized to [] first.
    """
    with open(TWEETS_DIR + '/' + FILE_NAME, 'a+') as f:
        writer = csv.writer(f)
        rows = []
        for tweet in tweet_list:
            # skip retweets and tweets lacking the possibly_sensitive field
            if not hasattr(tweet, 'possibly_sensitive'):
                continue
            if tweet.retweeted or 'RT @' in tweet.full_text:
                continue
            hashtags = tweet.entities.get('hashtags') or []
            mentions = tweet.entities.get('user_mentions') or []
            urls = tweet.entities.get('urls') or []
            media = tweet.entities.get('media') or []
            rows.append([
                tweet.author.id,
                tweet.id,
                tweet.created_at,
                tweet.full_text.encode("utf-8").replace('\n', ' '),
                tweet.favorite_count,
                tweet.retweet_count,
                tweet.source,
                tweet.possibly_sensitive,
                ';'.join(ht.get('text').encode('utf-8') for ht in hashtags) if hashtags else "-",
                len(hashtags),
                [um.get('id') for um in mentions],
                len(mentions),
                len(urls),
                len(media),
            ])
        writer.writerows(rows)
if __name__ == '__main__':
    # Entry point: load the cached user profiles, then harvest and persist
    # each user's timeline one user at a time.
    print("Loading user information")
    get_user_names()
    total_users = len(users)
    tweet_count = 0
    for idx, screen_name in enumerate(users, start=1):
        print("Collecting tweets for user %s of %s: %s" % (idx, total_users, screen_name))
        # fresh global buffer consumed by get_all_tweets() and write_file()
        tweet_list = []
        get_all_tweets(screen_name)
        tweet_count += len(tweet_list)
        print("%s tweets collected so far" % tweet_count)
        write_file()