twitter/collecting_data.py at master · thatsLegit/twitter · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import json
#import datetime
from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream
import os
import sys
#import time


#"another" data_analytics_2019 ok
# Twitter API credentials, passed to OAuth() in twitter_Search_several_tweets.
# They are empty strings in this file and must be filled in before searching.
# NOTE(review): presumably blanked out before publishing — keep real values out of version control.
consumer_key = ""
consumer_secret = ""
access_token = ""
access_secret = ""

# Every relative file name used by the functions below resolves against this directory.
# NOTE(review): hard-coded, machine-specific path; this runs at import time and
# os.chdir raises if the directory does not exist.
os.chdir('C:/Users/pitch/PycharmProjects/Twitter')


def twitter_Search_several_tweets(keyword, number_of_tweets):
    """Search Twitter for *keyword* and save the results, one JSON tweet per line.

    The output file is named ``Search_<number_of_tweets>_<keyword>_tweets.json``
    and is created in the current working directory. Results are requested in
    batches of up to 100; after each batch the next request is limited to
    tweets older than the last one received (``max_id = last id - 1``).

    Whole batches are written, so the file may contain more than
    ``number_of_tweets`` tweets. Downloading stops early when a batch comes
    back empty or when a request/write fails; in both cases the tweets
    written so far are kept.

    Args:
        keyword: Search query passed as ``q`` to the search endpoint.
        number_of_tweets: Minimum number of tweets to collect (integer).
    """
    oauth = OAuth(access_token, access_secret, consumer_key, consumer_secret)
    # create twitter API object
    twitter = Twitter(auth=oauth)
    max_id = -1
    tweet_downloaded = 0
    # Last raw API response. Stays None when no request succeeded, so the
    # final debug dump below cannot hit an unbound name.
    new_tweets = None
    with open('Search_%d_%s_tweets.json' % (number_of_tweets, keyword), 'w') as f:
        while tweet_downloaded < number_of_tweets:
            try:
                if max_id <= 0:
                    # First page: no upper bound on the tweet id.
                    new_tweets = twitter.search.tweets(q=keyword, count=100)
                else:
                    # Following pages: only tweets strictly older than the last one seen.
                    new_tweets = twitter.search.tweets(
                        q=keyword, count=100, max_id=str(max_id - 1))
                new_tweets_status = new_tweets["statuses"]
                if not new_tweets_status:
                    print("Not found")
                    break
                for tweet in new_tweets_status:
                    f.write(json.dumps(tweet))
                    f.write('\n')
                tweet_downloaded += len(new_tweets_status)
                print("Downloaded {0} tweets".format(tweet_downloaded))
                max_id = new_tweets_status[-1]['id']
            except Exception as err:
                # Best effort: keep what was downloaded, but report the cause.
                # Catching Exception (not a bare except) lets Ctrl-C and
                # SystemExit propagate instead of being reported as an error.
                print(" error occurs ", err)
                break
    if new_tweets is not None:
        # Debug dump of the last response received from the API.
        print(json.dumps(new_tweets, indent=4))
    print("There are total {0} tweets downloaded".format(tweet_downloaded))


def extracting_features_binary_class(file_in_tweets_json):
    """Turn a file of tweets into an ARFF file with five features and a binary class.

    The input must hold one JSON tweet per line; blank lines are skipped.
    The output is written to ``<file_in_tweets_json>_features.arff`` (the same
    name used by ``extracting_features_binary_class_7features``, so the two
    functions overwrite each other's output). Each data row is also printed.

    For a retweet (a tweet with a non-empty ``retweeted_status``) every value
    is taken from the original tweet instead of the retweet itself.

    Columns, in order: followers count, friends count, total statuses of the
    author, 1 if the first media entity is a photo, 1 if the tweet has at
    least one hashtag, and the class: 1 if ``retweet_count`` > 0 else 0.

    Args:
        file_in_tweets_json: Path of the line-delimited JSON input file. The
            text before its first ``.`` becomes the ARFF relation name.
    """
    header_lines = [
        '@relation ' + file_in_tweets_json.split('.')[0],
        '@attribute Number_of_followers numeric',
        '@attribute Number_of_friend numeric',
        '@attribute Total_of_tweets numeric',
        '@attribute Contain_Picture {0,1}',
        '@attribute Contain_hashtag {0,1}',
        '@attribute @@class@@ {0,1}',
        '@data',
    ]
    # "with" guarantees both files are closed even if a line fails to parse.
    with open(file_in_tweets_json, "r", encoding="utf-8") as f_in, \
            open(file_in_tweets_json + '_features.arff', "w", encoding="utf-8") as f_out:
        f_out.write('\n'.join(header_lines) + '\n')
        for line in f_in:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            tweet = json.loads(line)
            # If the tweet is a retweet, describe the original tweet instead.
            source = tweet.get('retweeted_status') or tweet
            user = source['user']
            entities = source['entities']
            media = entities.get('media')
            # Only the first media entity is inspected.
            contain_picture = 1 if media and media[0]['type'] == 'photo' else 0
            contain_hashtag = 1 if entities.get('hashtags') else 0
            retweet_class = 1 if source['retweet_count'] > 0 else 0
            row = "{0},{1},{2},{3},{4},{5}".format(
                user['followers_count'], user['friends_count'],
                user['statuses_count'], contain_picture, contain_hashtag,
                retweet_class)
            print(row)
            f_out.write(row + '\n')


def extracting_features_binary_class_7features(file_in_tweets_json):
    """Turn a file of tweets into an ARFF file with seven features and a binary class.

    The input must hold one JSON tweet per line; blank lines are skipped.
    The output is written to ``<file_in_tweets_json>_features.arff`` (the same
    name used by ``extracting_features_binary_class``, so the two functions
    overwrite each other's output). Each data row is also printed.

    For a retweet (a tweet with a non-empty ``retweeted_status``) the values
    are taken from the original tweet, except the language flag, which is
    always read from the top-level tweet.

    Columns, in order: followers count, friends count, total statuses of the
    author, 1 if the first media entity is a photo, 1 if the tweet has at
    least one hashtag, 1 if ``lang`` is ``"en"``, 1 if the author is verified,
    and the class: 1 if ``retweet_count`` > 0 else 0.

    Args:
        file_in_tweets_json: Path of the line-delimited JSON input file. The
            text before its first ``.`` becomes the ARFF relation name.
    """
    header_lines = [
        '@relation ' + file_in_tweets_json.split('.')[0],
        '@attribute Number_of_followers numeric',
        '@attribute Number_of_friend numeric',
        '@attribute Total_of_tweets numeric',
        '@attribute Contain_Picture {0,1}',
        '@attribute Contain_hashtag {0,1}',
        '@attribute langage {0,1}',
        '@attribute verified_account {0,1}',
        '@attribute @@class@@ {0,1}',
        '@data',
    ]
    # "with" guarantees both files are closed even if a line fails to parse.
    with open(file_in_tweets_json, "r", encoding="utf-8") as f_in, \
            open(file_in_tweets_json + '_features.arff', "w", encoding="utf-8") as f_out:
        f_out.write('\n'.join(header_lines) + '\n')
        for line in f_in:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            tweet = json.loads(line)
            # If the tweet is a retweet, describe the original tweet instead.
            source = tweet.get('retweeted_status') or tweet
            user = source['user']
            entities = source['entities']
            media = entities.get('media')
            # Only the first media entity is inspected.
            contain_picture = 1 if media and media[0]['type'] == 'photo' else 0
            contain_hashtag = 1 if entities.get('hashtags') else 0
            # NOTE(review): the language comes from the top-level tweet even
            # for retweets, unlike every other feature — kept as-is; confirm
            # whether the original tweet's language was intended.
            langage = 1 if tweet['lang'] == "en" else 0
            verified_account = 1 if user['verified'] else 0
            retweet_class = 1 if source['retweet_count'] > 0 else 0
            row = "{0},{1},{2},{3},{4},{5},{6},{7}".format(
                user['followers_count'], user['friends_count'],
                user['statuses_count'], contain_picture, contain_hashtag,
                langage, verified_account, retweet_class)
            print(row)
            f_out.write(row + '\n')


def extracting_id(file_in_tweets_json, file_out_id_text):
    """Write the ``id`` of every tweet in a line-delimited JSON file to a text file.

    Args:
        file_in_tweets_json: Path of the input file, one JSON tweet per line.
            Blank lines are skipped.
        file_out_id_text: Path of the output file; it is overwritten and
            receives one tweet id per line, in input order.

    Raises:
        KeyError: If a tweet has no ``id`` field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # "with" guarantees both files are closed even if a line fails to parse.
    with open(file_in_tweets_json, "r", encoding="utf-8") as f_in, \
            open(file_out_id_text, "w", encoding="utf-8") as f_out:
        for line in f_in:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            # Named tweet_id so the builtin id() is not shadowed.
            tweet_id = json.loads(line)['id']
            f_out.write("%s\n" % (tweet_id,))

 #-------

if __name__ == '__main__':
    # Runs only when this file is executed as a script, not when it is imported.
    # Other pipeline steps, currently disabled; uncomment the one to run:
    #   twitter_Search_several_tweets('Estonia', 2501)
    #   extracting_features_binary_class("Search_2501_estonia_tweets.json")
    #   extracting_features_binary_class_7features("Search_2501_estonia_tweets.json")
    extracting_id(
        file_in_tweets_json='Search_2501_estonia_tweets.json',
        file_out_id_text='Search_2501_estonia_tweets.txt',
    )