Supervised_Text_Classfication/Twitter API Tweets extraction R Code.R at master · Rajiv2806/Supervised_Text_Classfication · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
##############################
# R has a dedicated package twitteR
# For additional functions, refer to twitteR.pdf (the twitteR package manual)
# Sample code for getting twitter data -
#

# Install each dependency when it is not yet available, then attach it.
# requireNamespace() only tests for availability (it returns TRUE/FALSE and
# attaches nothing); library() does the attaching and stops the script with
# an error if the package still cannot be loaded after the install attempt.
for (pkg in c("twitteR", "base64enc")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

#############################################
# Authentication
#############################################
# options(httr_oauth_cache=T)

# Twitter API credentials.
# Each value is taken from an environment variable when it is set, so secrets
# do not have to be stored in this file. When the variable is unset, the
# literal given as `unset` is used instead (edit it in place to hard-code).

api_key <- Sys.getenv("TWITTER_API_KEY", unset = "")   # Consumer key: *

api_secret <- Sys.getenv("TWITTER_API_SECRET", unset = "")   # Consumer secret: *

access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN", unset = "182265101-")  # Access token:

access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET", unset = "") # Access token secret:

# Stop with a readable message when any credential is still blank, instead of
# handing empty strings to the authentication call.
credential_names <- c("api_key", "api_secret", "access_token",
                      "access_token_secret")
credential_values <- c(api_key, api_secret, access_token, access_token_secret)
if (any(!nzchar(credential_values))) {
  stop(
    "Missing Twitter credential(s): ",
    paste(credential_names[!nzchar(credential_values)], collapse = ", "),
    call. = FALSE
  )
}

# After running the next command, type 1 to select Yes at the prompt

setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)

#[1] "Using direct authentication"
#Use a local file to cache OAuth access credentials between R sessions?
#1: Yes
#2: No
#Selection: 1

#############################################
# Extract Tweets
#############################################

# Clean a character vector of raw tweet text.
#
# text: character vector, one element per tweet.
# Returns a character vector of the same length in which retweet/via headers,
# @user mentions, punctuation, digits, links, line breaks and non-ASCII
# characters have been removed, runs of blanks collapsed to a single space,
# and leading/trailing white space trimmed.
clean_tweet_text <- function(text) {
  text <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", text)  # "RT @user" / "via @user"
  text <- gsub("@\\w+", "", text)          # remaining @user mentions
  text <- gsub("[[:punct:]]", "", text)    # punctuation marks
  text <- gsub("[[:digit:]]", "", text)    # numbers
  # Links are matched only now: with punctuation and digits already gone, a
  # URL has collapsed into a single word that starts with "http".
  text <- gsub("http\\w+", "", text)
  text <- gsub("\n", " ", text)            # line breaks become blanks
  text <- gsub("[^[:alnum:]///' ]", " ", text)          # keep only alpha numeric
  text <- iconv(text, "latin1", "ASCII", sub = "")      # keep only ASCII characters
  # Collapse blanks after the two steps above, because both of them can leave
  # several spaces next to each other.
  text <- gsub("[ \t]{2,}", " ", text)
  gsub("^\\s+|\\s+$", "", text)            # trim leading and trailing white space
}

hashtags <- c("#GST")

# For every hashtag: search, clean the tweet text, and write <hashtag>.csv
# (file name is the hashtag without "#") to the working directory.
for (hashtag in hashtags) {
  # hash tag for tweets search and number of tweets
  tweets <- searchTwitter(hashtag, n = 1000)

  if (length(tweets) == 0) {
    # An empty result has nothing to convert or clean; skip it so the
    # remaining hashtags are still processed.
    # NOTE(review): twListToDF() is expected to fail on an empty list - confirm.
    warning("No tweets returned for ", hashtag, "; skipping", call. = FALSE)
    next
  }

  tweets <- twListToDF(tweets)    # Convert from list to dataframe

  # The first column is cleaned in place.
  # NOTE(review): assumes column 1 of the twListToDF() output is the tweet
  # text - confirm.
  tweets.df <- clean_tweet_text(tweets[, 1])
  tweets[, 1] <- tweets.df        # save in Data frame

  write.csv(tweets, paste0(gsub("#", "", hashtag), ".csv"))
}