-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
110 lines (100 loc) · 3.49 KB
/
scraper.py
File metadata and controls
110 lines (100 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import re
import pandas as pd
import time
from selenium.webdriver.chrome.options import Options
import sys
def TopHashtag(city, country):
    """Return the trend-link URLs from the newest trend card on trends24.in.

    Parameters
    ----------
    city, country : str
        Path components of the trends24.in URL, e.g. country='india',
        city='delhi' -> https://trends24.in/india/delhi/.

    Returns
    -------
    list[str]
        The href of every <a> inside the last 'trend-card__list' <ol> on
        the page (trends24 renders one card per time bucket; the selection
        of ol[-1] follows the original code — presumably the oldest card,
        TODO confirm which card is wanted).
    """
    url = 'https://trends24.in/' + country + '/' + city + '/'
    # timeout keeps a dead host from hanging the scraper forever
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')
    ol = soup.find_all('ol', {'class': "trend-card__list"})
    # FIX: read hrefs through the parser instead of regex-matching the
    # serialized HTML (the old '(?<=<a href=")' lookbehind broke on
    # attribute ordering/escaping differences in the markup).
    top_tag = [a['href'] for a in ol[-1].find_all('a', href=True)]
    return top_tag
def HeadlessMode():
    """Launch and return a headless Chrome WebDriver instance.

    Returns
    -------
    selenium.webdriver.Chrome
        A browser with no visible window, suitable for server-side scraping.
    """
    options = Options()
    options.add_argument('--headless')
    # historically required for headless Chrome on some platforms
    options.add_argument('--disable-gpu')
    # FIX: the `chrome_options` keyword was deprecated in Selenium 3 and
    # removed in Selenium 4; `options` is the supported parameter name.
    browser = webdriver.Chrome(options=options)
    return browser
def ScrollPage(browser):
    """Scroll the page to the very bottom, triggering any lazy loading.

    Repeatedly jumps to the bottom of the document and waits briefly;
    stops once the document height no longer grows (i.e. nothing new
    was loaded by the last scroll).

    Parameters
    ----------
    browser : selenium WebDriver
        An already-navigated driver instance.

    Returns
    -------
    The same driver, now positioned at the bottom of a fully-loaded page.
    """
    pause_seconds = 1
    measure_height = "return document.body.scrollHeight"
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"

    previous_height = browser.execute_script(measure_height)
    while True:
        browser.execute_script(jump_to_bottom)
        # give lazy-loaded content a moment to arrive
        time.sleep(pause_seconds)
        current_height = browser.execute_script(measure_height)
        if current_height == previous_height:
            break  # page stopped growing — we are at the real bottom
        previous_height = current_height
    return browser
def OpenTwitter(tag, browser=None, city=None):
    """Navigate the browser to a Twitter search for `tag` near `city`.

    Appends a URL-encoded ' near:"<city>" within:50mi' filter to the tag's
    search URL and loads it.

    Parameters
    ----------
    tag : str
        Base Twitter search URL for the hashtag (as returned by TopHashtag).
    browser : selenium WebDriver, optional
        Driver to navigate. FIX: the original read an undefined free
        variable `browser`; when None we fall back to that module-level
        global for backward compatibility.
    city : str, optional
        City name for the proximity filter; same global fallback as above.

    Returns
    -------
    The driver, now on the search-results page.
    """
    if browser is None:
        # backward-compatible fallback to the implicit global the
        # original code depended on
        browser = globals()["browser"]
    if city is None:
        city = globals()["city"]
    browser.get(tag + '%20near%3A"' + city + '"%20within%3A50mi&src=typd')
    return browser
def ScrapeTwitterUrl(browser, url):
    """Parse the rendered Twitter results page out of the driver.

    Parameters
    ----------
    browser : selenium WebDriver
        Driver currently showing a Twitter search-results page.
    url : str
        Unused; kept for interface compatibility with existing callers.

    Returns
    -------
    tuple of three bs4 result sets:
        tweet-text <p> elements,
        action-count <span> elements (comments/retweets/likes),
        timestamp <a> elements (carry href and title attributes).
    """
    dom = BeautifulSoup(browser.page_source, 'html.parser')
    selectors = (
        ('p', "TweetTextSize js-tweet-text tweet-text"),
        ('span', "ProfileTweet-actionCountForPresentation"),
        ('a', "tweet-timestamp js-permalink js-nav js-tooltip"),
    )
    texts, counts, stamps = (
        dom.find_all(element, attrs={'class': css_class})
        for element, css_class in selectors
    )
    return texts, counts, stamps
def DeleteDuplicateComments(comments):
    """Drop duplicated action-count nodes and convert the rest to numbers.

    Twitter renders five count <span>s per tweet; the ones at indices
    i % 5 == 2 and i % 5 == 4 duplicate other counts, so they are removed.
    Remaining counts are parsed: '' -> 0, '1.5K' -> 1500.0, '2M' ->
    2000000.0, otherwise float(text).

    NOTE: mutates `comments` in place (the duplicates are popped out).

    Parameters
    ----------
    comments : list
        Elements with a `.text` attribute (bs4 spans or equivalent).

    Returns
    -------
    list of numbers in original order, duplicates removed.
    """
    # Walk indices from the end so pops don't shift the ones still to visit.
    # FIX: the original started at len(comments) — an out-of-range index —
    # so comments.pop(i) raised IndexError whenever len(comments) % 5 was
    # 2 or 4. Start at len(comments) - 1 instead.
    for i in range(len(comments) - 1, -1, -1):
        if i % 5 == 2 or i % 5 == 4:
            comments.pop(i)

    response = []
    for node in comments:
        text = node.text
        if text == "":
            response += [0]
        elif 'K' in text:
            response += [float(re.findall('[0-9.]+', text)[0]) * 1000]
        elif 'M' in text:
            response += [float(re.findall('[0-9.]+', text)[0]) * 1000000]
        else:
            response += [float(text)]
    return response
def GetUserInfo(timestamp, tags=None):
    """Extract user ids, tweet ids, timestamps, and tweet texts.

    Parameters
    ----------
    timestamp : list
        Timestamp <a> elements; each supports item access for "href"
        (of the form /<user>/status/<tweet_id>) and "title" (display time).
    tags : list, optional
        Tweet-text elements with a `.text` attribute. FIX: the original
        read an undefined free variable `tags`; when None we fall back to
        that module-level global for backward compatibility.

    Returns
    -------
    (userid, tweetid, time, tweet) : four parallel lists of str.
    """
    if tags is None:
        tags = globals()["tags"]  # backward-compatible implicit-global lookup
    userid = []
    tweetid = []
    times = []   # renamed from `time` so the stdlib time module isn't shadowed
    tweets = []
    for stamp in timestamp:
        # href looks like /<user>/status/<tweet_id>
        parts = stamp["href"].split('/')
        userid += [parts[1]]
        tweetid += [parts[3]]
        times += [stamp["title"]]
    for tag in tags:
        tweets += [tag.text]
    return userid, tweetid, times, tweets
def SeperateComments(response):
    """Split the flat count stream into comment, like, and retweet lists.

    The cleaned counts arrive interleaved per tweet as
    [comment, retweet, like, comment, retweet, like, ...]; this routes
    every value to its bucket by position.

    Parameters
    ----------
    response : list of numbers (output of DeleteDuplicateComments).

    Returns
    -------
    (comment, like, retweet) : three parallel lists of numbers.
    """
    comment, retweet, like = [], [], []
    # index % 3 == 0 -> comment, == 1 -> retweet, == 2 -> like
    buckets = (comment, retweet, like)
    for index, value in enumerate(response):
        buckets[index % 3].append(value)
    return comment, like, retweet
def CreateDataFrame(tweet, userid, comment, like, retweet, time, city, tweetid=None):
    """Assemble the scraped fields into a pandas DataFrame with a score.

    The engagement score weights retweets highest:
    score = 3*retweet + 2*comment + 1*like.

    Parameters
    ----------
    tweet, userid, comment, like, retweet, time : parallel lists,
        one entry per scraped tweet.
    city : str
        Broadcast into a constant 'city' column.
    tweetid : list, optional
        Tweet ids. FIX: the original read an undefined free variable
        `tweetid` that was never a parameter; when None we fall back to
        that module-level global for backward compatibility.

    Returns
    -------
    pandas.DataFrame with columns
        tweet, tweetid, userid, comment, like, retweet, Time, score, city.
    """
    if tweetid is None:
        tweetid = globals()["tweetid"]  # backward-compatible implicit-global lookup
    df = pd.DataFrame(tweet)
    df.columns = ["tweet"]
    df['tweetid'] = tweetid
    df['userid'] = userid
    df['comment'] = comment
    df['like'] = like
    df['retweet'] = retweet
    df['Time'] = time
    df["score"] = df.retweet * 3 + df.comment * 2 + df.like
    df['city'] = city
    return df