-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathscraper.py
More file actions
153 lines (118 loc) · 4.36 KB
/
scraper.py
File metadata and controls
153 lines (118 loc) · 4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from bs4 import BeautifulSoup
import json
import sys
import urllib
import urllib2
# Max number of pages to click through when scraping reviews.
# Used as the default for GooglePlayReviewScraper's maxNumberOfPages argument.
MAX_PAGES_TO_SCRAPE = 5
# Google Play Reviews API base URL.
# Review pages are requested by POSTing form-encoded parameters to it.
# Give it a try:
# curl --data \
# "reviewType=0&pageNum=15&id=com.inxile.BardTale&reviewSortOrder=2&xhr=1" \
# https://play.google.com/store/getreviews
GOOGLE_PLAY_REVIEWS_URL = "https://play.google.com/store/getreviews"
class GooglePlayScrapedReview:
    '''
    A single review scraped from Google Play.

    Holds the review title and the review text (both stripped of
    surrounding whitespace) plus the delimiter placed between them when
    the review is rendered as a string.
    '''

    def __init__(self, title, reviewText, delimiter='\n'):
        '''
        title      -- review title; surrounding whitespace is removed.
        reviewText -- review body; surrounding whitespace is removed.
        delimiter  -- separator written between title and text by __str__.
        '''
        self.delimiter = delimiter
        self.reviewText = reviewText.strip()
        self.title = title.strip()

    def __str__(self):
        '''
        Return "<title><delimiter><reviewText>", UTF-8 encoded and stripped.
        '''
        pieces = (self.title, self.delimiter, self.reviewText)
        rendered = "%s%s%s" % pieces
        encoded = rendered.encode('utf-8')
        return encoded.strip()
class GooglePlayReviewScraperException(Exception):
    '''
    Raised when a Google Play reviews response cannot be parsed.
    '''
class GooglePlayReviewScraper:
    '''
    Essentially, fires off a number of POST requests to Google Play
    to simulate client requests.

    These requests (as of Sept 8, 2013) are in the format:
        reviewType:0                [???]
        pageNum:11                  [pagination]
        id:com.inxile.BardTale      [package name]
        reviewSortOrder:2           [???]
        xhr:1                       [???]
    [???] means I don't know what the param is for.
    '''

    def __init__(self, packageName, maxNumberOfPages=MAX_PAGES_TO_SCRAPE):
        '''
        packageName      -- package id sent as the 'id' POST parameter.
        maxNumberOfPages -- page count used by scrape() when the caller
                            does not supply one.
        '''
        self.packageName = packageName
        self.maxNumberOfPages = maxNumberOfPages
        # Template of POST parameters; scrapePageNumber() works on a copy
        # and overrides 'pageNum' per request.
        self.postParams = {
            'reviewType': 0,
            'pageNum': 0,
            'id': packageName,
            'reviewSortOrder': 2,
            'xhr': 1,
        }

    def scrapePageNumber(self, pageNum=0):
        '''
        POST a request for review page `pageNum` and return the raw
        response body as read from the connection.
        '''
        # Copy so the shared template in self.postParams is never mutated.
        pagedPostParams = self.postParams.copy()
        pagedPostParams['pageNum'] = pageNum
        encodedPostParams = urllib.urlencode(pagedPostParams)
        # Supplying a data argument makes urlopen issue a POST.
        response = urllib2.urlopen(
            GOOGLE_PLAY_REVIEWS_URL, encodedPostParams)
        try:
            return response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

    def parseResult(self, postResults):
        '''
        Parse one raw response body into a list of
        GooglePlayScrapedReview objects.

        Raises GooglePlayReviewScraperException if anything goes wrong
        while decoding the JSON or walking the embedded HTML.
        '''
        # Magic indices required for parsing Google API results.
        # The first 6 characters are skipped before JSON decoding
        # (presumably a non-JSON prefix -- TODO confirm against a live
        # response); the HTML is then read from [contentIdx][htmlStartIndex].
        apiJsonStartIndex = 6
        contentIdx = 0
        htmlStartIndex = 2
        # CSS class of the element wrapping each review
        reviewBodyClass = "review-body"
        try:
            payload = json.loads(postResults[apiJsonStartIndex:])
            result = payload[contentIdx][htmlStartIndex]
            htmlResult = BeautifulSoup(result)
            return [
                self.generateScrapedObject(reviewBody)
                for reviewBody in htmlResult.find_all(class_=reviewBodyClass)
            ]
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit are not converted into parse errors.
            raise \
                GooglePlayReviewScraperException(
                    "Exception: %s. Bad parse: %s" %
                    (sys.exc_info()[0], postResults)
                )

    @classmethod
    def generateScrapedObject(cls, reviewBody):
        '''
        Build a GooglePlayScrapedReview from one "review-body" element.
        '''
        # Extract the review title
        reviewTitleClass = "review-title"
        reviewTitle = reviewBody.find(class_=reviewTitleClass).get_text()
        # Extract the review body: the text is taken from the child node at
        # index 2 of the element (assumes that node is the review text --
        # depends on Google's markup, verify if parsing breaks).
        reviewBodyContentIdx = 2
        reviewText = reviewBody.contents[reviewBodyContentIdx]
        return GooglePlayScrapedReview(reviewTitle, reviewText)

    def scrape(self, pageNumbers=None):
        '''
        Fetch and parse review pages 0 .. N-1 and return all reviews found.

        pageNumbers -- number of pages to fetch (a count, not a list).
                       A falsy value (None or 0) falls back to
                       self.maxNumberOfPages.

        Best effort: on a parse failure or any other ordinary error the
        problem is printed and the reviews collected so far are returned.
        KeyboardInterrupt and SystemExit are allowed to propagate.
        '''
        parsedResults = []
        scrapePageNums = pageNumbers or self.maxNumberOfPages
        try:
            for pageNum in xrange(scrapePageNums):
                parsedResults.extend(
                    self.parseResult(self.scrapePageNumber(pageNum)))
        except GooglePlayReviewScraperException as e:
            print("Bad Parse: %s" % e)
        except Exception:
            # Single formatted string: a two-argument print(...) would be
            # printed as a tuple by the Python 2 print statement.
            print("Unexpected error: %s" % sys.exc_info()[0])
        # Returned after the try block rather than from a finally clause:
        # a return inside finally silently discards any in-flight exception.
        return parsedResults
# Main method
# Play around to get a feel for this
if __name__ == '__main__':
    PACKAGE_NAME = "com.inxile.BardTale"
    NUM_PAGES_TO_SCRAPE = 2
    gs = GooglePlayReviewScraper(PACKAGE_NAME)
    # Use the constant instead of repeating the literal so the two cannot
    # drift apart.
    scraped = gs.scrape(pageNumbers=NUM_PAGES_TO_SCRAPE)
    for review in scraped:
        print(review)