-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRateMyProf.py
More file actions
152 lines (123 loc) · 5.15 KB
/
RateMyProf.py
File metadata and controls
152 lines (123 loc) · 5.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# thanks to M. Aaron Owen for use of his code from last year
from selenium import webdriver
import time
import csv
# Scrapes RateMyProfessors reviews for one school into <school>.csv.
# identify browser to use
driver = webdriver.Chrome()
# specify school - name should match Dictionaries used in other scripts
# - will simplify merge and concat of dataframes later
school = 'harvard'
# open file for writing.
# newline='' is required by the csv module (otherwise blank rows appear on
# Windows); utf-8 handles non-ASCII characters in scraped names/review text.
csv_file = open(school + '.csv', 'w', newline='', encoding='utf-8')
# create writer
writer = csv.writer(csv_file)
# add key names for dictionary
writer.writerow(['name', 'department', 'school', 'overall_score', 'difficulty_score', 'grade', 'chili', 'tag_list', 'content'])
# website to start
topurl = "http://www.ratemyprofessors.com/search.jsp?queryBy=schoolId&schoolName=Harvard+University&schoolID=399&queryoption=TEACHER"
driver.get(topurl)
# NOTE(review): the search page's "load more" button is never clicked here,
# so only the professors present in the initial page load are collected; if
# the school lists more pages of professors, the rest are silently missed.
# TODO: confirm the search page's load-more element and click it
# (professor-count // 20) times, mirroring the per-professor loop below.
# (A dead computation of num_ratings/button_clicks from the search page was
# removed -- both variables were unconditionally reassigned per professor
# before first use.)
prof_urls = []
# the tag for the review container
profs = driver.find_elements_by_xpath('//div[@class = "result-list"]//a')
for prof in profs:
    # if there are reviews, then grab the number of them
    if "ShowRatings" in prof.get_attribute("href"):
        num_reviews = prof.find_element_by_xpath('.//span[@class = "info"]').text.split(" ")[0]
        # only keep professors with at least one review
        # (the old comment said ">= 20" but the code has always checked >= 1)
        if int(num_reviews) >= 1:
            prof_urls.append(prof.get_attribute("href"))
# now that we have the list of urls, we can scrape the ratings
def parse_review(review):
    """Extract one CSV row from a single review <tr> element.

    Returns a list in the same order as the CSV header:
    [name, department, school, overall_score, difficulty_score,
     grade, chili, tag_list, content].
    Reads the script-level name/department/chili/school variables, which the
    loop below sets before any call.  Replaces two previously-duplicated
    copies of this parsing code (odd rows and even rows).
    """
    fields = review.text.split("\n")
    # NOTE(review): fixed positions assume RMP's review-table layout; a
    # review missing the grade line would raise IndexError -- TODO confirm
    overall_score = fields[2]
    difficulty_score = fields[4]
    grade = fields[11].split(" ")[2]
    content = review.find_element_by_xpath('.//p').text
    tag_list = [tag.text for tag in review.find_elements_by_xpath('.//div[@class = "tagbox"]//span')]
    return [name, department, school, overall_score, difficulty_score, grade, chili, tag_list, content]

for url in prof_urls:
    # wait 2 seconds before visiting the next url (be polite to the server)
    time.sleep(2)
    # initialize the driver as the specific prof's webpage
    driver.get(url)
    # getting the prof's name as "Last, First"
    last_name = driver.find_element_by_xpath('//span[@class = "plname"]').text + ", "
    first_name = driver.find_element_by_xpath('//span[@class = "pfname"]').text
    name = last_name + first_name
    # printing so that I can keep track of what's being scraped
    print(name)
    print(url)
    # collecting the number of ratings for the prof to know how many times to click the button
    num_ratings = driver.find_element_by_xpath('//div[@data-table = "rating-filter"]').text.split(" ")[0]
    # each button click loads 20 more reviews; <= 20 reviews need no clicks
    if int(num_ratings) <= 20:
        button_clicks = 0
    else:
        button_clicks = int(num_ratings) // 20
    # the "load more" button only exists when there is more than one page,
    # so locate it only when we actually need to click it
    if button_clicks > 0:
        button = driver.find_element_by_xpath('//a[@id = "loadMore"]')
        # clicking the button the desired number of times
        for i in range(button_clicks):
            driver.execute_script("arguments[0].click();", button)
            print("single prof page button click " + str(i + 1))
            time.sleep(2)
    # the reviews have different tags as either class = '' or class = 'even'
    reviews1 = driver.find_elements_by_xpath('//table[@class = "tftable"]//tr[@class = ""]')
    reviews2 = driver.find_elements_by_xpath('//table[@class = "tftable"]//tr[@class = "even"]')
    # getting the department (first line of the result title)
    department = driver.find_element_by_xpath('//div[@class = "result-title"]').text
    department = department.split("\n")[0]
    # True if the word "hot" is in the chili-pepper image URL
    chili = driver.find_element_by_xpath('//div[@class = "breakdown-section"]//img').get_attribute("src")
    chili = "hot" in chili
    # odd ('') rows first, then even rows -- same processing order as before
    for label, reviews in (("1", reviews1), ("2", reviews2)):
        for ind, review in enumerate(reviews):
            print("reviews - " + label + " " + str(ind))
            writer.writerow(parse_review(review))
# quit() (not close()) ends the whole session and the chromedriver process;
# closing the CSV flushes buffered rows to disk
driver.quit()
csv_file.close()