-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRateMyProf.py
More file actions
152 lines (123 loc) · 5.15 KB
/
RateMyProf.py
File metadata and controls
152 lines (123 loc) · 5.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# thanks to M. Aaron Owen for use of his code from last year
from selenium import webdriver
import time
import csv
# Scrapes RateMyProfessors reviews for one school into <school>.csv.
# identify browser to use
driver = webdriver.Chrome()
# specify school - name should match Dictionaries used in other scripts
# - will simplify merge and concat of dataframes later
school = 'harvard'
# open file for writing.
# newline='' is required by the csv module (otherwise blank rows appear on
# Windows); utf-8 handles non-ASCII characters in scraped names/review text.
csv_file = open(school + '.csv', 'w', newline='', encoding='utf-8')
# create writer
writer = csv.writer(csv_file)
# add key names for dictionary
writer.writerow(['name', 'department', 'school', 'overall_score', 'difficulty_score', 'grade', 'chili', 'tag_list', 'content'])
# website to start
topurl = "http://www.ratemyprofessors.com/search.jsp?queryBy=schoolId&schoolName=Harvard+University&schoolID=399&queryoption=TEACHER"
driver.get(topurl)
# NOTE(review): the search page's "load more" button is never clicked here,
# so only the professors present in the initial page load are collected; if
# the school lists more pages of professors, the rest are silently missed.
# TODO: confirm the search page's load-more element and click it
# (professor-count // 20) times, mirroring the per-professor loop below.
# (A dead computation of num_ratings/button_clicks from the search page was
# removed -- both variables were unconditionally reassigned per professor
# before first use.)
prof_urls = []
# the tag for the review container
profs = driver.find_elements_by_xpath('//div[@class = "result-list"]//a')
for prof in profs:
    # if there are reviews, then grab the number of them
    if "ShowRatings" in prof.get_attribute("href"):
        num_reviews = prof.find_element_by_xpath('.//span[@class = "info"]').text.split(" ")[0]
        # only keep professors with at least one review
        # (the old comment said ">= 20" but the code has always checked >= 1)
        if int(num_reviews) >= 1:
            prof_urls.append(prof.get_attribute("href"))
# now that we have the list of urls, we can scrape the ratings
def parse_review(review):
    """Extract one CSV row from a single review <tr> element.

    Returns a list in the same order as the CSV header:
    [name, department, school, overall_score, difficulty_score,
     grade, chili, tag_list, content].
    Reads the script-level name/department/chili/school variables, which the
    loop below sets before any call.  Replaces two previously-duplicated
    copies of this parsing code (odd rows and even rows).
    """
    fields = review.text.split("\n")
    # NOTE(review): fixed positions assume RMP's review-table layout; a
    # review missing the grade line would raise IndexError -- TODO confirm
    overall_score = fields[2]
    difficulty_score = fields[4]
    grade = fields[11].split(" ")[2]
    content = review.find_element_by_xpath('.//p').text
    tag_list = [tag.text for tag in review.find_elements_by_xpath('.//div[@class = "tagbox"]//span')]
    return [name, department, school, overall_score, difficulty_score, grade, chili, tag_list, content]

for url in prof_urls:
    # wait 2 seconds before visiting the next url (be polite to the server)
    time.sleep(2)
    # initialize the driver as the specific prof's webpage
    driver.get(url)
    # getting the prof's name as "Last, First"
    last_name = driver.find_element_by_xpath('//span[@class = "plname"]').text + ", "
    first_name = driver.find_element_by_xpath('//span[@class = "pfname"]').text
    name = last_name + first_name
    # printing so that I can keep track of what's being scraped
    print(name)
    print(url)
    # collecting the number of ratings for the prof to know how many times to click the button
    num_ratings = driver.find_element_by_xpath('//div[@data-table = "rating-filter"]').text.split(" ")[0]
    # each button click loads 20 more reviews; <= 20 reviews need no clicks
    if int(num_ratings) <= 20:
        button_clicks = 0
    else:
        button_clicks = int(num_ratings) // 20
    # the "load more" button only exists when there is more than one page,
    # so locate it only when we actually need to click it
    if button_clicks > 0:
        button = driver.find_element_by_xpath('//a[@id = "loadMore"]')
        # clicking the button the desired number of times
        for i in range(button_clicks):
            driver.execute_script("arguments[0].click();", button)
            print("single prof page button click " + str(i + 1))
            time.sleep(2)
    # the reviews have different tags as either class = '' or class = 'even'
    reviews1 = driver.find_elements_by_xpath('//table[@class = "tftable"]//tr[@class = ""]')
    reviews2 = driver.find_elements_by_xpath('//table[@class = "tftable"]//tr[@class = "even"]')
    # getting the department (first line of the result title)
    department = driver.find_element_by_xpath('//div[@class = "result-title"]').text
    department = department.split("\n")[0]
    # True if the word "hot" is in the chili-pepper image URL
    chili = driver.find_element_by_xpath('//div[@class = "breakdown-section"]//img').get_attribute("src")
    chili = "hot" in chili
    # odd ('') rows first, then even rows -- same processing order as before
    for label, reviews in (("1", reviews1), ("2", reviews2)):
        for ind, review in enumerate(reviews):
            print("reviews - " + label + " " + str(ind))
            writer.writerow(parse_review(review))
# quit() (not close()) ends the whole session and the chromedriver process;
# closing the CSV flushes buffered rows to disk
driver.quit()
csv_file.close()