-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspiders.py
More file actions
213 lines (165 loc) · 7.32 KB
/
spiders.py
File metadata and controls
213 lines (165 loc) · 7.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
"""
spiders.py module
scrapes Canadian banks for mortgage rates
"""
import re
import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.signalmanager import dispatcher
from scrapy.utils.project import get_project_settings
from pprint import pprint
class BmoSpider(scrapy.Spider):
    """Spider that reads mortgage rates from BMO's public JSON endpoint."""

    name = "bmo_spider"
    start_urls = ["https://www.bmo.com/public-data/api/v1.1/mortgages.json"]

    def parse(self, response):
        """Yield one rate dict per usable entry of the "mortgage-rates" object.

        Every attribute except the rate itself is derived from the entry's
        key: the key is searched for the markers "fixed", "over-25" and
        "-closed", and for a "-<N>-year" fragment giving the term length.
        Entries whose key carries no such fragment are skipped.
        """
        for key, value in response.json()["mortgage-rates"].items():
            # the term length is mandatory, so look for it before anything else
            found = re.search(r"^.*-(?P<t_years>\d{1,2})-year.*", key)
            if not found:
                continue
            try:
                years = int(found.group("t_years"))
            except ValueError:
                print(f"Error converting {found.group('t_years')} to int()")
                continue

            # the remaining attributes are plain substring checks on the key
            key_lower = key.lower()
            yield {
                "lender": "BMO",
                "amort_years": 30 if "over-25" in key_lower else 25,
                "rate_percent": value,
                "rate_type": "Fixed" if "fixed" in key_lower else "Variable",
                "term_years": years,
                "term_type": "Closed" if "-closed" in key_lower else "Open",
            }
class RbcSpider(scrapy.Spider):
    """Spider that scrapes the "special-rates" tables on RBC's mortgage page.

    Yields one dict per table row that holds both a term in years and a
    percentage rate, with the keys ``lender``, ``amort_years``,
    ``rate_percent``, ``rate_type``, ``term_years`` and ``term_type``.
    """

    name = "rbc_spider"
    start_urls = ["https://www.rbcroyalbank.com/mortgages/mortgage-rates.html"]

    # Both patterns keep the greedy ``^.*`` prefix so the right-most number in
    # the cell wins.  The ``(?<!\d)`` lookbehind stops that prefix from
    # swallowing the leading digit of a two-digit value, which would turn
    # "10.250%" into 0.25 and "10 Year Closed" into a 0-year term.
    _RATE_RE = re.compile(r"^.*(?<!\d)(\d{1,2}\.\d{1,3})%")
    _TERM_RE = re.compile(
        r"^.*(?<!\d)(?P<t_years>\d{1,2})(?!\d).*(?P<t_type>Closed|Open)"
    )

    def parse(self, response):
        """Yield a rate dict for every valid row of the special-rates tables.

        Expected layout (anything beyond it is skipped with a warning):
            table 0 = 25 years or less amortization, FIXED
            table 1 = 25 years or less amortization, VARIABLE
            table 2 = more than 25 years amortization, FIXED
            table 3 = more than 25 years amortization, VARIABLE
        """
        # find div id=special-rates
        special_rates = response.xpath('//*[@id="special-rates"]')
        # inside special_rates, the h4 headers name the amortization timeframes
        amort_headers = special_rates.xpath("h4/text()")
        # tables with class='table-striped' hold the rates
        stripe_tables = special_rates.css("table.table-striped")
        # the collapsing button above each table says fixed OR variable
        cbuttons = special_rates.css("button.collapse-toggle::text")

        for i_table, stripe_table in enumerate(stripe_tables):
            # two tables share each amortization header
            header_index = i_table // 2
            if (
                i_table > 3
                or i_table >= len(cbuttons)
                or header_index >= len(amort_headers)
            ):
                # the page no longer matches the expected layout; stop rather
                # than label rates with a stale or missing header/button
                self.logger.warning(
                    "Unexpected special-rates layout at table %d; skipping rest",
                    i_table,
                )
                break

            # amortization and fixed/variable are the same for the whole table
            amort = amort_headers[header_index].get().strip()
            amort_years = 30 if "greater than 25" in amort.lower() else 25

            fixed_var = cbuttons[i_table].get().strip().lower()
            if "fixed" in fixed_var:
                rate_type = "Fixed"
            elif "variable" in fixed_var:
                rate_type = "Variable"
            else:
                rate_type = "Unknown"

            for tr in stripe_table.css("tr"):
                # term and rate come from the first two table data cells;
                # rows without them (e.g. header rows) are skipped instead of
                # raising IndexError and aborting the whole parse
                cells = tr.xpath("td/text()").getall()
                if len(cells) < 2:
                    continue
                term = cells[0].strip()
                rate = cells[1].strip()

                # make sure table row contains a valid term and rate
                if "year" not in term.lower() or "%" not in rate:
                    continue

                rate_match = self._RATE_RE.search(rate)
                if rate_match is None:
                    # no match on RATE regex, move on to next table row
                    continue
                term_match = self._TERM_RE.search(term)
                if term_match is None:
                    # no match on TERMS regex, move on to next table row
                    continue

                # the patterns only capture digits (and one dot for the rate),
                # so these conversions cannot fail
                rate_percent = float(rate_match.group(1))
                term_years = int(term_match.group("t_years"))
                if "open" in term_match.group("t_type").lower():
                    term_type = "Open"
                else:
                    term_type = "Closed"

                yield {
                    "lender": "RBC",
                    "amort_years": amort_years,
                    "rate_percent": rate_percent,
                    "rate_type": rate_type,
                    "term_years": term_years,
                    "term_type": term_type,
                }
"""
Crawls Canadian Bank websites
Returns a list of dicts (bank rates) sorted by lowest rate first
"""
def crawl_bank_rates():
    """Run the BMO and RBC spiders and collect every item they scrape.

    Returns:
        The scraped rate dicts sorted by ascending ``rate_percent``, or
        ``None`` when no item was scraped.
    """
    scraped = []

    # parameter names are kept as-is: the signal dispatcher passes them by name
    def collect_item(signal, sender, item, response, spider):
        scraped.append(item)

    dispatcher.connect(collect_item, signal=signals.item_scraped)

    process = CrawlerProcess()
    for spider_cls in (BmoSpider, RbcSpider):
        process.crawl(spider_cls)
    # start crawling, script will block here until crawling jobs finish
    process.start()

    if not scraped:
        return None
    return sorted(scraped, key=lambda entry: entry['rate_percent'])
def main():
    """Standalone test entry point: crawl the banks and print the results.

    Only here so the module can be run without project.py.
    """
    rates = crawl_bank_rates()
    if not rates:
        print("\nNo results")
    else:
        pprint(rates)
        print(f"\nLength of results: {len(rates)}")
    print("\n######## DONE ########\n")


if __name__ == "__main__":
    main()