# amortization/spiders.py (diligent176/amortization, branch main)
"""
spiders.py module
scrapes Canadian banks for mortgage rates
"""
import re
import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.signalmanager import dispatcher
from scrapy.utils.project import get_project_settings
from pprint import pprint


class BmoSpider(scrapy.Spider):
    """Scrape BMO mortgage rates from the bank's JSON endpoint.

    Every entry under the ``mortgage-rates`` key is classified from its key
    name and yielded as a dict with the keys ``lender``, ``amort_years``,
    ``rate_percent``, ``rate_type``, ``term_years`` and ``term_type``.
    """

    name = "bmo_spider"
    start_urls = ["https://www.bmo.com/public-data/api/v1.1/mortgages.json"]

    # Term length is the 1-2 digit number directly in front of "-year".
    # Applied to the lower-cased key so it agrees with the other
    # (case-insensitive) keyword checks.
    _TERM_RE = re.compile(r"^.*-(?P<t_years>\d{1,2})-year.*")

    def parse(self, response):
        """Yield one rate dict per usable entry of the JSON response.

        Entries whose key has no ``-<N>-year`` part, or whose value cannot be
        read as a number, are skipped.
        """
        for rate_name, rate_value in response.json()["mortgage-rates"].items():
            name_lc = rate_name.lower()

            term_match = self._TERM_RE.search(name_lc)
            if term_match is None:
                # no term length in the key, continue with the next entry
                continue

            # Rates are later sorted numerically together with other lenders,
            # so make sure the value really is a number.
            try:
                rate_percent = float(rate_value)
            except (TypeError, ValueError):
                self.logger.warning(
                    "Skipping BMO rate %r: non-numeric value %r",
                    rate_name,
                    rate_value,
                )
                continue

            yield {
                "lender": "BMO",
                "amort_years": 30 if "over-25" in name_lc else 25,
                "rate_percent": rate_percent,
                "rate_type": "Fixed" if "fixed" in name_lc else "Variable",
                # the regex only captures digits, so int() cannot fail here
                "term_years": int(term_match.group("t_years")),
                "term_type": "Closed" if "-closed" in name_lc else "Open",
            }


class RbcSpider(scrapy.Spider):
    """Scrape RBC's special mortgage rates from its mortgage-rates page.

    Yields one dict per usable table row with the keys ``lender``,
    ``amort_years``, ``rate_percent``, ``rate_type``, ``term_years`` and
    ``term_type``.
    """

    name = "rbc_spider"
    start_urls = ["https://www.rbcroyalbank.com/mortgages/mortgage-rates.html"]

    # Index into the amortization headers for each striped table:
    #   table 0 = first header, FIXED      table 1 = first header, VARIABLE
    #   table 2 = second header, FIXED     table 3 = second header, VARIABLE
    _HEADER_INDEX_BY_TABLE = (0, 0, 1, 1)

    # The (?<!\d) lookbehind keeps the greedy prefix from cutting a number in
    # half, so "10.500%" is captured as 10.500 rather than 0.500.
    _RATE_RE = re.compile(r"^.*(?<!\d)(\d{1,2}\.\d{1,3})%")

    # Same idea for the term: "10 Year Closed" must give 10, not 0.
    _TERM_RE = re.compile(
        r"^.*(?<!\d)(?P<t_years>\d{1,2}).*(?P<t_type>Closed|Open)"
    )

    def parse(self, response):
        """Yield one rate dict per valid row of the special-rates tables.

        Rows without at least two ``td`` text cells, without "year" in the
        term cell or "%" in the rate cell, or whose term/rate text does not
        match the expected patterns are skipped.
        """
        # everything of interest lives below the element with id=special-rates
        special_rates = response.xpath('//*[@id="special-rates"]')

        # amortization timeframe headers
        amort_headers = special_rates.xpath("h4/text()")

        # tables holding the rates
        stripe_tables = special_rates.css("table.table-striped")

        # text of the collapsing button above each table says fixed OR variable
        cbuttons = special_rates.css("button.collapse-toggle::text")

        for i_table, stripe_table in enumerate(stripe_tables):

            # Only the known four-table layout can be labelled reliably; stop
            # rather than mislabel rows or fail on a missing header/button.
            if (
                i_table >= len(self._HEADER_INDEX_BY_TABLE)
                or self._HEADER_INDEX_BY_TABLE[i_table] >= len(amort_headers)
                or i_table >= len(cbuttons)
            ):
                self.logger.warning(
                    "Unexpected RBC page layout at table %d; "
                    "skipping remaining tables",
                    i_table,
                )
                break

            header_idx = self._HEADER_INDEX_BY_TABLE[i_table]
            amort = amort_headers[header_idx].get().strip()
            fixed_var = cbuttons[i_table].get().strip().lower()

            # these two values are the same for every row of the table
            if "fixed" in fixed_var:
                rate_type = "Fixed"
            elif "variable" in fixed_var:
                rate_type = "Variable"
            else:
                rate_type = "Unknown"

            amort_years = 30 if "greater than 25" in amort.lower() else 25

            for tr in stripe_table.css("tr"):

                # term and rate are the first two td text nodes of the row;
                # rows with fewer (e.g. rows made only of th cells) carry no
                # term/rate pair and are skipped
                cells = tr.xpath("td/text()").getall()
                if len(cells) < 2:
                    continue

                term = cells[0].strip()
                rate = cells[1].strip()

                # make sure the row contains a valid term and rate
                if "year" not in term.lower() or "%" not in rate:
                    continue

                rate_match = self._RATE_RE.search(rate)
                if rate_match is None:
                    # no match on RATE regex, move on to next table row
                    continue

                term_match = self._TERM_RE.search(term)
                if term_match is None:
                    # no match on TERM regex, move on to next table row
                    continue

                # both patterns only capture digits (and one "."), so the
                # conversions below cannot raise ValueError
                if "open" in term_match.group("t_type").lower():
                    term_type = "Open"
                else:
                    term_type = "Closed"

                yield {
                    "lender": "RBC",
                    "amort_years": amort_years,
                    "rate_percent": float(rate_match.group(1)),
                    "rate_type": rate_type,
                    "term_years": int(term_match.group("t_years")),
                    "term_type": term_type,
                }


def crawl_bank_rates():
    """Crawl the Canadian bank websites and collect their mortgage rates.

    Runs BmoSpider and RbcSpider in a single CrawlerProcess and blocks until
    both have finished.

    Returns:
        A list of rate dicts sorted by ``rate_percent``, lowest rate first,
        or ``None`` when no items were scraped.
    """
    results = []

    def crawler_results(signal, sender, item, response, spider):
        # item_scraped handler: collect every item the spiders yield
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_scraped)

    process = CrawlerProcess()

    # crawl one or more spiders
    process.crawl(BmoSpider)
    process.crawl(RbcSpider)

    # start crawling, script will block here until crawling jobs finish
    # NOTE(review): Scrapy documents that the underlying reactor cannot be
    # restarted, so this function is presumably callable once per process.
    process.start()

    if not results:
        return None

    # lowest rate first
    return sorted(results, key=lambda d: d['rate_percent'])


def main():
    """Crawl the banks and print the results to stdout.

    Exists only as a standalone test, so the module can be run without
    project.py.
    """
    rates = crawl_bank_rates()

    if not rates:
        print("\nNo results")
    else:
        pprint(rates)
        print(f"\nLength of results: {len(rates)}")

    print("\n######## DONE ########\n")


# Run the standalone test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()