-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRealo_scraper.py
More file actions
181 lines (159 loc) · 5.26 KB
/
Realo_scraper.py
File metadata and controls
181 lines (159 loc) · 5.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 28 08:11:38 2020
@author: Naomi
"""
import csv
import random
import requests
import selenium
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
# from requests_html import HTMLSession
# import pandas as pandas
# Pools of randomized delays (seconds) used between requests so the
# scraper does not hammer the server at a fixed, bot-like cadence.
sleepTimes = [2.1, 2.8, 3.2, 4, 5.2]
larger_sleep_times = [10, 12, 15]

# Browser-like request headers.
# NOTE(review): currently unused by the requests.get calls in this file,
# and "Host": "httpbin.org" would be wrong for realo.be anyway — confirm
# intent before wiring these into the requests.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
    "Dnt": "1",
    "Host": "httpbin.org",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}
# url = "https://www.realo.be/en"
# Saved-search URL for the first results page.
url = "https://www.realo.be/en/search/house,flat/for-sale?savedSearchId=18538987&saved=1"
# newline="" lets the csv module control line endings itself (without it,
# rows get an extra blank line on Windows); utf-8 so addresses with
# accented characters are written safely.
file = open("listings.csv", "w", newline="", encoding="utf-8")
writer = csv.writer(file, delimiter=",", quotechar='"')
print("Open file for writing")
def fetch_listings(url, page):
    """Scrape Realo search-result pages starting at *page*.

    For each results page: collect the listing URLs, fetch every listing,
    extract its tags and feature table, and append one CSV row per listing
    via the module-level ``writer``.  Advances page by page until a results
    page yields no listing links (the original version recursed forever
    with no termination condition; this version stops cleanly).

    Parameters
    ----------
    url : str
        URL of the first search-results page to scrape.
    page : int
        Page number corresponding to *url*; pages after the first are
        addressed by number only.
    """
    while True:
        if page > 1:
            url = f"https://www.realo.be/en/search/house,flat/for-sale?page={page}"
        print(url)
        print("Fetching search results")
        results = requests.get(url, timeout=30).text
        soup = BeautifulSoup(results, "html.parser")

        # Collect the absolute URL of every listing card on this page.
        property_links = []
        for card in soup.find_all("div", class_="body"):
            try:
                link = card.find("a", class_="link")["href"].strip()
                property_links.append("https://www.realo.be" + link)
            except (TypeError, KeyError):
                # Card without a usable link — skip it (best effort).
                pass

        if not property_links:
            # Ran off the end of the search results: stop scraping.
            print("No listings found - stopping")
            return

        for each_property in property_links:
            print("Fetching listing")
            sleep(random.choice(larger_sleep_times))
            # Renamed from `property` (shadowed the builtin).
            listing_html = requests.get(each_property, timeout=30).text
            soup = BeautifulSoup(listing_html, "html.parser")

            # Reset the per-listing fields for EVERY listing.  The original
            # initialized them once per page, so a "Yes" from one listing
            # leaked into every later listing on the same page.
            garden = garage = terrace = furnished = "No"
            swimming_pool = equipped_kitchen = "No"
            state_of_building = open_fire = "No"
            property_type = heating = ""
            bathrooms = number_of_rooms = 0
            area = land_surface_area = 0

            address_tag = soup.find("h1", class_="address")
            address = address_tag.get_text().strip() if address_tag else ""
            price_tag = soup.find("div", class_="value")
            price = price_tag.text if price_tag else 0

            # Amenity tags ("Garden", "Garage", ...) listed on the page.
            tags_div = soup.find("div", class_="component-property-description__tags")
            for tg in tags_div.find_all("li") if tags_div else []:
                tag = tg.text.strip()
                if tag == "Garden":
                    garden = "Yes"
                if tag == "Garage":
                    garage = "Yes"
                if tag == "Terrace":
                    terrace = "Yes"
                if tag == "Swimming Pool":
                    swimming_pool = "Yes"
                if tag == "Equipped Kitchen":
                    equipped_kitchen = "Yes"
                if tag == "Furnished":
                    furnished = "Yes"
                if tag == "New Build":
                    state_of_building = "Yes"

            # Key/value feature table; guard against a missing section
            # (the original crashed with AttributeError when absent).
            features_div = soup.find("div", class_="component-property-features")
            feature_table = features_div.find("table") if features_div else None
            if feature_table:
                for tr in feature_table.find_all("tr"):
                    tds = tr.find_all("td")
                    if len(tds) < 2:
                        continue
                    name = tds[0].get_text()
                    value = tds[1].get_text().strip()
                    if name == "Property type":
                        property_type = value
                    elif name == "Bathrooms":
                        bathrooms = value
                    elif name == "Bedrooms":
                        number_of_rooms = value
                    elif name == "Habitable area":
                        area = value
                    elif name == "Lot size":
                        land_surface_area = value
                    elif name == "Heating type":
                        heating = value

            print("Writing to file")
            writer.writerow(
                [
                    address,
                    price,
                    property_type,
                    number_of_rooms,
                    area,
                    equipped_kitchen,
                    furnished,
                    garage,
                    open_fire,
                    terrace,
                    garden,
                    swimming_pool,
                    state_of_building,
                    each_property,
                ]
            )
            # Flush after each row so a crash mid-run loses nothing.
            file.flush()

        page += 1
        sleep(15)
        print(f"Fetching next page - {page} results")
# Write the CSV header row before scraping starts.
# Fixed: the original read `"Garage" "open_fire"` — implicit string
# concatenation (a missing comma) merged the two names into one column
# ("Garageopen_fire"), leaving the header one column short of the
# 14-field data rows written by fetch_listings.
writer.writerow(
    [
        "Locality",
        "Price",
        "Property type",
        "Number of rooms",
        "Area",
        "Equipped kitchen",
        "Furnished",
        "Garage",
        "open_fire",
        "Terrace",
        "Garden",
        "Swimming pool",
        "New building",
        "Link",
    ]
)
# NOTE(review): scraping starts at page 3, not page 1 — confirm this is
# intentional (perhaps resuming an interrupted run).
fetch_listings(url, 3)
file.close()