-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRealo_scraper.py
More file actions
181 lines (159 loc) · 5.26 KB
/
Realo_scraper.py
File metadata and controls
181 lines (159 loc) · 5.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 28 08:11:38 2020
@author: Naomi
"""
import csv
import random
import requests
import selenium
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
# from requests_html import HTMLSession
# import pandas as pandas
# Pools of randomized delays (seconds) used between requests so the
# scraper does not hammer the server at a fixed, bot-like cadence.
sleepTimes = [2.1, 2.8, 3.2, 4, 5.2]
larger_sleep_times = [10, 12, 15]

# Browser-like request headers.
# NOTE(review): currently unused by the requests.get calls in this file,
# and "Host": "httpbin.org" would be wrong for realo.be anyway — confirm
# intent before wiring these into the requests.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
    "Dnt": "1",
    "Host": "httpbin.org",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}
# url = "https://www.realo.be/en"
# Saved-search URL for the first results page.
url = "https://www.realo.be/en/search/house,flat/for-sale?savedSearchId=18538987&saved=1"
# newline="" lets the csv module control line endings itself (without it,
# rows get an extra blank line on Windows); utf-8 so addresses with
# accented characters are written safely.
file = open("listings.csv", "w", newline="", encoding="utf-8")
writer = csv.writer(file, delimiter=",", quotechar='"')
print("Open file for writing")
def fetch_listings(url, page):
    """Scrape Realo search-result pages starting at *page*.

    For each results page: collect the listing URLs, fetch every listing,
    extract its tags and feature table, and append one CSV row per listing
    via the module-level ``writer``.  Advances page by page until a results
    page yields no listing links (the original version recursed forever
    with no termination condition; this version stops cleanly).

    Parameters
    ----------
    url : str
        URL of the first search-results page to scrape.
    page : int
        Page number corresponding to *url*; pages after the first are
        addressed by number only.
    """
    while True:
        if page > 1:
            url = f"https://www.realo.be/en/search/house,flat/for-sale?page={page}"
        print(url)
        print("Fetching search results")
        results = requests.get(url, timeout=30).text
        soup = BeautifulSoup(results, "html.parser")

        # Collect the absolute URL of every listing card on this page.
        property_links = []
        for card in soup.find_all("div", class_="body"):
            try:
                link = card.find("a", class_="link")["href"].strip()
                property_links.append("https://www.realo.be" + link)
            except (TypeError, KeyError):
                # Card without a usable link — skip it (best effort).
                pass

        if not property_links:
            # Ran off the end of the search results: stop scraping.
            print("No listings found - stopping")
            return

        for each_property in property_links:
            print("Fetching listing")
            sleep(random.choice(larger_sleep_times))
            # Renamed from `property` (shadowed the builtin).
            listing_html = requests.get(each_property, timeout=30).text
            soup = BeautifulSoup(listing_html, "html.parser")

            # Reset the per-listing fields for EVERY listing.  The original
            # initialized them once per page, so a "Yes" from one listing
            # leaked into every later listing on the same page.
            garden = garage = terrace = furnished = "No"
            swimming_pool = equipped_kitchen = "No"
            state_of_building = open_fire = "No"
            property_type = heating = ""
            bathrooms = number_of_rooms = 0
            area = land_surface_area = 0

            address_tag = soup.find("h1", class_="address")
            address = address_tag.get_text().strip() if address_tag else ""
            price_tag = soup.find("div", class_="value")
            price = price_tag.text if price_tag else 0

            # Amenity tags ("Garden", "Garage", ...) listed on the page.
            tags_div = soup.find("div", class_="component-property-description__tags")
            for tg in tags_div.find_all("li") if tags_div else []:
                tag = tg.text.strip()
                if tag == "Garden":
                    garden = "Yes"
                if tag == "Garage":
                    garage = "Yes"
                if tag == "Terrace":
                    terrace = "Yes"
                if tag == "Swimming Pool":
                    swimming_pool = "Yes"
                if tag == "Equipped Kitchen":
                    equipped_kitchen = "Yes"
                if tag == "Furnished":
                    furnished = "Yes"
                if tag == "New Build":
                    state_of_building = "Yes"

            # Key/value feature table; guard against a missing section
            # (the original crashed with AttributeError when absent).
            features_div = soup.find("div", class_="component-property-features")
            feature_table = features_div.find("table") if features_div else None
            if feature_table:
                for tr in feature_table.find_all("tr"):
                    tds = tr.find_all("td")
                    if len(tds) < 2:
                        continue
                    name = tds[0].get_text()
                    value = tds[1].get_text().strip()
                    if name == "Property type":
                        property_type = value
                    elif name == "Bathrooms":
                        bathrooms = value
                    elif name == "Bedrooms":
                        number_of_rooms = value
                    elif name == "Habitable area":
                        area = value
                    elif name == "Lot size":
                        land_surface_area = value
                    elif name == "Heating type":
                        heating = value

            print("Writing to file")
            writer.writerow(
                [
                    address,
                    price,
                    property_type,
                    number_of_rooms,
                    area,
                    equipped_kitchen,
                    furnished,
                    garage,
                    open_fire,
                    terrace,
                    garden,
                    swimming_pool,
                    state_of_building,
                    each_property,
                ]
            )
            # Flush after each row so a crash mid-run loses nothing.
            file.flush()

        page += 1
        sleep(15)
        print(f"Fetching next page - {page} results")
# Write the CSV header row before scraping starts.
# Fixed: the original read `"Garage" "open_fire"` — implicit string
# concatenation (a missing comma) merged the two names into one column
# ("Garageopen_fire"), leaving the header one column short of the
# 14-field data rows written by fetch_listings.
writer.writerow(
    [
        "Locality",
        "Price",
        "Property type",
        "Number of rooms",
        "Area",
        "Equipped kitchen",
        "Furnished",
        "Garage",
        "open_fire",
        "Terrace",
        "Garden",
        "Swimming pool",
        "New building",
        "Link",
    ]
)
# NOTE(review): scraping starts at page 3, not page 1 — confirm this is
# intentional (perhaps resuming an interrupted run).
fetch_listings(url, 3)
file.close()