utils.py
"""Utility functions for parsing Morizon.pl HTML."""
import re
import logging
from typing import Optional
from bs4 import BeautifulSoup
from models import Property
from config import MORIZON_BASE_URL
logger = logging.getLogger(__name__)


def _clean_text(text: Optional[str]) -> Optional[str]:
    """Clean and normalize text."""
    if not text:
        return None
    # Collapse runs of whitespace into single spaces
    cleaned = " ".join(text.split())
    return cleaned if cleaned else None


def _extract_number(text: Optional[str]) -> Optional[str]:
    """Extract a numeric value from text."""
    if not text:
        return None
    # Match digits optionally grouped by spaces, with an optional decimal
    # part, e.g. "1 250,50"
    match = re.search(r"\d[\d\s]*(?:[,.]\d+)?", text)
    if match:
        return match.group().replace(" ", "").replace("\xa0", "")
    return None
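
# Behaviour sketch for _extract_number (inputs are made-up examples, not
# captured Morizon text):
#
#     _extract_number("Powierzchnia: 52,5 m²")  # -> "52,5"
#     _extract_number("Czynsz: 1 250 zł")       # -> "1250"
#     _extract_number("brak danych")            # -> None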


def parse_total_count(html: str) -> int:
    """Parse the total number of listings from a search results page.

    Args:
        html: HTML content of the search results page

    Returns:
        Total number of listings, or 0 if it could not be parsed
    """
    soup = BeautifulSoup(html, "lxml")
    # Look for text like "11 870 ogłoszeń" or
    # "Mieszkania do wynajęcia - 11 870 ogłoszeń" ("ogłoszeń" means "listings").
    # Try the title/heading area first
    for text_elem in soup.find_all(string=re.compile(r"[\d\s]+ ogłosze")):
        text = text_elem.strip()
        match = re.search(r"([\d\s]+)\s*ogłosze", text)
        if match:
            count_str = match.group(1).replace(" ", "").replace("\xa0", "")
            try:
                return int(count_str)
            except ValueError:
                continue
    # Fall back to the meta description
    meta_desc = soup.find("meta", {"name": "description"})
    if meta_desc and meta_desc.get("content"):
        match = re.search(r"([\d\s]+)\s*ogłosze", meta_desc["content"])
        if match:
            count_str = match.group(1).replace(" ", "").replace("\xa0", "")
            try:
                return int(count_str)
            except ValueError:
                pass
    logger.warning("Could not parse total count from page")
    return 0
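
# Usage sketch for parse_total_count (invented snippet, not real Morizon
# markup):
#
#     parse_total_count("<h1>Mieszkania do wynajęcia - 11 870 ogłoszeń</h1>")
#     # -> 11870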


def parse_property_urls(html: str) -> list[str]:
    """Parse property detail-page URLs from a search results page.

    Args:
        html: HTML content of the search results page

    Returns:
        List of absolute property URLs, deduplicated, in page order
    """
    soup = BeautifulSoup(html, "lxml")
    urls = []
    # Find all links to property detail pages
    # Pattern: /oferta/wynajem-mieszkanie-... or /oferta/sprzedaz-mieszkanie-...
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if "/oferta/" in href and href.count("/") >= 2:
            # Normalize to an absolute URL
            if href.startswith("/"):
                full_url = MORIZON_BASE_URL + href
            elif href.startswith("http"):
                full_url = href
            else:
                continue
            # Skip duplicates and non-property links (detail URLs carry an
            # "mzn..." listing ID)
            if full_url not in urls and "mzn" in full_url.lower():
                urls.append(full_url)
    return urls
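
# Usage sketch for parse_property_urls (the href below is invented; real
# listing slugs differ):
#
#     html = '<a href="/oferta/wynajem-mieszkanie-warszawa-mzn123456">Oferta</a>'
#     parse_property_urls(html)
#     # -> [MORIZON_BASE_URL + "/oferta/wynajem-mieszkanie-warszawa-mzn123456"]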


def _find_value_after_label(soup: BeautifulSoup, label: str) -> Optional[str]:
    """Find the value text that appears after a label.

    Args:
        soup: BeautifulSoup object
        label: Label text to search for

    Returns:
        Value text or None
    """
    # Find elements containing the label
    for elem in soup.find_all(string=re.compile(label, re.IGNORECASE)):
        parent = elem.parent
        if parent:
            # Check the label's next sibling
            next_elem = parent.find_next_sibling()
            if next_elem:
                text = next_elem.get_text(strip=True)
                if text:
                    return _clean_text(text)
            # Check the parent's next sibling
            parent_next = parent.parent.find_next_sibling() if parent.parent else None
            if parent_next:
                text = parent_next.get_text(strip=True)
                if text:
                    return _clean_text(text)
    return None


def _extract_table_data(soup: BeautifulSoup) -> dict:
    """Extract key-value pairs from tables and definition lists.

    Args:
        soup: BeautifulSoup object

    Returns:
        Dictionary of lower-cased keys to value strings
    """
    data = {}
    # Table rows (tr with th/td pairs)
    for tr in soup.find_all("tr"):
        th = tr.find("th")
        td = tr.find("td")
        if th and td:
            key = _clean_text(th.get_text())
            value = _clean_text(td.get_text())
            if key and value:
                data[key.lower()] = value
    # Definition lists (dl with dt/dd pairs)
    for dl in soup.find_all("dl"):
        dts = dl.find_all("dt")
        dds = dl.find_all("dd")
        for dt, dd in zip(dts, dds):
            key = _clean_text(dt.get_text())
            value = _clean_text(dd.get_text())
            if key and value:
                data[key.lower()] = value
    # Labeled divs/spans with adjacent values
    for label in soup.find_all(class_=re.compile(r"label|key|name", re.IGNORECASE)):
        key = _clean_text(label.get_text())
        if key:
            # Take the value from the label's next sibling, if any
            value_elem = label.find_next_sibling()
            if value_elem:
                value = _clean_text(value_elem.get_text())
                if value:
                    data[key.lower()] = value
    return data
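
# Usage sketch for _extract_table_data (invented markup mirroring the
# shapes handled above):
#
#     soup = BeautifulSoup(
#         "<table><tr><th>Pokoje</th><td>3</td></tr></table>"
#         "<dl><dt>Piętro</dt><dd>2</dd></dl>",
#         "lxml",
#     )
#     _extract_table_data(soup)  # -> {"pokoje": "3", "piętro": "2"}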


def _extract_section_items(soup: BeautifulSoup, section_name: str) -> Optional[str]:
    """Extract list items from a named section.

    Args:
        soup: BeautifulSoup object
        section_name: Name of the section to find

    Returns:
        Comma-separated items or None
    """
    # Find the section header
    for header in soup.find_all(string=re.compile(section_name, re.IGNORECASE)):
        parent = header.parent
        if parent:
            # Look for list items in the header's container
            container = parent.parent if parent.parent else parent
            items = []
            for li in container.find_all("li"):
                item_text = _clean_text(li.get_text())
                if item_text:
                    items.append(item_text)
            if items:
                return ", ".join(items)
    return None
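
# Usage sketch for _extract_section_items (invented markup; "Wyposażenie"
# means "equipment"):
#
#     soup = BeautifulSoup(
#         "<div><h3>Wyposażenie</h3><ul><li>lodówka</li><li>pralka</li></ul></div>",
#         "lxml",
#     )
#     _extract_section_items(soup, "Wyposażenie")  # -> "lodówka, pralka"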


def parse_property_details(html: str, url: str) -> Property:
    """Parse property details from a detail page.

    Args:
        html: HTML content of the property detail page
        url: Property URL

    Returns:
        Property object with parsed data
    """
    soup = BeautifulSoup(html, "lxml")
    prop = Property(url=url)

    # Extract the listing ID from the URL
    match = re.search(r"(mzn\d+)", url.lower())
    if match:
        prop.listing_id = match.group(1)

    # Extract the title
    title_elem = soup.find("h1")
    if title_elem:
        prop.title = _clean_text(title_elem.get_text())
    # Extract the price
    price_patterns = [
        re.compile(r"([\d\s]+)\s*zł", re.IGNORECASE),
        re.compile(r"([\d\s]+)\s*PLN", re.IGNORECASE),
    ]
    for text_elem in soup.find_all(string=re.compile(r"[\d\s]+\s*(zł|PLN)")):
        text = text_elem.strip()
        for pattern in price_patterns:
            match = pattern.search(text)
            if match:
                price_str = match.group(1).replace(" ", "").replace("\xa0", "")
                prop.price = price_str
                break
        if prop.price:
            break

    # Extract the price per square metre
    for text_elem in soup.find_all(string=re.compile(r"[\d\s]+\s*zł/m")):
        text = text_elem.strip()
        match = re.search(r"([\d\s,]+)\s*zł/m", text)
        if match:
            prop.price_per_sqm = match.group(1).replace(" ", "").replace("\xa0", "")
            break
    # Extract structured data from tables/lists
    table_data = _extract_table_data(soup)

    # Map Polish labels to Property fields
    field_mappings = {
        # Property details
        "powierzchnia": "living_area",
        "pow. całkowita": "living_area",
        "pokoje": "rooms",
        "liczba pokoi": "rooms",
        "piętro": "floor",
        "liczba pięter": "total_floors",
        "wysokość wnętrza": "interior_height",
        # Characteristics
        "stan nieruchomości": "condition",
        "stan mieszkania": "condition",
        "rynek": "market_type",
        "forma własności": "ownership",
        "dostępne od": "available_from",
        "rodzaj umowy": "contract_type",
        "depozyt za wynajem": "deposit",
        "depozyt": "deposit",
        # Kitchen/bathroom
        "typ kuchni": "kitchen_type",
        "łazienka razem z wc": "bathroom_with_wc",
        "balkon": "balcony",
        "stolarka okienna": "windows",
        # Building
        "typ budynku": "building_type",
        "rok budowy": "year_built",
        "ogrzewanie": "heating",
        # Listing info
        "data dodania": "date_added",
        "aktualizacja": "date_updated",
        "numer ogłoszenia": "listing_id",
        "liczba odsłon": "views",
    }
    for polish_key, field_name in field_mappings.items():
        for table_key, value in table_data.items():
            if polish_key in table_key:
                # Only set each field once, so earlier mapping keys win
                current_val = getattr(prop, field_name)
                if not current_val:
                    setattr(prop, field_name, value)
                break
    # Extract equipment, amenities, and media sections
    prop.equipment = _extract_section_items(soup, "Wyposażenie")
    prop.amenities = _extract_section_items(soup, "Udogodnienia")
    prop.media = _extract_section_items(soup, "Media")

    # Extract location from the breadcrumb trail
    breadcrumb = soup.find(class_=re.compile(r"breadcrumb", re.IGNORECASE))
    if breadcrumb:
        crumbs = breadcrumb.find_all("a")
        location_parts = [
            part for part in (_clean_text(c.get_text()) for c in crumbs) if part
        ]
        if len(location_parts) >= 2:
            prop.city = location_parts[-1]
            prop.voivodeship = location_parts[1]

    # Try to extract the street address from a dedicated element
    address_elem = soup.find(class_=re.compile(r"address|location", re.IGNORECASE))
    if address_elem:
        prop.address = _clean_text(address_elem.get_text())

    # Extract agent info
    agent_section = soup.find(class_=re.compile(r"agent|contact|advertiser", re.IGNORECASE))
    if agent_section:
        # Company name
        company = agent_section.find(class_=re.compile(r"company|agency", re.IGNORECASE))
        if company:
            prop.agent_company = _clean_text(company.get_text())
        # Agent name
        name = agent_section.find(class_=re.compile(r"name|person", re.IGNORECASE))
        if name:
            prop.advertiser_name = _clean_text(name.get_text())
    # Extract the description: look for the main "Opis" ("Description")
    # section on the Morizon detail page
    desc_elem = None
    for header in soup.find_all(["h2", "h3", "h4"], string=re.compile(r"^Opis", re.IGNORECASE)):
        parent = header.parent
        if parent:
            # Take the first following sibling with substantial text that
            # does not look like navigation or listing links
            for sibling in parent.find_next_siblings():
                text = sibling.get_text(strip=True)
                if len(text) > 50 and not re.search(r"ogłosze|mieszkania|sprzedaż|wynajem", text, re.IGNORECASE):
                    desc_elem = sibling
                    break
        if desc_elem:
            break

    # Fallback: look for a div with description-like content
    if not desc_elem:
        for div in soup.find_all("div"):
            text = div.get_text(strip=True)
            # Descriptions have substantial text without navigation/menu content
            if 100 < len(text) < 3000:
                # Skip navigation and listing sections
                if not re.search(r"(mieszkania do wynajęcia|domy na sprzedaż|Województwa|Miasta|Blog|Regulaminy)", text, re.IGNORECASE):
                    # Require a paragraph-like structure
                    if div.find_all("p"):
                        desc_elem = div
                        break

    if desc_elem:
        desc_text = desc_elem.get_text(separator=" ", strip=True)
        if 50 < len(desc_text) < 2000:
            prop.description = _clean_text(desc_text)

    return prop
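

if __name__ == "__main__":
    # Minimal smoke test on invented markup, not real Morizon HTML; in a
    # real run the scraper feeds fetched pages into these parsers.
    sample_search = (
        "<h1>Mieszkania do wynajęcia - 11 870 ogłoszeń</h1>"
        '<a href="/oferta/wynajem-mieszkanie-warszawa-mzn123456">Oferta</a>'
    )
    print(parse_total_count(sample_search))    # 11870
    print(parse_property_urls(sample_search))  # one absolute URL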