utils.py
"""Utility functions for parsing Morizon.pl HTML."""
import re
import logging
from typing import Optional
from bs4 import BeautifulSoup
from models import Property
from config import MORIZON_BASE_URL
logger = logging.getLogger(__name__)


def _clean_text(text: Optional[str]) -> Optional[str]:
    """Clean and normalize text."""
    if not text:
        return None
    # Collapse runs of whitespace into single spaces
    cleaned = " ".join(text.split())
    return cleaned if cleaned else None


def _extract_number(text: Optional[str]) -> Optional[str]:
    """Extract a numeric value from text."""
    if not text:
        return None
    # Match digits optionally grouped by spaces, with an optional decimal
    # part, e.g. "1 250,50"
    match = re.search(r"\d[\d\s]*(?:[,.]\d+)?", text)
    if match:
        return match.group().replace(" ", "").replace("\xa0", "")
    return None
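
# Behaviour sketch for _extract_number (inputs are made-up examples, not
# captured Morizon text):
#
#     _extract_number("Powierzchnia: 52,5 m²")  # -> "52,5"
#     _extract_number("Czynsz: 1 250 zł")       # -> "1250"
#     _extract_number("brak danych")            # -> None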


def parse_total_count(html: str) -> int:
    """Parse the total number of listings from a search results page.

    Args:
        html: HTML content of the search results page

    Returns:
        Total number of listings, or 0 if it could not be parsed
    """
    soup = BeautifulSoup(html, "lxml")
    # Look for text like "11 870 ogłoszeń" or
    # "Mieszkania do wynajęcia - 11 870 ogłoszeń" ("ogłoszeń" means "listings").
    # Try the title/heading area first
    for text_elem in soup.find_all(string=re.compile(r"[\d\s]+ ogłosze")):
        text = text_elem.strip()
        match = re.search(r"([\d\s]+)\s*ogłosze", text)
        if match:
            count_str = match.group(1).replace(" ", "").replace("\xa0", "")
            try:
                return int(count_str)
            except ValueError:
                continue
    # Fall back to the meta description
    meta_desc = soup.find("meta", {"name": "description"})
    if meta_desc and meta_desc.get("content"):
        match = re.search(r"([\d\s]+)\s*ogłosze", meta_desc["content"])
        if match:
            count_str = match.group(1).replace(" ", "").replace("\xa0", "")
            try:
                return int(count_str)
            except ValueError:
                pass
    logger.warning("Could not parse total count from page")
    return 0
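
# Usage sketch for parse_total_count (invented snippet, not real Morizon
# markup):
#
#     parse_total_count("<h1>Mieszkania do wynajęcia - 11 870 ogłoszeń</h1>")
#     # -> 11870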


def parse_property_urls(html: str) -> list[str]:
    """Parse property detail-page URLs from a search results page.

    Args:
        html: HTML content of the search results page

    Returns:
        List of absolute property URLs, deduplicated, in page order
    """
    soup = BeautifulSoup(html, "lxml")
    urls = []
    # Find all links to property detail pages
    # Pattern: /oferta/wynajem-mieszkanie-... or /oferta/sprzedaz-mieszkanie-...
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if "/oferta/" in href and href.count("/") >= 2:
            # Normalize to an absolute URL
            if href.startswith("/"):
                full_url = MORIZON_BASE_URL + href
            elif href.startswith("http"):
                full_url = href
            else:
                continue
            # Skip duplicates and non-property links (detail URLs carry an
            # "mzn..." listing ID)
            if full_url not in urls and "mzn" in full_url.lower():
                urls.append(full_url)
    return urls
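
# Usage sketch for parse_property_urls (the href below is invented; real
# listing slugs differ):
#
#     html = '<a href="/oferta/wynajem-mieszkanie-warszawa-mzn123456">Oferta</a>'
#     parse_property_urls(html)
#     # -> [MORIZON_BASE_URL + "/oferta/wynajem-mieszkanie-warszawa-mzn123456"]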


def _find_value_after_label(soup: BeautifulSoup, label: str) -> Optional[str]:
    """Find the value text that appears after a label.

    Args:
        soup: BeautifulSoup object
        label: Label text to search for

    Returns:
        Value text or None
    """
    # Find elements containing the label
    for elem in soup.find_all(string=re.compile(label, re.IGNORECASE)):
        parent = elem.parent
        if parent:
            # Check the label's next sibling
            next_elem = parent.find_next_sibling()
            if next_elem:
                text = next_elem.get_text(strip=True)
                if text:
                    return _clean_text(text)
            # Check the parent's next sibling
            parent_next = parent.parent.find_next_sibling() if parent.parent else None
            if parent_next:
                text = parent_next.get_text(strip=True)
                if text:
                    return _clean_text(text)
    return None


def _extract_table_data(soup: BeautifulSoup) -> dict:
    """Extract key-value pairs from tables and definition lists.

    Args:
        soup: BeautifulSoup object

    Returns:
        Dictionary of lower-cased keys to value strings
    """
    data = {}
    # Table rows (tr with th/td pairs)
    for tr in soup.find_all("tr"):
        th = tr.find("th")
        td = tr.find("td")
        if th and td:
            key = _clean_text(th.get_text())
            value = _clean_text(td.get_text())
            if key and value:
                data[key.lower()] = value
    # Definition lists (dl with dt/dd pairs)
    for dl in soup.find_all("dl"):
        dts = dl.find_all("dt")
        dds = dl.find_all("dd")
        for dt, dd in zip(dts, dds):
            key = _clean_text(dt.get_text())
            value = _clean_text(dd.get_text())
            if key and value:
                data[key.lower()] = value
    # Labeled divs/spans with adjacent values
    for label in soup.find_all(class_=re.compile(r"label|key|name", re.IGNORECASE)):
        key = _clean_text(label.get_text())
        if key:
            # Take the value from the label's next sibling, if any
            value_elem = label.find_next_sibling()
            if value_elem:
                value = _clean_text(value_elem.get_text())
                if value:
                    data[key.lower()] = value
    return data
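
# Usage sketch for _extract_table_data (invented markup mirroring the
# shapes handled above):
#
#     soup = BeautifulSoup(
#         "<table><tr><th>Pokoje</th><td>3</td></tr></table>"
#         "<dl><dt>Piętro</dt><dd>2</dd></dl>",
#         "lxml",
#     )
#     _extract_table_data(soup)  # -> {"pokoje": "3", "piętro": "2"}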


def _extract_section_items(soup: BeautifulSoup, section_name: str) -> Optional[str]:
    """Extract list items from a named section.

    Args:
        soup: BeautifulSoup object
        section_name: Name of the section to find

    Returns:
        Comma-separated items or None
    """
    # Find the section header
    for header in soup.find_all(string=re.compile(section_name, re.IGNORECASE)):
        parent = header.parent
        if parent:
            # Look for list items in the header's container
            container = parent.parent if parent.parent else parent
            items = []
            for li in container.find_all("li"):
                item_text = _clean_text(li.get_text())
                if item_text:
                    items.append(item_text)
            if items:
                return ", ".join(items)
    return None
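
# Usage sketch for _extract_section_items (invented markup; "Wyposażenie"
# means "equipment"):
#
#     soup = BeautifulSoup(
#         "<div><h3>Wyposażenie</h3><ul><li>lodówka</li><li>pralka</li></ul></div>",
#         "lxml",
#     )
#     _extract_section_items(soup, "Wyposażenie")  # -> "lodówka, pralka"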


def parse_property_details(html: str, url: str) -> Property:
    """Parse property details from a detail page.

    Args:
        html: HTML content of the property detail page
        url: Property URL

    Returns:
        Property object with parsed data
    """
    soup = BeautifulSoup(html, "lxml")
    prop = Property(url=url)

    # Extract the listing ID from the URL
    match = re.search(r"(mzn\d+)", url.lower())
    if match:
        prop.listing_id = match.group(1)

    # Extract the title
    title_elem = soup.find("h1")
    if title_elem:
        prop.title = _clean_text(title_elem.get_text())
    # Extract the price
    price_patterns = [
        re.compile(r"([\d\s]+)\s*zł", re.IGNORECASE),
        re.compile(r"([\d\s]+)\s*PLN", re.IGNORECASE),
    ]
    for text_elem in soup.find_all(string=re.compile(r"[\d\s]+\s*(zł|PLN)")):
        text = text_elem.strip()
        for pattern in price_patterns:
            match = pattern.search(text)
            if match:
                price_str = match.group(1).replace(" ", "").replace("\xa0", "")
                prop.price = price_str
                break
        if prop.price:
            break

    # Extract the price per square metre
    for text_elem in soup.find_all(string=re.compile(r"[\d\s]+\s*zł/m")):
        text = text_elem.strip()
        match = re.search(r"([\d\s,]+)\s*zł/m", text)
        if match:
            prop.price_per_sqm = match.group(1).replace(" ", "").replace("\xa0", "")
            break
    # Extract structured data from tables/lists
    table_data = _extract_table_data(soup)

    # Map Polish labels to Property fields
    field_mappings = {
        # Property details
        "powierzchnia": "living_area",
        "pow. całkowita": "living_area",
        "pokoje": "rooms",
        "liczba pokoi": "rooms",
        "piętro": "floor",
        "liczba pięter": "total_floors",
        "wysokość wnętrza": "interior_height",
        # Characteristics
        "stan nieruchomości": "condition",
        "stan mieszkania": "condition",
        "rynek": "market_type",
        "forma własności": "ownership",
        "dostępne od": "available_from",
        "rodzaj umowy": "contract_type",
        "depozyt za wynajem": "deposit",
        "depozyt": "deposit",
        # Kitchen/bathroom
        "typ kuchni": "kitchen_type",
        "łazienka razem z wc": "bathroom_with_wc",
        "balkon": "balcony",
        "stolarka okienna": "windows",
        # Building
        "typ budynku": "building_type",
        "rok budowy": "year_built",
        "ogrzewanie": "heating",
        # Listing info
        "data dodania": "date_added",
        "aktualizacja": "date_updated",
        "numer ogłoszenia": "listing_id",
        "liczba odsłon": "views",
    }
    for polish_key, field_name in field_mappings.items():
        for table_key, value in table_data.items():
            if polish_key in table_key:
                # Only set each field once, so earlier mapping keys win
                current_val = getattr(prop, field_name)
                if not current_val:
                    setattr(prop, field_name, value)
                break
    # Extract equipment, amenities, and media sections
    prop.equipment = _extract_section_items(soup, "Wyposażenie")
    prop.amenities = _extract_section_items(soup, "Udogodnienia")
    prop.media = _extract_section_items(soup, "Media")

    # Extract location from the breadcrumb trail
    breadcrumb = soup.find(class_=re.compile(r"breadcrumb", re.IGNORECASE))
    if breadcrumb:
        crumbs = breadcrumb.find_all("a")
        location_parts = [
            part for part in (_clean_text(c.get_text()) for c in crumbs) if part
        ]
        if len(location_parts) >= 2:
            prop.city = location_parts[-1]
            prop.voivodeship = location_parts[1]

    # Try to extract the street address from a dedicated element
    address_elem = soup.find(class_=re.compile(r"address|location", re.IGNORECASE))
    if address_elem:
        prop.address = _clean_text(address_elem.get_text())

    # Extract agent info
    agent_section = soup.find(class_=re.compile(r"agent|contact|advertiser", re.IGNORECASE))
    if agent_section:
        # Company name
        company = agent_section.find(class_=re.compile(r"company|agency", re.IGNORECASE))
        if company:
            prop.agent_company = _clean_text(company.get_text())
        # Agent name
        name = agent_section.find(class_=re.compile(r"name|person", re.IGNORECASE))
        if name:
            prop.advertiser_name = _clean_text(name.get_text())
    # Extract the description: look for the main "Opis" ("Description")
    # section on the Morizon detail page
    desc_elem = None
    for header in soup.find_all(["h2", "h3", "h4"], string=re.compile(r"^Opis", re.IGNORECASE)):
        parent = header.parent
        if parent:
            # Take the first following sibling with substantial text that
            # does not look like navigation or listing links
            for sibling in parent.find_next_siblings():
                text = sibling.get_text(strip=True)
                if len(text) > 50 and not re.search(r"ogłosze|mieszkania|sprzedaż|wynajem", text, re.IGNORECASE):
                    desc_elem = sibling
                    break
        if desc_elem:
            break

    # Fallback: look for a div with description-like content
    if not desc_elem:
        for div in soup.find_all("div"):
            text = div.get_text(strip=True)
            # Descriptions have substantial text without navigation/menu content
            if 100 < len(text) < 3000:
                # Skip navigation and listing sections
                if not re.search(r"(mieszkania do wynajęcia|domy na sprzedaż|Województwa|Miasta|Blog|Regulaminy)", text, re.IGNORECASE):
                    # Require a paragraph-like structure
                    if div.find_all("p"):
                        desc_elem = div
                        break

    if desc_elem:
        desc_text = desc_elem.get_text(separator=" ", strip=True)
        if 50 < len(desc_text) < 2000:
            prop.description = _clean_text(desc_text)

    return prop
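

if __name__ == "__main__":
    # Minimal smoke test on invented markup, not real Morizon HTML; in a
    # real run the scraper feeds fetched pages into these parsers.
    sample_search = (
        "<h1>Mieszkania do wynajęcia - 11 870 ogłoszeń</h1>"
        '<a href="/oferta/wynajem-mieszkanie-warszawa-mzn123456">Oferta</a>'
    )
    print(parse_total_count(sample_search))    # 11870
    print(parse_property_urls(sample_search))  # one absolute URL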