-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodels.py
More file actions
99 lines (78 loc) · 2.82 KB
/
models.py
File metadata and controls
99 lines (78 loc) · 2.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""Data models for Blocket scraper."""
from dataclasses import dataclass, field, asdict
from typing import List, Optional
from datetime import datetime
import json
import csv
import os
@dataclass
class Listing:
    """A single advertisement scraped from Blocket.

    The first five fields are required; the remaining metadata defaults
    to empty / falsy values so partially-parsed listings can still be
    represented.
    """

    title: str
    price: str
    location: str
    listing_url: str
    category: str
    time_posted: str = ""
    can_ship: bool = False
    buy_now: bool = False
    image_url: str = ""
    # ISO-8601 timestamp captured at instance-creation time (local clock).
    scraped_at: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict:
        """Return a plain-dict snapshot of every field (via dataclasses.asdict)."""
        return asdict(self)
@dataclass
class ListingCollection:
    """Holds scraped Listing objects and knows how to export them.

    Supports len(), incremental appends, URL-based de-duplication, and
    CSV/JSON serialization (both exports are no-ops for an empty
    collection).
    """

    listings: List[Listing] = field(default_factory=list)

    def add_listing(self, listing: Listing) -> None:
        """Append a single listing."""
        self.listings.append(listing)

    def add_listings(self, listings: List[Listing]) -> None:
        """Append every listing from *listings*."""
        self.listings.extend(listings)

    def deduplicate(self) -> int:
        """Drop listings whose URL was already seen; return the drop count.

        The first occurrence of each URL wins, and the original relative
        order of the survivors is preserved.
        """
        before = len(self.listings)
        by_url: dict = {}
        for item in self.listings:
            # setdefault keeps the first listing stored under each URL.
            by_url.setdefault(item.listing_url, item)
        self.listings = list(by_url.values())
        return before - len(self.listings)

    def to_csv(self, filepath: str) -> None:
        """Write all listings to *filepath* as UTF-8 CSV.

        Creates the parent directory when needed; does nothing when the
        collection is empty.
        """
        if not self.listings:
            return
        # dirname("") is falsy, so a bare filename maps to the cwd.
        parent = os.path.dirname(filepath) or "."
        os.makedirs(parent, exist_ok=True)
        columns = [
            "title",
            "price",
            "location",
            "listing_url",
            "category",
            "time_posted",
            "can_ship",
            "buy_now",
            "image_url",
            "scraped_at",
        ]
        with open(filepath, "w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=columns)
            writer.writeheader()
            writer.writerows(item.to_dict() for item in self.listings)

    def to_json(self, filepath: str) -> None:
        """Write all listings to *filepath* as pretty-printed UTF-8 JSON.

        Creates the parent directory when needed; does nothing when the
        collection is empty.
        """
        if not self.listings:
            return
        parent = os.path.dirname(filepath) or "."
        os.makedirs(parent, exist_ok=True)
        rows = [item.to_dict() for item in self.listings]
        with open(filepath, "w", encoding="utf-8") as handle:
            json.dump(rows, handle, indent=2, ensure_ascii=False)

    def __len__(self) -> int:
        """Number of listings currently held."""
        return len(self.listings)