-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodels.py
More file actions
99 lines (78 loc) · 2.82 KB
/
models.py
File metadata and controls
99 lines (78 loc) · 2.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""Data models for Blocket scraper."""
from dataclasses import dataclass, field, asdict
from typing import List, Optional
from datetime import datetime
import json
import csv
import os
@dataclass
class Listing:
    """A single advertisement scraped from Blocket.

    The first five fields are required; the remaining metadata defaults
    to empty / falsy values so partially-parsed listings can still be
    represented.
    """

    title: str
    price: str
    location: str
    listing_url: str
    category: str
    time_posted: str = ""
    can_ship: bool = False
    buy_now: bool = False
    image_url: str = ""
    # ISO-8601 timestamp captured at instance-creation time (local clock).
    scraped_at: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict:
        """Return a plain-dict snapshot of every field (via dataclasses.asdict)."""
        return asdict(self)
@dataclass
class ListingCollection:
    """Holds scraped Listing objects and knows how to export them.

    Supports len(), incremental appends, URL-based de-duplication, and
    CSV/JSON serialization (both exports are no-ops for an empty
    collection).
    """

    listings: List[Listing] = field(default_factory=list)

    def add_listing(self, listing: Listing) -> None:
        """Append a single listing."""
        self.listings.append(listing)

    def add_listings(self, listings: List[Listing]) -> None:
        """Append every listing from *listings*."""
        self.listings.extend(listings)

    def deduplicate(self) -> int:
        """Drop listings whose URL was already seen; return the drop count.

        The first occurrence of each URL wins, and the original relative
        order of the survivors is preserved.
        """
        before = len(self.listings)
        by_url: dict = {}
        for item in self.listings:
            # setdefault keeps the first listing stored under each URL.
            by_url.setdefault(item.listing_url, item)
        self.listings = list(by_url.values())
        return before - len(self.listings)

    def to_csv(self, filepath: str) -> None:
        """Write all listings to *filepath* as UTF-8 CSV.

        Creates the parent directory when needed; does nothing when the
        collection is empty.
        """
        if not self.listings:
            return
        # dirname("") is falsy, so a bare filename maps to the cwd.
        parent = os.path.dirname(filepath) or "."
        os.makedirs(parent, exist_ok=True)
        columns = [
            "title",
            "price",
            "location",
            "listing_url",
            "category",
            "time_posted",
            "can_ship",
            "buy_now",
            "image_url",
            "scraped_at",
        ]
        with open(filepath, "w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=columns)
            writer.writeheader()
            writer.writerows(item.to_dict() for item in self.listings)

    def to_json(self, filepath: str) -> None:
        """Write all listings to *filepath* as pretty-printed UTF-8 JSON.

        Creates the parent directory when needed; does nothing when the
        collection is empty.
        """
        if not self.listings:
            return
        parent = os.path.dirname(filepath) or "."
        os.makedirs(parent, exist_ok=True)
        rows = [item.to_dict() for item in self.listings]
        with open(filepath, "w", encoding="utf-8") as handle:
            json.dump(rows, handle, indent=2, ensure_ascii=False)

    def __len__(self) -> int:
        """Number of listings currently held."""
        return len(self.listings)