Skip to content

Commit f0837bc

Browse files
committed
updated scraper to match new frontend
1 parent 4a13e10 commit f0837bc

4 files changed

Lines changed: 103 additions & 43 deletions

File tree

public/data/meta.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

scraper/categories.json

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -742,13 +742,5 @@
742742
{
743743
"category": "Bird & Other Pet Food",
744744
"url": "https://chaldal.com/bird-food"
745-
},
746-
{
747-
"category": "Toys & Sports",
748-
"url": "https://chaldal.com/toys-sports"
749-
},
750-
{
751-
"category": "Breakfast",
752-
"url": "https://chaldal.com/eggs-2"
753745
}
754746
]

scraper/check_data.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import pandas as pd
2+
import json
3+
import os
4+
5+
# Paths
6+
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
7+
PRICES_DIR = os.path.join(BASE_DIR, "public", "data", "prices")
8+
META_FILE = os.path.join(BASE_DIR, "public", "data", "meta.json")
9+
10+
def check_data():
11+
print("--- DATA INSPECTION TOOL ---")
12+
13+
# 1. Check Parquet Database
14+
print("\n[1] Checking Parquet Database...")
15+
found_any = False
16+
for root, dirs, files in os.walk(PRICES_DIR):
17+
for file in files:
18+
if file.endswith(".parquet"):
19+
path = os.path.join(root, file)
20+
df = pd.read_parquet(path)
21+
print(f"File: {os.path.relpath(path, BASE_DIR)}")
22+
print(f" > Total Records: {len(df)}")
23+
print(f" > Columns: {list(df.columns)}")
24+
print(f" > Date Range: {df['date'].min()} to {df['date'].max()}")
25+
print(f" > Unique Products: {df['name'].nunique()}")
26+
27+
print("\nSample Data (First 5 items):")
28+
print(df[['name', 'category', 'price']].head())
29+
found_any = True
30+
31+
if not found_any:
32+
print(" > No Parquet files found!")
33+
34+
# 2. Check Meta JSON (Search Index)
35+
print("\n[2] Checking Search Index (meta.json)...")
36+
if os.path.exists(META_FILE):
37+
with open(META_FILE, 'r', encoding='utf-8') as f:
38+
meta = json.load(f)
39+
print(f" > Total Items in Index: {len(meta)}")
40+
if meta:
41+
print(" > First entry sample:")
42+
print(json.dumps(meta[0], indent=2))
43+
else:
44+
print(" > meta.json not found!")
45+
46+
if __name__ == "__main__":
47+
check_data()

scraper/main.py

Lines changed: 55 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import time
44
import hashlib
55
import datetime
6-
import requests
76
import json
87
import pandas as pd
98
from playwright.sync_api import sync_playwright
@@ -54,6 +53,9 @@ def process_image(image_url, filename):
5453
except Exception:
5554
pass
5655

56+
# --- CONFIGURATION ---
57+
# No longer using API constants, back to Browser automation for reliability
58+
5759
def scrape():
5860
# 1. START TIMER
5961
start_time = time.time()
@@ -72,47 +74,60 @@ def scrape():
7274
print("Launching browser...")
7375
browser = p.chromium.launch(headless=True)
7476

75-
# specific context setup (mobile view/location/locale)
77+
# specific context setup
7678
context = browser.new_context(
7779
viewport={'width': 1920, 'height': 1080},
7880
permissions=['geolocation'],
7981
geolocation={'latitude': 23.8103, 'longitude': 90.4125},
8082
locale='en-US'
8183
)
8284

83-
# total_cats = len(URLS) # Moved up
84-
8585
for index, entry in enumerate(URLS):
8686
print(f"[{index+1}/{total_cats}] Scraping: {entry['category']}...")
8787

8888
try:
8989
page = context.new_page()
9090
page.goto(entry['url'], timeout=60000)
9191

92-
# Wait for products to load
93-
try:
94-
page.wait_for_selector('.product', timeout=10000)
95-
except:
96-
print(f" > Warning: No items found for {entry['category']} (Timeout)")
92+
# Wait for any product container to load (Multi-selector wait)
93+
container_selectors = ['.productV2Catalog', '.product', '.productsContent > div', '.product-pane div']
94+
found_container = None
95+
for selector in container_selectors:
96+
try:
97+
page.wait_for_selector(selector, timeout=8000)
98+
found_container = selector
99+
break
100+
except:
101+
continue
102+
103+
if not found_container:
104+
print(f" > Warning: No product containers found for {entry['category']} (Final URL: {page.url})")
97105
page.close()
98106
continue
99107

100-
# Scroll down to load lazy-loaded items
101-
for i in range(15):
108+
# Scroll down with more breathing room for infinite scroll
109+
for i in range(16):
102110
page.keyboard.press("PageDown")
103-
time.sleep(0.5)
111+
time.sleep(0.8) # Increased wait for loading
112+
113+
# Final settle wait
114+
time.sleep(1.5)
104115

105-
products = page.query_selector_all('.product')
116+
products = page.query_selector_all(found_container)
106117
count_for_page = 0
107118

108119
for product in products:
109120
try:
110-
# Skip cart/summary items
111-
class_attr = product.get_attribute("class")
112-
if "total" in class_attr or "shoppingCart" in class_attr: continue
113-
114-
name_el = product.query_selector('.name')
115-
price_el = product.query_selector('.price')
121+
# 1. NAME SELECTORS
122+
name_el = product.query_selector('.nameTextWithEllipsis') or \
123+
product.query_selector('.pvName p') or \
124+
product.query_selector('.name')
125+
126+
# 2. PRICE SELECTORS
127+
# Note: Some use .productV2discountedPrice, some use .price
128+
price_el = product.query_selector('.productV2discountedPrice span') or \
129+
product.query_selector('.price span') or \
130+
product.query_selector('.price')
116131

117132
if not name_el or not price_el: continue
118133

@@ -122,35 +137,41 @@ def scrape():
122137
if not price_text: continue
123138
price = float(price_text)
124139

125-
unit_el = product.query_selector('.subText')
140+
# 3. UNIT SELECTORS
141+
# Note: Case sensitivity matters in CSS (.subText vs .subtext)
142+
unit_el = product.query_selector('.subText span') or \
143+
product.query_selector('.subtext span') or \
144+
product.query_selector('.subText') or \
145+
product.query_selector('.subtext') or \
146+
product.query_selector('.sub-text')
147+
126148
unit = unit_el.inner_text().strip() if unit_el else "N/A"
127149

128-
# ---------------------------------------------------------
129-
# ### FIX: COMPOSITE NAME LOGIC
130-
# We append the unit to the name. This ensures that "Oil 1L"
131-
# and "Oil 5L" are treated as completely different products
132-
# by the database, the image hasher, and the frontend.
133-
# ---------------------------------------------------------
134-
if unit and unit != "N/A":
135-
name = f"{name} {unit}"
136-
# ---------------------------------------------------------
137-
138-
# Image processing (Now uses the unique name with unit for hashing)
139-
img_el = product.query_selector('img')
150+
# Composition logic
151+
display_name = name
152+
if unit and unit != "N/A" and unit.lower() not in name.lower():
153+
display_name = f"{name} {unit}"
154+
155+
# 4. IMAGE SELECTORS
156+
img_el = product.query_selector('.imageWrapperWrapper img') or \
157+
product.query_selector('.imageWrapper img') or \
158+
product.query_selector('img')
159+
140160
img_url = img_el.get_attribute('src') if img_el else None
141161

142-
img_filename = get_image_filename(name)
162+
img_filename = get_image_filename(display_name)
143163
if img_url:
144164
process_image(img_url, img_filename)
145165

146166
scraped_data.append({
147167
"date": today,
148-
"name": name, # This now contains the unit (e.g. "Soybean Oil 5L")
168+
"name": display_name,
149169
"price": price,
150170
"unit": unit,
151171
"category": entry['category'],
152172
"image": img_filename
153173
})
174+
print(f" + {display_name}: ৳{price}")
154175
count_for_page += 1
155176
except Exception:
156177
continue

0 commit comments

Comments
 (0)