Skip to content

Commit f0837bc

Browse files
committed
updated scraper to match new frontend
1 parent 4a13e10 commit f0837bc

4 files changed

Lines changed: 103 additions & 43 deletions

File tree

public/data/meta.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

scraper/categories.json

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -742,13 +742,5 @@
742742
{
743743
"category": "Bird & Other Pet Food",
744744
"url": "https://chaldal.com/bird-food"
745-
},
746-
{
747-
"category": "Toys & Sports",
748-
"url": "https://chaldal.com/toys-sports"
749-
},
750-
{
751-
"category": "Breakfast",
752-
"url": "https://chaldal.com/eggs-2"
753745
}
754746
]

scraper/check_data.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import pandas as pd
2+
import json
3+
import os
4+
5+
# Paths
6+
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
7+
PRICES_DIR = os.path.join(BASE_DIR, "public", "data", "prices")
8+
META_FILE = os.path.join(BASE_DIR, "public", "data", "meta.json")
9+
10+
def check_data():
11+
print("--- DATA INSPECTION TOOL ---")
12+
13+
# 1. Check Parquet Database
14+
print("\n[1] Checking Parquet Database...")
15+
found_any = False
16+
for root, dirs, files in os.walk(PRICES_DIR):
17+
for file in files:
18+
if file.endswith(".parquet"):
19+
path = os.path.join(root, file)
20+
df = pd.read_parquet(path)
21+
print(f"File: {os.path.relpath(path, BASE_DIR)}")
22+
print(f" > Total Records: {len(df)}")
23+
print(f" > Columns: {list(df.columns)}")
24+
print(f" > Date Range: {df['date'].min()} to {df['date'].max()}")
25+
print(f" > Unique Products: {df['name'].nunique()}")
26+
27+
print("\nSample Data (First 5 items):")
28+
print(df[['name', 'category', 'price']].head())
29+
found_any = True
30+
31+
if not found_any:
32+
print(" > No Parquet files found!")
33+
34+
# 2. Check Meta JSON (Search Index)
35+
print("\n[2] Checking Search Index (meta.json)...")
36+
if os.path.exists(META_FILE):
37+
with open(META_FILE, 'r', encoding='utf-8') as f:
38+
meta = json.load(f)
39+
print(f" > Total Items in Index: {len(meta)}")
40+
if meta:
41+
print(" > First entry sample:")
42+
print(json.dumps(meta[0], indent=2))
43+
else:
44+
print(" > meta.json not found!")
45+
46+
if __name__ == "__main__":
47+
check_data()

scraper/main.py

Lines changed: 55 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import time
44
import hashlib
55
import datetime
6-
import requests
76
import json
87
import pandas as pd
98
from playwright.sync_api import sync_playwright
@@ -54,6 +53,9 @@ def process_image(image_url, filename):
5453
except Exception:
5554
pass
5655

56+
# --- CONFIGURATION ---
57+
# No longer using API constants, back to Browser automation for reliability
58+
5759
def scrape():
5860
# 1. START TIMER
5961
start_time = time.time()
@@ -72,47 +74,60 @@ def scrape():
7274
print("Launching browser...")
7375
browser = p.chromium.launch(headless=True)
7476

75-
# specific context setup (mobile view/location/locale)
77+
# specific context setup
7678
context = browser.new_context(
7779
viewport={'width': 1920, 'height': 1080},
7880
permissions=['geolocation'],
7981
geolocation={'latitude': 23.8103, 'longitude': 90.4125},
8082
locale='en-US'
8183
)
8284

83-
# total_cats = len(URLS) # Moved up
84-
8585
for index, entry in enumerate(URLS):
8686
print(f"[{index+1}/{total_cats}] Scraping: {entry['category']}...")
8787

8888
try:
8989
page = context.new_page()
9090
page.goto(entry['url'], timeout=60000)
9191

92-
# Wait for products to load
93-
try:
94-
page.wait_for_selector('.product', timeout=10000)
95-
except:
96-
print(f" > Warning: No items found for {entry['category']} (Timeout)")
92+
# Wait for any product container to load (Multi-selector wait)
93+
container_selectors = ['.productV2Catalog', '.product', '.productsContent > div', '.product-pane div']
94+
found_container = None
95+
for selector in container_selectors:
96+
try:
97+
page.wait_for_selector(selector, timeout=8000)
98+
found_container = selector
99+
break
100+
except:
101+
continue
102+
103+
if not found_container:
104+
print(f" > Warning: No product containers found for {entry['category']} (Final URL: {page.url})")
97105
page.close()
98106
continue
99107

100-
# Scroll down to load lazy-loaded items
101-
for i in range(15):
108+
# Scroll down with more breathing room for infinite scroll
109+
for i in range(16):
102110
page.keyboard.press("PageDown")
103-
time.sleep(0.5)
111+
time.sleep(0.8) # Increased wait for loading
112+
113+
# Final settle wait
114+
time.sleep(1.5)
104115

105-
products = page.query_selector_all('.product')
116+
products = page.query_selector_all(found_container)
106117
count_for_page = 0
107118

108119
for product in products:
109120
try:
110-
# Skip cart/summary items
111-
class_attr = product.get_attribute("class")
112-
if "total" in class_attr or "shoppingCart" in class_attr: continue
113-
114-
name_el = product.query_selector('.name')
115-
price_el = product.query_selector('.price')
121+
# 1. NAME SELECTORS
122+
name_el = product.query_selector('.nameTextWithEllipsis') or \
123+
product.query_selector('.pvName p') or \
124+
product.query_selector('.name')
125+
126+
# 2. PRICE SELECTORS
127+
# Note: Some use .productV2discountedPrice, some use .price
128+
price_el = product.query_selector('.productV2discountedPrice span') or \
129+
product.query_selector('.price span') or \
130+
product.query_selector('.price')
116131

117132
if not name_el or not price_el: continue
118133

@@ -122,35 +137,41 @@ def scrape():
122137
if not price_text: continue
123138
price = float(price_text)
124139

125-
unit_el = product.query_selector('.subText')
140+
# 3. UNIT SELECTORS
141+
# Note: Case sensitivity matters in CSS (.subText vs .subtext)
142+
unit_el = product.query_selector('.subText span') or \
143+
product.query_selector('.subtext span') or \
144+
product.query_selector('.subText') or \
145+
product.query_selector('.subtext') or \
146+
product.query_selector('.sub-text')
147+
126148
unit = unit_el.inner_text().strip() if unit_el else "N/A"
127149

128-
# ---------------------------------------------------------
129-
# ### FIX: COMPOSITE NAME LOGIC
130-
# We append the unit to the name. This ensures that "Oil 1L"
131-
# and "Oil 5L" are treated as completely different products
132-
# by the database, the image hasher, and the frontend.
133-
# ---------------------------------------------------------
134-
if unit and unit != "N/A":
135-
name = f"{name} {unit}"
136-
# ---------------------------------------------------------
137-
138-
# Image processing (Now uses the unique name with unit for hashing)
139-
img_el = product.query_selector('img')
150+
# Composition logic
151+
display_name = name
152+
if unit and unit != "N/A" and unit.lower() not in name.lower():
153+
display_name = f"{name} {unit}"
154+
155+
# 4. IMAGE SELECTORS
156+
img_el = product.query_selector('.imageWrapperWrapper img') or \
157+
product.query_selector('.imageWrapper img') or \
158+
product.query_selector('img')
159+
140160
img_url = img_el.get_attribute('src') if img_el else None
141161

142-
img_filename = get_image_filename(name)
162+
img_filename = get_image_filename(display_name)
143163
if img_url:
144164
process_image(img_url, img_filename)
145165

146166
scraped_data.append({
147167
"date": today,
148-
"name": name, # This now contains the unit (e.g. "Soybean Oil 5L")
168+
"name": display_name,
149169
"price": price,
150170
"unit": unit,
151171
"category": entry['category'],
152172
"image": img_filename
153173
})
174+
print(f" + {display_name}: ৳{price}")
154175
count_for_page += 1
155176
except Exception:
156177
continue

0 commit comments

Comments
 (0)