33import time
44import hashlib
55import datetime
6- import requests
76import json
87import pandas as pd
98from playwright .sync_api import sync_playwright
@@ -54,6 +53,9 @@ def process_image(image_url, filename):
5453 except Exception :
5554 pass
5655
56+ # --- CONFIGURATION ---
57+ # No longer using API constants, back to Browser automation for reliability
58+
5759def scrape ():
5860 # 1. START TIMER
5961 start_time = time .time ()
@@ -72,47 +74,60 @@ def scrape():
7274 print ("Launching browser..." )
7375 browser = p .chromium .launch (headless = True )
7476
75- # specific context setup (mobile view/location/locale)
77+ # specific context setup
7678 context = browser .new_context (
7779 viewport = {'width' : 1920 , 'height' : 1080 },
7880 permissions = ['geolocation' ],
7981 geolocation = {'latitude' : 23.8103 , 'longitude' : 90.4125 },
8082 locale = 'en-US'
8183 )
8284
83- # total_cats = len(URLS) # Moved up
84-
8585 for index , entry in enumerate (URLS ):
8686 print (f"[{ index + 1 } /{ total_cats } ] Scraping: { entry ['category' ]} ..." )
8787
8888 try :
8989 page = context .new_page ()
9090 page .goto (entry ['url' ], timeout = 60000 )
9191
92- # Wait for products to load
93- try :
94- page .wait_for_selector ('.product' , timeout = 10000 )
95- except :
96- print (f" > Warning: No items found for { entry ['category' ]} (Timeout)" )
92+ # Wait for any product container to load (Multi-selector wait)
93+ container_selectors = ['.productV2Catalog' , '.product' , '.productsContent > div' , '.product-pane div' ]
94+ found_container = None
95+ for selector in container_selectors :
96+ try :
97+ page .wait_for_selector (selector , timeout = 8000 )
98+ found_container = selector
99+ break
100+ except :
101+ continue
102+
103+ if not found_container :
104+ print (f" > Warning: No product containers found for { entry ['category' ]} (Final URL: { page .url } )" )
97105 page .close ()
98106 continue
99107
100- # Scroll down to load lazy-loaded items
101- for i in range (15 ):
108+ # Scroll down with more breathing room for infinite scroll
109+ for i in range (16 ):
102110 page .keyboard .press ("PageDown" )
103- time .sleep (0.5 )
111+ time .sleep (0.8 ) # Increased wait for loading
112+
113+ # Final settle wait
114+ time .sleep (1.5 )
104115
105- products = page .query_selector_all ('.product' )
116+ products = page .query_selector_all (found_container )
106117 count_for_page = 0
107118
108119 for product in products :
109120 try :
110- # Skip cart/summary items
111- class_attr = product .get_attribute ("class" )
112- if "total" in class_attr or "shoppingCart" in class_attr : continue
113-
114- name_el = product .query_selector ('.name' )
115- price_el = product .query_selector ('.price' )
121+ # 1. NAME SELECTORS
122+ name_el = product .query_selector ('.nameTextWithEllipsis' ) or \
123+ product .query_selector ('.pvName p' ) or \
124+ product .query_selector ('.name' )
125+
126+ # 2. PRICE SELECTORS
127+ # Note: Some use .productV2discountedPrice, some use .price
128+ price_el = product .query_selector ('.productV2discountedPrice span' ) or \
129+ product .query_selector ('.price span' ) or \
130+ product .query_selector ('.price' )
116131
117132 if not name_el or not price_el : continue
118133
@@ -122,35 +137,41 @@ def scrape():
122137 if not price_text : continue
123138 price = float (price_text )
124139
125- unit_el = product .query_selector ('.subText' )
140+ # 3. UNIT SELECTORS
141+ # Note: Case sensitivity matters in CSS (.subText vs .subtext)
142+ unit_el = product .query_selector ('.subText span' ) or \
143+ product .query_selector ('.subtext span' ) or \
144+ product .query_selector ('.subText' ) or \
145+ product .query_selector ('.subtext' ) or \
146+ product .query_selector ('.sub-text' )
147+
126148 unit = unit_el .inner_text ().strip () if unit_el else "N/A"
127149
128- # ---------------------------------------------------------
129- # ### FIX: COMPOSITE NAME LOGIC
130- # We append the unit to the name. This ensures that "Oil 1L"
131- # and "Oil 5L" are treated as completely different products
132- # by the database, the image hasher, and the frontend.
133- # ---------------------------------------------------------
134- if unit and unit != "N/A" :
135- name = f"{ name } { unit } "
136- # ---------------------------------------------------------
137-
138- # Image processing (Now uses the unique name with unit for hashing)
139- img_el = product .query_selector ('img' )
150+ # Composition logic
151+ display_name = name
152+ if unit and unit != "N/A" and unit .lower () not in name .lower ():
153+ display_name = f"{ name } { unit } "
154+
155+ # 4. IMAGE SELECTORS
156+ img_el = product .query_selector ('.imageWrapperWrapper img' ) or \
157+ product .query_selector ('.imageWrapper img' ) or \
158+ product .query_selector ('img' )
159+
140160 img_url = img_el .get_attribute ('src' ) if img_el else None
141161
142- img_filename = get_image_filename (name )
162+ img_filename = get_image_filename (display_name )
143163 if img_url :
144164 process_image (img_url , img_filename )
145165
146166 scraped_data .append ({
147167 "date" : today ,
148- "name" : name , # This now contains the unit (e.g. "Soybean Oil 5L")
168+ "name" : display_name ,
149169 "price" : price ,
150170 "unit" : unit ,
151171 "category" : entry ['category' ],
152172 "image" : img_filename
153173 })
174+ print (f" + { display_name } : ৳{ price } " )
154175 count_for_page += 1
155176 except Exception :
156177 continue
0 commit comments