-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpost_urls_scraper.py
More file actions
78 lines (62 loc) · 3.19 KB
/
post_urls_scraper.py
File metadata and controls
78 lines (62 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
class PostsLink:
    """
    A class to collect Instagram post URLs from a user or hashtag page by scrolling.

    URLs accumulate in ``clean_links`` for the lifetime of the instance; the set is
    never cleared, so repeated collections on the same instance add to it.
    """

    def __init__(self, driver):
        """
        Initialize the object with the provided web driver.

        Parameters:
        - driver: Selenium WebDriver instance for browser interaction.
        """
        self.driver = driver  # Selenium WebDriver instance.
        self.clean_links = set()  # Set to store unique post URLs.
        self.scroll_attempts = 0  # Consecutive passes that found no new links.

    def collection_post_url(self, *, timeout=20, max_idle_scrolls=5,
                            scroll_steps=4, scroll_pixels=500, scroll_pause=2):
        """
        Collects Instagram post URLs by scrolling the page.

        The method waits for the posts container, then alternates between reading
        the ``href`` of every anchor inside it and scrolling the window down. It
        stops once more than ``max_idle_scrolls`` consecutive passes add nothing
        new, or as soon as any error occurs while harvesting or scrolling.
        Errors are reported with ``print`` and never raised to the caller.

        Parameters:
        - timeout: Seconds to wait for the posts container to be present.
        - max_idle_scrolls: Number of consecutive passes without new links that
          is tolerated; collection stops when the count exceeds this value.
        - scroll_steps: Number of small scrolls performed between two passes.
        - scroll_pixels: Vertical distance, in pixels, of each small scroll.
        - scroll_pause: Seconds to sleep after each small scroll.

        Returns:
        - A set of unique URLs representing the posts collected from the page
          (the instance's ``clean_links`` set itself, not a copy).
        """
        # Start every collection with a fresh idle counter; otherwise a second
        # call would inherit the exhausted counter left by the previous one.
        self.scroll_attempts = 0

        try:
            # Wait for the main posts container to load.
            div_posts = WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, "//section/main/div/div[2]/div"))
            )
        except Exception as e:
            # Best-effort API: report the problem and return what we have.
            print(f"An error occurred during initialization: {e}")
            return self.clean_links

        print("Starting to collect post URLs...")
        print("\n----------------------------------------------------------------------------------\n")

        while True:
            try:
                new_links = self._harvest_links(div_posts)
                print(f"Number of links collected so far: {len(self.clean_links)}")

                if new_links:
                    # Progress was made, so the idle streak is over.
                    self.scroll_attempts = 0
                else:
                    self.scroll_attempts += 1
                    print(f"Scroll attempts without new links: {self.scroll_attempts}")

                # Check the exit condition before scrolling so the final pass
                # does not spend time on a scroll whose result is never read.
                if self.scroll_attempts > max_idle_scrolls:
                    print("No more new links loaded. Exiting...")
                    break

                self._scroll_down(scroll_steps, scroll_pixels, scroll_pause)
            except Exception as e:
                # Any failure while harvesting or scrolling ends the collection
                # but keeps the links gathered so far.
                print(f"An error occurred during scrolling: {e}")
                break

        return self.clean_links

    def _harvest_links(self, container):
        """Add every non-empty anchor href under *container* to ``clean_links``; return how many were new."""
        count_before = len(self.clean_links)
        for anchor in container.find_elements(By.CSS_SELECTOR, "div a"):
            href = anchor.get_attribute('href')
            # Anchors without an href yield None; keep only real URLs.
            if href:
                self.clean_links.add(href)
        return len(self.clean_links) - count_before

    def _scroll_down(self, steps, pixels, pause):
        """Scroll the window down *steps* times by *pixels*, sleeping *pause* seconds after each scroll."""
        for _ in range(steps):
            self.driver.execute_script(f"window.scrollBy(0, {pixels});")
            time.sleep(pause)