Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 45 additions & 23 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 12 11:02:06 2020

@author: OHyic

Google Image Scraper
A script to download images from Google Images based on search terms.
"""
#Import libraries

# Import required libraries
import os
import concurrent.futures
import chromedriver_autoinstaller
from GoogleImageScraper import GoogleImageScraper
from patch import webdriver_executable


def worker_thread(search_key):
    """
    Scrape and save images for a single search term.

    Intended to run inside a thread pool: one thread per search term.
    Reads the scraper configuration (webdriver_path, image_path,
    number_of_images, headless, min_resolution, max_resolution,
    max_missed, keep_filenames) from module-level globals set in the
    __main__ block.

    Args:
        search_key (str): The search term to scrape images for.
    """
    # Build a scraper for this one search term.
    # NOTE(review): the middle arguments were collapsed in the diff view;
    # this argument order follows the surrounding configuration names —
    # confirm against GoogleImageScraper.__init__.
    image_scraper = GoogleImageScraper(
        webdriver_path,
        image_path,
        search_key,
        number_of_images,
        headless,
        min_resolution,
        max_resolution,
        max_missed)

    # Collect the image URLs, then download them to image_path
    image_urls = image_scraper.find_image_urls()
    image_scraper.save_images(image_urls, keep_filenames)

    # Release the scraper (and its underlying browser resources)
    del image_scraper

if __name__ == "__main__":
    # Auto-install a chromedriver matching the locally installed Chrome
    # version and use its path for the scraper's webdriver.
    webdriver_path = chromedriver_autoinstaller.install()

    # Create the output directory for downloaded images if it doesn't exist
    image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))
    os.makedirs(image_path, exist_ok=True)

    # Collect search terms from user input, dropping empty entries
    user_input = input("Enter search terms separated by commas: ")
    search_keys = [term.strip() for term in user_input.split(',') if term.strip()]

    # Exit early if no valid search terms were provided
    if not search_keys:
        print("No valid search terms provided. Exiting...")
        exit()

    print(f"Will search for: {', '.join(search_keys)}")

    # Configuration parameters — read by worker_thread as module globals
    number_of_images = 10                           # Images to download per search term
    headless = False                                # Run Chrome without a visible window
    min_resolution = (0, 0)                         # Minimum accepted (width, height)
    max_resolution = (9999, 9999)                   # Maximum accepted (width, height)
    max_missed = 10                                 # Consecutive failures before giving up
    number_of_workers = min(len(search_keys), 4)    # Parallel threads (capped at 4)
    keep_filenames = False                          # Keep original filenames from URLs

    # Run each search term in its own thread; the context manager
    # blocks until every worker has finished.
    with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_workers) as executor:
        executor.map(worker_thread, search_keys)
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
selenium==3.141.0
requests==2.25.1
pillow==9.0.1
chromedriver_autoinstaller