
Commit 19a3be1

Add multithreading in image downloader; Support subcategories (#16)
* Rebase master
* small adjustments
1 parent 5f9e6f9 commit 19a3be1

3 files changed

Lines changed: 177 additions & 81 deletions

File tree

.gitignore
scripts/image_downloader.py
scripts/product_filtered2csv.py

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -143,3 +143,7 @@ fabric.properties
 .idea/codestream.xml
 
 # End of https://www.toptal.com/developers/gitignore/api/macos,pycharm
+
+dataset/
+.vscode
+env

scripts/image_downloader.py

Lines changed: 171 additions & 79 deletions
@@ -1,37 +1,54 @@
 """
-This is a script that takes data from the CSVs, download and stores the images.
-It assumes the csv has the following columns: name, thumbnail link, date, img link, subcategory id, subcategory
-
-The final dataset is stored in the final_filename{i} files according to CIFAR semantics:
-
-The archive contains the files dataset1, dataset2, ... Each of these files is a Python "pickled" object produced with Pickle.
-Here is a python3 routine which will open such a file and return a dictionary:
-
-def unpickle(file):
-    import pickle
-    with open(file, 'rb') as fo:
-        dict = pickle.load(fo, encoding='bytes')
-    return dict
-
-Loaded in this way, each of the batch files contains a dictionary with the following elements:
-data -- a 10000x7500 numpy array of uint8s. Each row of the array stores a 80x80 colour image. The first 2500 entries contain the red channel values, the next 2500 the green, and the final 2500 the blue.
-The image is stored in row-major order, so that the first 50 entries of the array are the red channel values of the first row of the image.
-labels -- a list of 10000 numbers in the range 0-20. The number at index i indicates the label of the ith image in the array data.
+This is a script that takes data from various CSV files, then downloads and stores the images.
+It assumes the csv has the following columns: name, thumbnail link, date, img link, subcategory id, subcategory.
+
+The final dataset can be stored according to two different conventions:
+one that is more pytorch friendly: saves the images as individual files inside a folder and the labels in a csv;
+the other is more numpy friendly: the images are stored as a numpy array inside a file "pickled" from a dictionary that also holds the labels.
+
+Details of the numpy version:
+The final dataset is stored in the final_filename{i} files according to CIFAR semantics:
+The archive contains the files dataset1, dataset2, ... Each of these files is a Python "pickled" object produced with Pickle.
+Here is a python3 routine which will open such a file and return a dictionary:
+
+def unpickle(file):
+    import pickle
+    with open(file, 'rb') as fo:
+        dict = pickle.load(fo, encoding='bytes')
+    return dict
+
+Loaded in this way, each of the batch files contains a dictionary with the following elements:
+data -- a 10000x3072 numpy array of uint8s. Each row of the array stores a 32x32 colour image.
+The first 1024 entries contain the red channel values, the next 1024 the green, and the final 1024 the blue.
+The image is stored in row-major order, so that the first 32 entries of the array are the red channel values of the first row of the image.
+labels -- a list of 10000 numbers in the range 1-21. The number at index i indicates the label of the ith image in the array data.
+sublabels -- a list of 10000 numbers in the range 1-?. The number at index i indicates the sublabel (detailed category) of the ith image in the array data.
+
+Details of the pytorch version:
+The images will all be stored inside the folder 'images' (created if non-existent) with their original name (assumed to be unique).
+The labels will be stored inside a csv file dataset_labels.csv where the first column has the image name, the second its label, the third its sublabel.
 
 Dependencies: Pillow, Pickle, numpy, requests
 """
 
 from os import listdir
 import csv
 from enum import Enum
+from typing import Any, Callable
 import requests
 import numpy as np
 import pickle
 import io
 import PIL.Image as Image
-import threading
+import PIL.ImageOps as ImageOps
+from threading import RLock
+from concurrent.futures import ThreadPoolExecutor
 from time import sleep
-
+import argparse
+from os import mkdir
+from os.path import exists
+from dataclasses import dataclass
 
 class Tags(Enum):
     NAME = 0
@@ -41,86 +58,161 @@ class Tags(Enum):
     SUB_ID = 4
     SUB = 5
 
-
-def download_file_images(file, data, labels, lock: threading.RLock):
+class STORE_FORMAT(Enum):
+    NUMPY_FRIENDLY = 0
+    PYTORCH_FRIENDLY = 1
+
+@dataclass
+class DownloaderConfig():
+    store_format: STORE_FORMAT = STORE_FORMAT.PYTORCH_FRIENDLY  # the preferred store format
+    dataset_folder: str = "./dataset/"  # the folder in which to find the csv files and in which to store the dataset
+    date_accepted: int = 2010  # date from which to save a product
+    final_filename: str = "dataset"  # the base name for the dataset that will be stored
+    percentage2download: int = 10  # percentage of the dataset to download (not guaranteed)
+    samples4file: int = 10000  # save data every this many samples
+    labels_writer: Any = None  # used in PYTORCH_FRIENDLY; it is a csv writer for the labels file
+    final_dataset_path: Callable = None  # used in NUMPY_FRIENDLY to generate the name of the next dataset file
+
+GET_CATEGORY = lambda x: x.split("/")[-1].split("_")[0]  # get the category from the filename
+collected_samples = 0  # the number of samples correctly stored
+
+def store_data(config, data, labels, sublabels):
+    """Store the given data, based on the config object
+
+    Args:
+        config (DownloaderConfig): an object with all the useful configs
+        data (list): the images stored as bytes
+        labels (list): the label of each image, corresponding to the category of the product
+        sublabels (list): the sublabel of each image, corresponding to the subcategory
+    """
+    if config.store_format == STORE_FORMAT.NUMPY_FRIENDLY:
+        # store as a numpy array, flattened to the documented CIFAR layout:
+        # all red values first, then green, then blue, each channel row-major
+        images2write = np.array([np.array(ImageOps.pad(Image.open(io.BytesIO(x)).convert('RGB'), (32, 32))) for x, _ in data])
+        images2write = images2write.transpose(0, 3, 1, 2).reshape(len(data), -1)
+
+        with open(config.final_dataset_path(), "wb") as data_file:
+            dict2write = {"data": images2write, "labels": np.array(labels), "sublabels": np.array(sublabels)}
+            pickle.dump(dict2write, data_file, protocol=pickle.HIGHEST_PROTOCOL)
+        data.clear()
+        labels.clear()
+        sublabels.clear()
+    else:
+        for (image, image_name), label, sublabel in zip(data, labels, sublabels):
+            with open(config.dataset_folder + "images/" + image_name, "wb") as file:
+                file.write(image)
+            config.labels_writer.writerow([image_name, label, sublabel])
+
+
+def download_file_images(file, data, labels, sublabels, config, categories, lock):
     global collected_samples
-    csv_filename = DATASET_DIR + file
-    with open(csv_filename, 'r', newline='') as csvfile:
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
+    }  # needed, otherwise the request hangs
+    # todo: is there a workaround??
+
+    with open(config.dataset_folder + file, 'r', newline='') as csvfile:
         reader = csv.reader(csvfile, delimiter=',')
         # read each line
        for index, row in enumerate(reader):
             # take one in ten
-            if index % (100 / percentage2download) == 0:
+            if index % (100 / config.percentage2download) == 0:
                 date = row[Tags.DATE.value]
                 # check date
-                if not date or int(date.split("-")[0]) > date_accepted:
+                if not date or int(date.split("-")[0]) > config.date_accepted:
                     thumb_url = row[Tags.THUMBNAIL.value]
                     try:
                         answer = requests.get(thumb_url, headers=headers)  # download thumbnail
                         if answer.status_code <= 400:
                             image = answer.content
-                            image = np.array(Image.open(io.BytesIO(image)))
-                            # image.save(thumb_url.split("/")[-1])  # was used to make sure the images are stored correctly
                             with lock:
-                                data.append(image)
+                                data.append((image, thumb_url.split("/")[-1]))
                                 labels.append(categories[GET_CATEGORY(file)])
+                                sublabels.append(row[Tags.SUB_ID.value])
                                 collected_samples += 1
 
                                 # check if collected enough samples
-                                if collected_samples % samples4file == 0:
-                                    with open(FINAL_DATASET_PATH(), "wb") as data_file:
-                                        dict2write = {"data": np.array(data, dtype=object), "labels": np.array(labels, dtype=object)}
-                                        pickle.dump(dict2write, data_file, protocol=pickle.HIGHEST_PROTOCOL)
-                                    data.clear()
-                                    labels.clear()
-                                if collected_samples % 100 == 0:
-                                    print(f'Collected {collected_samples} samples')
+                                if collected_samples % config.samples4file == 0:
+                                    store_data(config, data, labels, sublabels)  # todo: could be improved, as the operation can be done without the lock
+
+                                if collected_samples % 100 == 0:
+                                    print(f'Collected {collected_samples} samples')
                         else:
                             print(f"Received a non-success code {answer.status_code} when crawling:")
                             print(thumb_url)
                             sleep(10)
-                    except Exception as e:
-                        print(f'Caught the following exception: {e} when crawling:')
+                    except Exception as e:  # todo: bad, as it catches all exceptions
+                        print('Caught the following exception when crawling: ', e)
                         print(thumb_url)
 
 
-GET_CATEGORY = lambda x: x.split("_")[0]  # get the category from the filename
-DATASET_DIR = "./dataset/"
-
-all_files = listdir(DATASET_DIR)
-all_files.sort()
-csv_files = []
-categories = {}
-
-# IMPORTANT PARAMETERS
-percentage2download = 10  # 10%
-samples4file = 10000  # how many samples to store per file
-final_filename = "dataset"
-date_accepted = 2010  # date from which to collect samples
-
-collected_samples = 0
-FINAL_DATASET_PATH = lambda: f'{DATASET_DIR}{final_filename}{int(np.ceil(collected_samples / samples4file))}'  # get dataset path
-
-for file in all_files:
-    if not file.endswith(".csv"):
-        continue
-    csv_files.append(file)
-    if not GET_CATEGORY(file) in categories:
-        categories[GET_CATEGORY(file)] = len(categories)
-
-data = []
-labels = []
-headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}  # needed, otherwise the request hangs
-# todo: is there a workaround??
-
-workers = []
-
-# files are in the dataset folder
-for file in reversed(csv_files):
-    worker = threading.Thread(target=download_file_images, args=(file, data, labels, threading.RLock(),))
-    worker.start()
-    workers.append(worker)
-
-for thread in workers:
-    thread.join()
+def multithread_image_download(config, max_threads):
+    all_files = listdir(config.dataset_folder)
+    all_files.sort()
+    csv_files = []
+    categories = {}
+
+    for file in all_files:
+        if not file.endswith(".csv"):
+            continue
+        csv_files.append(file)
+        if not GET_CATEGORY(file) in categories:
+            categories[GET_CATEGORY(file)] = len(categories)
+
+    data = []
+    labels = []
+    sublabels = []
+
+    common_lock = RLock()
+
+    labels_csv_file = None
+    if config.store_format == STORE_FORMAT.PYTORCH_FRIENDLY:
+        if not exists(config.dataset_folder + "images"):
+            mkdir(config.dataset_folder + "images")
+        labels_csv_file = open(f'{config.dataset_folder}{config.final_filename}_labels.csv', "w", newline='')
+        config.labels_writer = csv.writer(labels_csv_file, delimiter=',')
+
+    # files are in the dataset folder
+    with ThreadPoolExecutor(max_workers=max_threads) as executor:
+        for file in reversed(csv_files):
+            executor.submit(download_file_images, file, data, labels, sublabels, config, categories, common_lock)
+
+    store_data(config, data, labels, sublabels)
+    if labels_csv_file:
+        labels_csv_file.close()
+
+if __name__ == "__main__":
+    # key parameters
+    downloader_config = DownloaderConfig()
+
+    # accept key parameters as args
+    parser = argparse.ArgumentParser(description='Downloads the images from the given csv files and stores them in the given format')
+    parser.add_argument('--format', help='the format in which to save the images, either "numpy" or "pytorch". Default is "pytorch"')
+    parser.add_argument('--folder', help='the folder in which to find the csv files, default is "./dataset/"')
+    parser.add_argument('--threads', help="the max number of threads running at the same time, default: uncapped")
+    parser.add_argument('--dataset-percentage', help="the percentage of the dataset to download, default is 10")
+    args = vars(parser.parse_args())
+
+    # set config values based on the parsed arguments
+    if args['format'] == "numpy":
+        downloader_config.store_format = STORE_FORMAT.NUMPY_FRIENDLY
+    elif args['format'] and args['format'] != "pytorch":
+        print('The format you provided is not valid.')
+        parser.print_help()
+        exit()
+
+    if args['folder']:
+        downloader_config.dataset_folder = args['folder']
+        if downloader_config.dataset_folder[-1] != "/":
+            downloader_config.dataset_folder += "/"
+    max_threads = None
+    if args['threads']:
+        max_threads = int(args['threads'])
+
+    if args['dataset_percentage']:
+        downloader_config.percentage2download = int(args['dataset_percentage'])
+
+    # define the lambda used in NUMPY_FRIENDLY to name the dataset files
+    downloader_config.final_dataset_path = lambda: f'{downloader_config.dataset_folder}{downloader_config.final_filename}' + \
+        f'{int(np.ceil(collected_samples / downloader_config.samples4file))}'
+
+    multithread_image_download(downloader_config, max_threads)
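For reference, a typical invocation of the new entry point might look like this (a sketch; the flag values are illustrative, run from the repository root):

    python scripts/image_downloader.py --format pytorch --folder ./dataset/ --threads 8 --dataset-percentage 10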

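The CIFAR-style contract in the docstring can be exercised with a short loader. A minimal sketch, assuming a first batch file ./dataset/dataset1 (a hypothetical path) was produced in the numpy-friendly format and each row follows the documented layout (1024 red, then 1024 green, then 1024 blue values, row-major):

    import pickle
    import numpy as np

    def unpickle(file):
        with open(file, 'rb') as fo:
            return pickle.load(fo, encoding='bytes')

    batch = unpickle('./dataset/dataset1')      # hypothetical path of the first stored batch
    rows = batch['data']                        # shape (N, 3072), dtype uint8
    # undo the flattening: (N, 3, 32, 32) channel-first, then to (N, 32, 32, 3) HWC images
    images = rows.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
    labels, sublabels = batch['labels'], batch['sublabels']
    print(images.shape, labels[0], sublabels[0])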
scripts/product_filtered2csv.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 """"
-This is a script to gather useful information from each product and save it in a csv file.
-The info collected are:
+This is a script to gather useful information from each product and save it in a csv file.
+The info collected are:
 category (from the file), name (title), thumbnail_url (thumbnail), date (release_date), img_url (big_images -> big_image), sub_category_id (catid), subcategory (categories -> category)
 The script also excludes 'bad' product recognized by having:
 <link>https://cdon.se/</link>
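The downloader above indexes each row by its Tags enum, so the CSVs written here must keep the column order name, thumbnail link, date, img link, subcategory id, subcategory, and the category is derived from the filename prefix before the first underscore. A minimal sketch of appending one such row (the filename and all values are hypothetical):

    import csv

    # columns: name, thumbnail link, date, img link, subcategory id, subcategory
    row = ["Example product", "https://example.com/thumb.jpg", "2015-06-01",
           "https://example.com/full.jpg", "42", "Board games"]

    with open("./dataset/toys_products.csv", "a", newline="") as f:
        csv.writer(f, delimiter=",").writerow(row)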
