-
Notifications
You must be signed in to change notification settings - Fork 44
Expand file tree
/
Copy pathdo_scraping.py
More file actions
80 lines (59 loc) · 1.94 KB
/
do_scraping.py
File metadata and controls
80 lines (59 loc) · 1.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import json
import sys
import time
import xlsxwriter
from configparser import ConfigParser
from Scraper import Scraper
# Loading of configurations
from utils import ComplexEncoder
config = ConfigParser()
config.read('config.ini')
# Setting the execution mode: pass HEADLESS as the first CLI argument to
# run the browser without a visible window.
headless_option = len(sys.argv) >= 2 and sys.argv[1].upper() == 'HEADLESS'
# Loading of input data (LinkedIn Urls), one URL per line.
# A context manager guarantees the handle is closed even on error — the
# original bare `for entry in open(...)` leaked the file object.
# Blank lines are skipped so they don't become empty "URLs".
with open(config.get('profiles_data', 'input_file_name'), "r") as input_file:
    profiles_urls = [line.strip() for line in input_file if line.strip()]
if not profiles_urls:
    print("Please provide an input.")
    sys.exit(1)  # non-zero exit status: missing input is an error condition
# Launch the scraper thread with credentials and targets from config.ini,
# then wait for it to finish before collecting the results.
scraper = Scraper(
    linkedin_username=config.get('linkedin', 'username'),
    linkedin_password=config.get('linkedin', 'password'),
    profiles_urls=profiles_urls,
    headless=headless_option
)
scraper.start()
scraper.join()  # block until all profiles have been scraped
scraping_results = scraper.results
# Generation of XLS file with profiles data
def _timestamped_name(file_name, ts):
    """Return *file_name* with ``_<ts>`` inserted before its extension.

    Handles names with several dots (``report.v2.xlsx`` ->
    ``report.v2_<ts>.xlsx``) and names without any extension
    (``report`` -> ``report_<ts>``). The original ``"".join(split[0:-1])``
    approach silently dropped interior dots and produced a broken name
    (``._<ts>.name``) when no extension was present.
    """
    stem, dot, ext = file_name.rpartition('.')
    if not dot:  # no extension at all: just append the timestamp
        return f"{file_name}_{ts}"
    return f"{stem}_{ts}.{ext}"

output_file_name = config.get('profiles_data', 'output_file_name')
if config.get('profiles_data', 'append_timestamp').upper() == 'Y':
    output_file_name = _timestamped_name(output_file_name, int(time.time()))
workbook = xlsxwriter.Workbook(output_file_name)
# try/finally guarantees the workbook is flushed and finalized even when a
# write raises — without close() xlsxwriter produces an unreadable file.
try:
    worksheet = workbook.add_worksheet()
    # Headers (row 0)
    headers = ['Name', 'Email', 'Skills', 'Jobs']
    for col, header in enumerate(headers):
        worksheet.write(0, col, header)
    # Content: one row per scraped profile, starting below the headers.
    for row, scraping_result in enumerate(scraping_results, start=1):
        if scraping_result.is_error():
            data = ['Error'] * len(headers)
        else:
            p = scraping_result.profile
            data = [
                p.name,
                p.email,
                ",".join(p.skills)
            ]
            # Each job gets its own extra column, serialized as JSON.
            for job in p.jobs:
                data.append(json.dumps(job.reprJSON(), cls=ComplexEncoder))
        for col, value in enumerate(data):
            worksheet.write(row, col, value)
finally:
    workbook.close()
print("Scraping Ended")