-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGDscraper_3.py
More file actions
138 lines (100 loc) · 3.77 KB
/
GDscraper_3.py
File metadata and controls
138 lines (100 loc) · 3.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# coding: utf-8
# In[1]:
from splinter import Browser
from bs4 import BeautifulSoup
from random import randint
import datetime
import os
import json
import time
import re
# --- Search configuration and browser bootstrap -------------------------
# Opens a visible Chrome window via splinter, runs a Glassdoor job search
# for `keyword` in `location`, and narrows results to `time_period`.
# NOTE(review): selectors/IDs below target Glassdoor's markup at the time
# of writing — confirm they still match the live site before running.

# Root URL of the site being scraped.
root_url = 'https://www.glassdoor.com'
# Search parameters: `time_period` must exactly match the visible text of
# an option in the "Date Posted" dropdown (it is clicked by text below).
time_period = "Last Day"
keyword = 'Data Scientist'
location = 'United States'
# Root directory under which per-day output folders are created by the
# scrape loop further down.
corpus_path = 'corpus_master/'
# Path to the chromedriver binary, passed through to splinter as a
# keyword argument. headless=False keeps the window visible.
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
# Visit the landing page; fixed sleeps throughout give the page time to
# load (no explicit waits are used).
browser.visit(root_url)
time.sleep(2)
# Fill the hero search form (keyword + location) and submit it.
browser.find_by_xpath("//input[@id='KeywordSearch']").fill(keyword)
browser.find_by_xpath("//input[@id='LocationSearch']").fill(location)
browser.find_by_id('HeroSearchButton').click()
time.sleep(2)
# Open the "Date Posted" dropdown and pick the configured time period;
# the randomized sleep makes the traffic look less bot-like.
browser.find_by_text("Date Posted").click()
time.sleep(1)
browser.find_by_text(time_period).click()
time.sleep(randint(1,3))
# In[ ]:
# --- Main scrape loop ---------------------------------------------------
# Walks every page of search results. For each job listing it clicks the
# listing, extracts title / company / location / rating / description from
# the detail pane, and writes one JSON file per job into
# corpus_master/<dd-mm-yyyy>/. Stops when no "next page" button remains.
n = 1  # page counter (informational only)
while True:
    # Best-effort dismissal of the sign-up modal that Glassdoor overlays
    # on results; if it is absent, find/click raises and we just log it.
    try:
        x_button = browser.find_by_css('div.xBtn')
        if x_button:
            x_button.click()
    except Exception as e:
        print(e)
    test_url = browser.url
    # Per-day output directory. Recomputing it each page is harmless
    # thanks to the existence check (and keeps long runs dated correctly).
    date = datetime.datetime.strftime(datetime.datetime.now(), '%d-%m-%Y')
    path = corpus_path + date + '/'
    if not os.path.exists(path):
        os.makedirs(path)
    # One <li class="jl"> per job listing on the current results page.
    elements = browser.find_by_xpath("//li[@class='jl']")
    for element in elements:
        # The modal can reappear between clicks; dismiss it again.
        try:
            x_button = browser.find_by_css('div.xBtn')
            if x_button:
                x_button.click()
        except Exception as e:
            print(e)
        try:
            element.click()
            time.sleep(2)
            job_title = str(browser.find_by_xpath("//h1[@class='jobTitle h2 strong']").html)
            company = str(browser.find_by_xpath("//a[@class='plain strong empDetailsLink']").html)
            print(company)
            # Reset per-listing fields BEFORE parsing. Previously these
            # were only assigned inside the span-count branches, so a
            # listing with zero spans silently inherited the previous
            # listing's city/state and rating in its saved JSON.
            city_state = 'NA'
            rating = 'NA'
            company_info = browser.find_by_xpath("//div[@class='compInfo']")
            spans = company_info.find_by_tag('span')
            if len(spans) == 2:
                # Two spans: rating first (keep only "x.y"), city/state second.
                rating = str(spans[0].html)[0:3]
                city_state = str(spans[1].html)
            elif len(spans) == 1:
                # Single span: only the city/state is shown, no rating.
                city_state = str(spans.html)
            print(city_state)
            print(rating)
            job_description = str(browser.find_by_xpath("//div[@class='jobDescriptionContent desc']").html)
            job = {'job title': job_title,
                   'company': company,
                   'city state': city_state,
                   'rating': rating,
                   'job description': job_description}
            try:
                # Build a filesystem-safe name: strip spaces, and map "/"
                # to "-" in BOTH parts (a slash in the company name used
                # to make open() fail with a bogus path).
                filename = (company.replace(" ", "").replace("/", "-")
                            + '-'
                            + job_title.replace(" ", "").replace("/", "-")
                            + '.json')
                with open(path + filename, 'w') as f:
                    f.write(json.dumps(job))
                print('Saved to file : ' + filename)
            except Exception as e:
                print(e)
            time.sleep(2)
        except Exception as e:
            # Any scrape failure for this listing is logged and skipped;
            # the loop carries on with the next listing.
            print(e)
    # Advance to the next results page, or stop when there is none.
    next_button = browser.find_by_css('li.next')
    if next_button:
        next_button.click()
        # Randomized delay between pages to look less bot-like.
        time.sleep(randint(1, 10))
    else:
        break
    n += 1