# FlaskWebScraper/hinScraperCSV.py (AndrewPalet/FlaskWebScraper, master branch)

import logging
import requests
import pandas as pd
from bs4 import BeautifulSoup
import queue
import html5lib
import json
import re
import time
import os

from datetime import date

# Parameters
# Passed by scrape_csv to requests.get as the URL query parameters of every
# detail-page request. Both values are empty strings here.
# NOTE(review): presumably meant to be filled in with real credentials before
# running -- confirm, and avoid committing them to source control.
login = {'login': '', 'PWDpassword': ''}

"""
scrape_csv receives the path of a csv file and loads it into a dataframe. It then
scrapes the details of every HIN listed in the file from a website and returns a
dataframe of the results AND the number of HINs processed.

:param str fileLocation: location of the csv file needing to be processed

"""
def scrape_csv(fileLocation):
    """Scrape the details of every HIN listed in a CSV file.

    Reads the ``HIN_NUM`` column of the CSV at ``fileLocation``, requests the
    detail page of each distinct, non-blank HIN, extracts the customer fields
    from the page, then OVERWRITES the CSV at ``fileLocation`` with the scraped
    results.

    A HIN whose page cannot be fetched or parsed still gets a row, with every
    field except ``HIN NUMBER`` left as an empty string.

    :param str fileLocation: location of the csv file needing to be processed
    :return: ``(dataframe, count)`` -- the scraped results and the number of
        distinct HINs processed
    :rtype: tuple
    :raises KeyError: if the CSV has no ``HIN_NUM`` column
    """
    columns = ['HIN NUMBER', 'CUSTOMER NAME', 'ADDRESS LINE 1',
               'ADDRESS LINE 2', 'CITY', 'STATE', 'ZIP CODE', 'COUNTRY',
               'MARKET SEGMENT', 'STATUS', 'PHONE NUMBER']
    base_url = 'https://www.hibcchin.org/hin_detail.asp?hin='
    # Per-socket-operation timeout (seconds) so one dead connection cannot
    # hang the whole run.
    request_timeout = 30

    df = pd.read_csv(fileLocation)
    # De-duplicate while keeping the file's order (a set would make the output
    # row order nondeterministic) and skip blank cells, which are not HINs.
    hin_list = df['HIN_NUM'].dropna().drop_duplicates().tolist()

    # Rows are collected in a list and turned into a dataframe once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame each call.
    rows = []
    for h in hin_list:
        hin = str(h)
        details = None
        try:
            # NOTE(review): verify=False disables TLS certificate verification;
            # kept as in the original, but confirm it is really required.
            page = requests.get(base_url + hin, params=login, stream=True,
                                verify=False, timeout=request_timeout)
        except requests.RequestException as e:
            # No page for this HIN: fall through and record a blank row rather
            # than parsing an undefined or stale response.
            logging.error("** Connection Error **")
            logging.error(str(e))
        else:
            try:
                details = _parse_hin_page(page.content)
            except (IndexError, AttributeError, TypeError, ValueError) as e:
                # The page does not have the expected structure.
                logging.warning("Could not parse detail page for HIN %s: %s",
                                hin, e)

        row = dict.fromkeys(columns, '')
        row['HIN NUMBER'] = hin
        if details is not None:
            row.update(details)
        rows.append(row)

    dfCSV = pd.DataFrame(rows, columns=columns)

    # Save the processed information back over the input csv file
    dfCSV.to_csv(fileLocation, index=False, header=True)

    # Returns the completed dataframe AND number of hins processed for time saved value
    return (dfCSV, len(hin_list))


def _format_phone(raw):
    """Return the digit groups found in ``raw``, each followed by a space.

    Groups are kept as text so leading zeros survive.
    """
    return ''.join(group + ' ' for group in re.findall(r'\d+', raw))


def _strip_bold(tag):
    """Return the markup of a ``<b>`` tag with the ``<b>``/``</b>`` markers removed."""
    return str(tag).replace('<b>', '').replace('</b>', '')


def _parse_hin_page(content):
    """Extract the customer fields from the HTML of one HIN detail page.

    Returns a dict keyed by output column name (without ``HIN NUMBER``).
    Raises IndexError, AttributeError or TypeError when the expected nested
    tables / bold tags are missing, and ValueError when the address cell has
    an unrecognised number of children.
    """
    soup = BeautifulSoup(content, features='lxml')
    html = list(soup.children)[0]
    body = list(html.children)[1]

    # Location of customer name, address 1+2, city/state, zip code, country
    t2 = list(body.table.table.table)[1]
    subtable = list(t2.children)[0].table
    cells = list(subtable.td.td.children)

    # Market segment, status and phone number are the 4th, 6th and 8th <b> tags
    bold = soup.find_all("b")
    market_segment = _strip_bold(bold[3])
    status = _strip_bold(bold[5])
    phone_number = _format_phone(_strip_bold(bold[7]))

    # The address cell has 12 children when a second address line is present
    # and 10 when it is not; text nodes sit at the odd positions.
    if len(cells) == 12:
        address_2, city_state, zip_code, country = (
            cells[3], cells[5], cells[7], cells[9])
    elif len(cells) == 10:
        address_2, city_state, zip_code, country = (
            '', cells[3], cells[5], cells[7])
    else:
        raise ValueError(
            'unexpected address layout (%d children)' % len(cells))

    # A missing comma raises IndexError, which the caller treats as unparsable.
    city_state_parts = city_state.split(',')

    # str() detaches the values from the parsed tree so the dataframe does not
    # keep every page's soup alive.
    return {
        'CUSTOMER NAME': cells[0].text,
        'ADDRESS LINE 1': str(cells[1]),
        'ADDRESS LINE 2': str(address_2),
        'CITY': str(city_state_parts[0]),
        'STATE': str(city_state_parts[1]),
        'ZIP CODE': str(zip_code),
        'COUNTRY': str(country),
        'MARKET SEGMENT': market_segment,
        'STATUS': status,
        'PHONE NUMBER': phone_number,
    }