-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_through_urls.py
More file actions
55 lines (41 loc) · 1.44 KB
/
run_through_urls.py
File metadata and controls
55 lines (41 loc) · 1.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from Scrap_Webpage import scrape
from multiprocessing.dummy import Pool
def scrape_details(line):
    """Scrape a single URL and record its SEO attributes.

    Worker function for the thread pool in scrape_faster(). Writes results
    into the module-level dicts (dict_titles, dict_meta_d, h1_tags,
    canonicals, viewports), which must already exist before this runs.

    Args:
        line: one URL, possibly with surrounding whitespace/newline.
    """
    # Strip once and use the cleaned URL everywhere — the original stripped
    # only for scrape(), so dict keys could carry a trailing newline.
    url = line.strip()
    print("Scraping " + url + "\n")
    page = scrape(url)
    title = page.get_title()
    print(url + " " + str(title))
    # Single-key assignment is clearer than dict.update({k: v}) and is
    # atomic under CPython's GIL, so concurrent workers don't clobber
    # each other's entries.
    dict_titles[url] = title
    dict_meta_d[url] = page.get_meta_description()
    h1_tags[url] = page.get_h1_tags()
    canonicals[url] = page.get_canonical()
    viewports[url] = page.get_viewports()
def scrape_faster(filename):
    """Scrape every URL listed in *filename* using a pool of 8 worker threads.

    Args:
        filename: path to a file of newline-separated URLs.

    Returns:
        A 5-tuple of dicts keyed by URL:
        (titles, meta descriptions, h1 tags, canonical links, viewport presence).

    NOTE(review): results live in module-level globals because
    scrape_details() writes into them from worker threads; a returned
    per-call structure would be a cleaner design, but the globals are kept
    so existing callers/readers of these names keep working.
    """
    clean_url_list = get_urls_from_file(filename)
    global dict_titles, dict_meta_d, h1_tags, canonicals, viewports
    dict_titles = {}
    dict_meta_d = {}
    h1_tags = {}
    canonicals = {}
    viewports = {}
    pool = Pool(8)
    try:
        pool.map(scrape_details, clean_url_list)
    finally:
        # Always close and join: the original leaked worker threads if
        # any scrape raised inside pool.map().
        pool.close()
        pool.join()
    return dict_titles, dict_meta_d, h1_tags, canonicals, viewports
def get_urls_from_file(filename):
    """Read newline-separated URLs from *filename*, dropping non-page links.

    Blank lines are removed, as is any URL containing one of the excluded
    substrings (image/pdf extensions, '#' fragments, 'tag' or 'tel' links).

    Args:
        filename: path to the URL list file.

    Returns:
        list[str]: the filtered URLs, in file order.

    NOTE(review): exclusion is a plain substring test, so e.g. 'tag' also
    matches 'vintage' — kept as-is to preserve the existing filtering rule.
    """
    excluded = ('jpeg', '#', 'pdf', 'png', 'jpg', 'tag', 'tel')
    with open(filename) as file:
        raw_urls = file.read().split('\n')
    # Filter out ALL empty entries: the original list.remove('') dropped only
    # the first one (a trailing blank slipped through to the result) and
    # raised ValueError when the file contained no blank line at all.
    return [url for url in raw_urls
            if url and not any(token in url for token in excluded)]