# FinanceScraping/twitterlinks.py (Stephan-Linzbach/FinanceScraping on GitHub)
import json
import time
from selenium.webdriver.chrome.options import Options
import glob
from bs4 import BeautifulSoup as bs
import os
import requests
import selenium

#%% get html
def get_questions_html(url, sleep_time):
    """Load *url* in Chrome and return the page source.

    A headless browser is tried first.  If loading fails (for example when
    the 10-second page-load timeout expires) the page is retried once in a
    regular, non-headless browser.  Every browser started here is shut down
    again before the function returns, so repeated calls do not accumulate
    Chrome processes.

    Args:
        url: Address of the page to load.
        sleep_time: Seconds to wait before the final page source is read.

    Returns:
        The page source as a string, or None when no source is available.
    """
    print( 'Loading questions page...')
    chromeOptions = Options()
    chromeOptions.headless = True
    driver = selenium.webdriver.Chrome('/home/stephan/chromedriver', options=chromeOptions)
    try:
        try:
            driver.set_page_load_timeout(10)
            driver.get(url)
        except Exception as first_error:
            print("Headless load failed, retrying with a visible browser:", first_error)
            # Close the failed headless browser before starting the fallback,
            # otherwise it would stay alive for the rest of the run.
            _quit_driver(driver)
            driver = None
            driver = selenium.webdriver.Chrome('/home/stephan/chromedriver')
            driver.set_page_load_timeout(10)
            try:
                driver.get(url)
            except Exception as second_error:
                # Best effort: a partially loaded page may still have a
                # usable source, so carry on and inspect it below.
                print("Fallback load failed as well:", second_error)
        try:
            if not driver.page_source:
                print("No Source")
                return None
        except Exception:
            return None
        time.sleep(sleep_time)
        return driver.page_source
    finally:
        _quit_driver(driver)


def _quit_driver(driver):
    """Shut down *driver* (if any), reporting but not raising errors."""
    if driver is None:
        return
    try:
        driver.quit()
    except Exception as error:
        print("Could not close browser:", error)
#%%
def do_it(l):
    """Return the distinct items of *l* as a list, in first-seen order.

    Elements must be hashable.  Keeping the original order makes the result
    reproducible between runs, which a round trip through ``set`` does not
    guarantee for strings.
    """
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(l))
#%%
# Load the company records, keyed by company identifier.
with open("./Europe_Usa.json_cleaned_final.json", encoding="utf-8") as f:
    data = json.load(f)
#%%
# Each file in ./out_links/ is named after the record key it was written for.
names = [os.path.basename(s) for s in glob.glob("./out_links/*")]

# Keep only the records that have no result file yet; a set keeps the
# membership test cheap for large inputs.
_already_done = set(names)
data = {k: v for k, v in data.items() if k not in _already_done}
#%%

# For every remaining record: load the company website, collect the links on
# it, and write them grouped by network to ./out_links/<record key>.
for i, d in enumerate(data):
    record = data[d]
    out_path = "./out_links/" + d
    # Records without a website entry get an empty result file.
    if 'Adresse' not in record or 'Internet' not in record['Adresse']:
        with open(out_path, "w") as f:
            json.dump({}, f)
        continue
    print(i)
    internet = record['Adresse']['Internet']
    # An empty 'Internet' entry is treated like an empty URL instead of
    # raising IndexError.
    url = internet[0] if internet else ''
    print(url)
    if url == '':
        continue
    text = get_questions_html(url, 0)
    if not text:
        # No result file is written here, so the record stays unprocessed.
        continue
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    hrefs = [a['href'] for a in bs(text, "html.parser").find_all('a', href=True)]
    twitter = do_it([t for t in hrefs if 'twitter' in t])
    facebook = do_it([t for t in hrefs if 'facebook' in t])
    youtube = do_it([t for t in hrefs if 'youtube' in t])
    linkedin = do_it([t for t in hrefs if 'linkedin' in t])
    all_known = twitter + facebook + youtube + linkedin
    known = set(all_known)
    all_extern = [t for t in hrefs if 'https' in t and t not in known]
    for k in all_known:
        print(k)
    # Each network is stored under its own key; previously every network
    # was written to 'Twitter' and overwrote the one before it.
    found = {
        'Twitter': twitter,
        'Facebook': facebook,
        'Youtube': youtube,
        'Linkedin': linkedin,
        'Extern': all_extern,
    }
    links = {network: urls for network, urls in found.items() if urls}
    with open(out_path, "w") as f:
        json.dump(links, f)