-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathparser.py
More file actions
executable file
·72 lines (63 loc) · 2.57 KB
/
parser.py
File metadata and controls
executable file
·72 lines (63 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
"""Seekingalpha website StockTalks parser."""
from __future__ import print_function
import os
import csv
from datetime import datetime, timedelta
from selenium import webdriver
from bs4 import BeautifulSoup
def get_time(timestamp):
    """Convert a website date reference to a datetime object.

    Two textual forms are understood:

    * a relative age ending in ``d`` (``'3 d'`` or ``'3d'``), interpreted as
      that many days before the current local time;
    * an absolute date in ``%m/%d/%Y`` form (``'01/31/2017'``).

    Args:
        timestamp: Date text as taken from the page. Surrounding whitespace
            is ignored.

    Returns:
        A naive ``datetime``. Relative ages keep the current time of day;
        absolute dates are at midnight.

    Raises:
        ValueError: If the text matches neither form.
    """
    # Text lifted from HTML may carry stray spaces/newlines, which would
    # defeat both the suffix test and strptime.
    timestamp = timestamp.strip()
    if timestamp.endswith('d'):
        # The first whitespace-separated token holds the day count; drop a
        # glued-on 'd' so that '3d' parses the same way as '3 d'.
        days = int(timestamp.split()[0].rstrip('d'))
        return datetime.now() - timedelta(days=days)
    return datetime.strptime(timestamp, '%m/%d/%Y')
def write_data(output, date, data):
    """Append one record to ``<output>/<date>/comments.csv``.

    The target directory is created when missing. A header row is written
    first whenever the CSV file does not exist yet or exists but is empty.

    Args:
        output: Base output folder.
        date: Sub-path below ``output``; any ``/`` in it produces nested
            directories.
        data: Mapping with the keys ``date``, ``text`` and ``tickers``.

    Raises:
        OSError: If the directory cannot be created.
        ValueError: If ``data`` contains keys other than the three columns
            (raised by ``csv.DictWriter``).
    """
    fieldnames = ['date', 'text', 'tickers']
    # os.path.join keeps an empty ``output`` relative to the working
    # directory instead of turning the path into '/<date>'.
    directory = os.path.join(output, date)
    path = os.path.join(directory, 'comments.csv')

    if not os.path.isdir(directory):
        try:
            os.makedirs(directory)
        except OSError:
            # Something else may have created the directory between the
            # check and the call; only a genuinely missing directory is
            # an error.
            if not os.path.isdir(directory):
                raise

    # Decide on the header before opening: append mode creates the file.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(data)
def parse_data(time_window, output):
    """Scrape StockTalks posts inside a time window and store them as CSV.

    Loads the StockTalks page with PhantomJS, collects every ``div`` with
    class ``card`` and, for each card whose date is not older than
    ``time_window`` days, hands a record to ``write_data``.

    Args:
        time_window: Look-back period in days; anything ``int()`` accepts.
        output: Base output folder passed through to ``write_data``.

    Raises:
        ValueError: If ``time_window`` is not an integer value or a card's
            date text is not understood by ``get_time``.
    """
    cutoff = datetime.now() - timedelta(days=int(time_window))

    # NOTE(review): PhantomJS support was dropped from newer Selenium
    # releases - confirm the pinned Selenium version still provides it.
    driver = webdriver.PhantomJS(executable_path="./phantomjs")
    try:
        driver.get("""https://seekingalpha.com/author/the-geoteam/"""
                   """stocktalks#view=posts_activities""")
        page_source = driver.page_source
    finally:
        # quit() ends the session and closes all windows, so it is the only
        # teardown call needed; calling close() afterwards would address an
        # already terminated session. The finally clause also releases the
        # browser process when loading the page fails.
        driver.quit()

    feed = BeautifulSoup(page_source, 'html.parser').findAll(
        'div', {'class': 'card'})

    for card in feed:
        # Resolve the date once so the filter and the stored value agree.
        posted = get_time(card.span.string)
        if posted < cutoff:
            continue

        headline = card.find('div', {'class': 'headline'})
        links = card.find('div', {'class': 'title'}).findAll('a')
        data = {
            'date': posted.strftime('%Y/%m/%d'),
            'text': headline.text.encode('utf-8'),
            # The first link in the title is not a ticker (presumably the
            # author link - verify against the page markup); with zero or
            # one link this yields an empty string.
            'tickers': ', '.join([link.string for link in links][1:]),
        }
        write_data(output, data['date'], data)
if __name__ == '__main__':
    # Interactive entry point: ask for the destination folder first, then
    # for the look-back period in days, and start the scrape.
    target_folder = raw_input('Output folder: ')
    days_back = raw_input('Days timedelta: ')
    parse_data(time_window=days_back, output=target_folder)