Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Byte-compiled / optimized / DLL files
.idea
__pycache__/
*.py[cod]

Expand Down
126 changes: 29 additions & 97 deletions morningscraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
import sys
import re

from decimal import Decimal
from datetime import datetime

from bs4 import BeautifulSoup
from security import make_soup, SecurityPage


if sys.version_info[0] == 3:
from urllib.request import urlopen
from urllib.parse import quote, urlsplit
elif sys.version_info[0] == 2:
from urllib import urlopen, quote
from urllib import quote
from urlparse import urlsplit
else:
raise Exception('Python version 2 or 3 required')
Expand Down Expand Up @@ -75,8 +70,7 @@ def search(ref, verbose=False):

if verbose:
print('Search for: %s' % ref)
data = urlopen(SEARCH_BASE % quote(ref)).read()
parsed_html = BeautifulSoup(data)
parsed_html = make_soup(SEARCH_BASE % quote(ref))
results = []
stocks = parsed_html.find_all(
'table', id='ctl00_MainContent_stockTable'
Expand All @@ -93,25 +87,28 @@ def search(ref, verbose=False):
'td', class_='searchCurrency'
)[0].text,
})
funds = parsed_html.find_all(
'table', id='ctl00_MainContent_fundTable'
)
if funds:
funds = funds[0].find_all('tr')[1:]
for fund in funds:
data = fund.find_all('td')
results.append({
'name': data[0].text,
'url': fix_url(data[0].a.get('href')),
'type': 'Fund',
'ISIN': data[1].text,
})

for instrument_type in ["fund", "etf"]:
funds = parsed_html.find_all(
'table', id='ctl00_MainContent_{}Table'.format(instrument_type)
)
if funds:
funds = funds[0].find_all('tr')[1:]
for fund in funds:
data = fund.find_all('td')
results.append({
'name': data[0].text,
'url': fix_url(data[0].a.get('href')),
'type': instrument_type,
'ISIN': data[1].text,
})
break

if verbose:
if results:
print('%s item(s) found.' % len(results))
for item in results:
print('\t%s\t%s' % (item['type'], item['name']))
print(item)
else:
print('No items found.')
return results
Expand Down Expand Up @@ -179,83 +176,18 @@ def get_url(url, verbose=False):
print('\nOpening %s' % url)
if not urlsplit(url).netloc.endswith(SITE):
raise Exception('Non morningstar.co.uk url %r' % url)
result = None
if '/uk/funds/snapshot/snapshot' in url:
try:
result = _get_funds(url)
except:
result = None
elif '/uk/stockreport/' in url:
try:
result = _get_stock(url)
except:
result = None
else:
raise Exception('Unrecognised url %r' % url)
page = SecurityPage.from_url(url)
result = page.get_data()
if verbose:
print(result)
return result


def _get_funds(url):
''' Get and parse returned html for fund pages e.g.
http://www.morningstar.co.uk/uk/funds/snapshot/snapshot.aspx?id=F00000NGEH
'''
data = urlopen(url).read()
parsed_html = BeautifulSoup(data)
title = parsed_html.find_all('div', class_='snapshotTitleBox')[0].h1.text
table = parsed_html.find_all('table', class_='overviewKeyStatsTable')[0]
for tr in table.find_all('tr'):
tds = tr.find_all('td')
if len(tds) != 3:
continue
if tds[0].text.startswith('NAV'):
date = tds[0].span.text
(currency, value) = tds[2].text.split()
if tds[0].text.startswith('Day Change'):
change = tds[2].text.strip()
if tds[0].text.startswith('ISIN'):
isin = tds[2].text.strip()
return {
'title': title,
'value': Decimal(value),
'currency': currency,
'change': change,
'date': dmy_2_date(date),
'url': url,
'ISIN': isin,
'type': 'Fund',
}


def _get_stock(url):
''' Get and parse returned html for stock pages e.g.
http://tools.morningstar.co.uk/uk/stockreport/default.aspx?SecurityToken=0P000090RG]3]0]E0WWE$$ALL
'''
data = urlopen(url).read()
parsed_html = BeautifulSoup(data)
title = parsed_html.find_all('span', class_='securityName')[0].text
value = parsed_html.find_all('span', id='Col0Price')[0].text
change = parsed_html.find_all('span', id='Col0PriceDetail')[0].text
change = change.split('|')[1].strip()
date = parsed_html.find_all('p', id='Col0PriceTime')[0].text[6:16]
currency = parsed_html.find_all('p', id='Col0PriceTime')[0].text
currency = re.search(r'\|\s([A-Z]{3,4})\b', currency).group(1)
isin = parsed_html.find_all('td', id='Col0Isin')[0].text
return {
'title': title,
'value': Decimal(value),
'currency': currency,
'change': change,
'date': dmy_2_date(date),
'url': url,
'ISIN': isin,
'type': 'Stock',
}


if __name__ == '__main__':
get_data('GB00B54RK123', verbose=True)
get_data('LLOY LSE', verbose=True)
get_data('GOOG NASDAQ', verbose=True)
get_data('LU1023728089', verbose=True)
search('EWJ', verbose=True)
get_data('ASHR', verbose=True)
# get_data('GLD ETF', verbose=True)
# get_data('GB00B54RK123', verbose=True)
# get_data('LLOY LSE', verbose=True)
# get_data('GOOG NASDAQ', verbose=True)
# get_data('LU1023728089', verbose=True)
115 changes: 115 additions & 0 deletions morningscraper/security.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import sys
import re
import abc
import six
from decimal import Decimal
from datetime import datetime

from bs4 import BeautifulSoup

if sys.version_info[0] == 3:
from urllib.request import urlopen
elif sys.version_info[0] == 2:
from urllib import urlopen
else:
raise Exception('Python version 2 or 3 required')


def make_soup(url, parser="html.parser"):
    """Fetch *url* and return the page parsed as a BeautifulSoup tree.

    parser: name of the bs4 parser backend (defaults to the stdlib
        ``html.parser`` so no extra dependency is required).
    """
    return BeautifulSoup(urlopen(url), parser)


@six.add_metaclass(abc.ABCMeta)
class SecurityPage(object):
    """Base class for scraping one morningstar.co.uk security page.

    Subclasses implement ``_update_data`` to extract security-specific
    fields from the parsed page into ``self.data_``.
    """

    @classmethod
    def from_url(cls, url):
        """Return the page handler matching *url*.

        Raises:
            ValueError: if *url* is not a recognised page type.  The
                previous version fell through and returned ``None``,
                which made the caller fail later with an opaque
                ``AttributeError`` instead of a clear error (the code
                this class replaced raised for unrecognised urls).
        """
        if '/uk/funds/snapshot/snapshot' in url:
            return FundsPage(url)
        elif '/uk/stockreport/' in url:
            return StockPage(url)
        elif '/uk/etf/' in url:
            return ETFPage(url)
        raise ValueError('Unrecognised url %r' % url)

    def __init__(self, url):
        self.url = url
        cls_name = self.__class__.__name__
        # Derive the reported type from the class name,
        # e.g. "FundsPage" -> "Funds", "ETFPage" -> "ETF".
        security_type = cls_name[:cls_name.find("Page")]
        self.data_ = {"type": security_type, "url": self.url}

    def get_data(self):
        """Download the page, parse it, and return the populated dict."""
        soup = make_soup(self.url)
        self._update_data(soup)
        return self.data_

    @abc.abstractmethod
    def _update_data(self, soup):
        """Extract fields from *soup* into ``self.data_``."""


class FundsPage(SecurityPage):
    """Fund snapshot page, e.g.
    http://www.morningstar.co.uk/uk/funds/snapshot/snapshot.aspx?id=F00000NGEH
    """
    def _update_data(self, soup):
        """Pull name, NAV, day change, date and ISIN into ``self.data_``."""
        heading = soup.find_all('div', class_='snapshotTitleBox')[0].h1.text
        self.data_["name"] = str(heading)
        stats_table = soup.find_all('table', class_='overviewKeyStatsTable')[0]
        for row in stats_table.find_all('tr'):
            cells = row.find_all('td')
            # Only the three-column label/spacer/value rows carry data.
            if len(cells) != 3:
                continue
            label = cells[0].text
            if label.startswith('NAV'):
                # The NAV label embeds the price date in a <span>.
                date = cells[0].span.text
                currency, value = cells[2].text.split()
            elif label.startswith('Day Change'):
                change = cells[2].text.strip()
            elif label.startswith('ISIN'):
                isin = cells[2].text.strip()
        self.data_.update({
            'value': Decimal(value),
            'currency': currency,
            'change': change,
            'date': datetime.strptime(date, '%d/%m/%Y').date(),
            'ISIN': isin
        })


class StockPage(SecurityPage):
    """Stock report page, e.g.
    http://tools.morningstar.co.uk/uk/stockreport/default.aspx?SecurityToken=...
    """
    def _update_data(self, soup):
        """Pull name, price, change, date, currency and ISIN into ``self.data_``.

        Bug fix: the previous version *returned* the dict instead of
        merging it into ``self.data_``.  ``get_data`` ignores the return
        value of ``_update_data``, so stock pages came back with only the
        base ``type``/``url`` keys (contrast ``FundsPage``, which calls
        ``self.data_.update``).
        """
        title = soup.find_all('span', class_='securityName')[0].text
        value = soup.find_all('span', id='Col0Price')[0].text
        change = soup.find_all('span', id='Col0PriceDetail')[0].text
        change = change.split('|')[1].strip()
        # Col0PriceTime holds both the price date and, after a pipe,
        # the 3-4 letter currency code, e.g. "... | GBX".
        price_time = soup.find_all('p', id='Col0PriceTime')[0].text
        date = price_time[6:16]
        currency = re.search(r'\|\s([A-Z]{3,4})\b', price_time).group(1)
        isin = soup.find_all('td', id='Col0Isin')[0].text
        self.data_.update({
            'name': title,
            'value': Decimal(value),
            'currency': currency,
            'change': change,
            'date': datetime.strptime(date, '%d/%m/%Y').date(),
            'ISIN': isin
        })


class ETFPage(SecurityPage):
    """ETF snapshot page under /uk/etf/."""
    def _update_data(self, soup):
        """Pull name, ticker, exchange, ISIN and currency into ``self.data_``."""
        # The heading reads "<name> | <ticker>".
        heading = soup.find_all('div', class_='snapshotTitleBox')[0].h1.text
        parts = heading.split('|')
        self.data_["name"] = parts[0].strip()
        self.data_["ticker"] = parts[1].strip()
        # Each label's value sits two siblings along from the label node.
        for keyword in ["Exchange", "ISIN"]:
            label_node = soup.find(text=keyword)
            if label_node is None:
                continue
            value_text = label_node.parent.nextSibling.nextSibling.text
            self.data_[keyword] = str(value_text)
        price_label = soup.find(text="Closing Price")
        if price_label is not None:
            cell_text = price_label.parent.nextSibling.nextSibling.text
            # The closing-price cell starts with the currency code.
            self.data_["currency"] = cell_text[:3]
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
url="https://github.com/tobes/MorningScraper",
packages=find_packages(),
long_description=long_desc,
install_requires=['beautifulsoup4'],
install_requires=['beautifulsoup4', 'six'],
classifiers=[
"Development Status :: 3 - Alpha",
"Topic :: Utilities",
Expand Down