Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Byte-compiled / optimized / DLL files
.idea
__pycache__/
*.py[cod]

Expand Down
126 changes: 29 additions & 97 deletions morningscraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
import sys
import re

from decimal import Decimal
from datetime import datetime

from bs4 import BeautifulSoup
from security import make_soup, SecurityPage


if sys.version_info[0] == 3:
from urllib.request import urlopen
from urllib.parse import quote, urlsplit
elif sys.version_info[0] == 2:
from urllib import urlopen, quote
from urllib import quote
from urlparse import urlsplit
else:
raise Exception('Python version 2 or 3 required')
Expand Down Expand Up @@ -75,8 +70,7 @@ def search(ref, verbose=False):

if verbose:
print('Search for: %s' % ref)
data = urlopen(SEARCH_BASE % quote(ref)).read()
parsed_html = BeautifulSoup(data)
parsed_html = make_soup(SEARCH_BASE % quote(ref))
results = []
stocks = parsed_html.find_all(
'table', id='ctl00_MainContent_stockTable'
Expand All @@ -93,25 +87,28 @@ def search(ref, verbose=False):
'td', class_='searchCurrency'
)[0].text,
})
funds = parsed_html.find_all(
'table', id='ctl00_MainContent_fundTable'
)
if funds:
funds = funds[0].find_all('tr')[1:]
for fund in funds:
data = fund.find_all('td')
results.append({
'name': data[0].text,
'url': fix_url(data[0].a.get('href')),
'type': 'Fund',
'ISIN': data[1].text,
})

for instrument_type in ["fund", "etf"]:
funds = parsed_html.find_all(
'table', id='ctl00_MainContent_{}Table'.format(instrument_type)
)
if funds:
funds = funds[0].find_all('tr')[1:]
for fund in funds:
data = fund.find_all('td')
results.append({
'name': data[0].text,
'url': fix_url(data[0].a.get('href')),
'type': instrument_type,
'ISIN': data[1].text,
})
break

if verbose:
if results:
print('%s item(s) found.' % len(results))
for item in results:
print('\t%s\t%s' % (item['type'], item['name']))
print(item)
else:
print('No items found.')
return results
Expand Down Expand Up @@ -179,83 +176,18 @@ def get_url(url, verbose=False):
print('\nOpening %s' % url)
if not urlsplit(url).netloc.endswith(SITE):
raise Exception('Non morningstar.co.uk url %r' % url)
result = None
if '/uk/funds/snapshot/snapshot' in url:
try:
result = _get_funds(url)
except:
result = None
elif '/uk/stockreport/' in url:
try:
result = _get_stock(url)
except:
result = None
else:
raise Exception('Unrecognised url %r' % url)
page = SecurityPage.from_url(url)
result = page.get_data()
if verbose:
print(result)
return result


def _get_funds(url):
''' Get and parse returned html for fund pages e.g.
http://www.morningstar.co.uk/uk/funds/snapshot/snapshot.aspx?id=F00000NGEH
'''
data = urlopen(url).read()
parsed_html = BeautifulSoup(data)
title = parsed_html.find_all('div', class_='snapshotTitleBox')[0].h1.text
table = parsed_html.find_all('table', class_='overviewKeyStatsTable')[0]
for tr in table.find_all('tr'):
tds = tr.find_all('td')
if len(tds) != 3:
continue
if tds[0].text.startswith('NAV'):
date = tds[0].span.text
(currency, value) = tds[2].text.split()
if tds[0].text.startswith('Day Change'):
change = tds[2].text.strip()
if tds[0].text.startswith('ISIN'):
isin = tds[2].text.strip()
return {
'title': title,
'value': Decimal(value),
'currency': currency,
'change': change,
'date': dmy_2_date(date),
'url': url,
'ISIN': isin,
'type': 'Fund',
}


def _get_stock(url):
''' Get and parse returned html for stock pages e.g.
http://tools.morningstar.co.uk/uk/stockreport/default.aspx?SecurityToken=0P000090RG]3]0]E0WWE$$ALL
'''
data = urlopen(url).read()
parsed_html = BeautifulSoup(data)
title = parsed_html.find_all('span', class_='securityName')[0].text
value = parsed_html.find_all('span', id='Col0Price')[0].text
change = parsed_html.find_all('span', id='Col0PriceDetail')[0].text
change = change.split('|')[1].strip()
date = parsed_html.find_all('p', id='Col0PriceTime')[0].text[6:16]
currency = parsed_html.find_all('p', id='Col0PriceTime')[0].text
currency = re.search(r'\|\s([A-Z]{3,4})\b', currency).group(1)
isin = parsed_html.find_all('td', id='Col0Isin')[0].text
return {
'title': title,
'value': Decimal(value),
'currency': currency,
'change': change,
'date': dmy_2_date(date),
'url': url,
'ISIN': isin,
'type': 'Stock',
}


if __name__ == '__main__':
get_data('GB00B54RK123', verbose=True)
get_data('LLOY LSE', verbose=True)
get_data('GOOG NASDAQ', verbose=True)
get_data('LU1023728089', verbose=True)
search('EWJ', verbose=True)
get_data('ASHR', verbose=True)
# get_data('GLD ETF', verbose=True)
# get_data('GB00B54RK123', verbose=True)
# get_data('LLOY LSE', verbose=True)
# get_data('GOOG NASDAQ', verbose=True)
# get_data('LU1023728089', verbose=True)
115 changes: 115 additions & 0 deletions morningscraper/security.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import sys
import re
import abc
import six
from decimal import Decimal
from datetime import datetime

from bs4 import BeautifulSoup

if sys.version_info[0] == 3:
from urllib.request import urlopen
elif sys.version_info[0] == 2:
from urllib import urlopen
else:
raise Exception('Python version 2 or 3 required')


def make_soup(url, parser="html.parser"):
    """Fetch *url* and return the page parsed as a BeautifulSoup tree.

    parser: name of the bs4 parser backend (defaults to the stdlib
        ``html.parser`` so no extra dependency is required).
    """
    return BeautifulSoup(urlopen(url), parser)


@six.add_metaclass(abc.ABCMeta)
class SecurityPage(object):
    """Base class for scraping one morningstar.co.uk security page.

    Subclasses implement ``_update_data`` to extract security-specific
    fields from the parsed page into ``self.data_``.
    """

    @classmethod
    def from_url(cls, url):
        """Return the page handler matching *url*.

        Raises:
            ValueError: if *url* is not a recognised page type.  The
                previous version fell through and returned ``None``,
                which made the caller fail later with an opaque
                ``AttributeError`` instead of a clear error (the code
                this class replaced raised for unrecognised urls).
        """
        if '/uk/funds/snapshot/snapshot' in url:
            return FundsPage(url)
        elif '/uk/stockreport/' in url:
            return StockPage(url)
        elif '/uk/etf/' in url:
            return ETFPage(url)
        raise ValueError('Unrecognised url %r' % url)

    def __init__(self, url):
        self.url = url
        cls_name = self.__class__.__name__
        # Derive the reported type from the class name,
        # e.g. "FundsPage" -> "Funds", "ETFPage" -> "ETF".
        security_type = cls_name[:cls_name.find("Page")]
        self.data_ = {"type": security_type, "url": self.url}

    def get_data(self):
        """Download the page, parse it, and return the populated dict."""
        soup = make_soup(self.url)
        self._update_data(soup)
        return self.data_

    @abc.abstractmethod
    def _update_data(self, soup):
        """Extract fields from *soup* into ``self.data_``."""


class FundsPage(SecurityPage):
    """Fund snapshot page, e.g.
    http://www.morningstar.co.uk/uk/funds/snapshot/snapshot.aspx?id=F00000NGEH
    """
    def _update_data(self, soup):
        """Pull name, NAV, day change, date and ISIN into ``self.data_``."""
        heading = soup.find_all('div', class_='snapshotTitleBox')[0].h1.text
        self.data_["name"] = str(heading)
        stats_table = soup.find_all('table', class_='overviewKeyStatsTable')[0]
        for row in stats_table.find_all('tr'):
            cells = row.find_all('td')
            # Only the three-column label/spacer/value rows carry data.
            if len(cells) != 3:
                continue
            label = cells[0].text
            if label.startswith('NAV'):
                # The NAV label embeds the price date in a <span>.
                date = cells[0].span.text
                currency, value = cells[2].text.split()
            elif label.startswith('Day Change'):
                change = cells[2].text.strip()
            elif label.startswith('ISIN'):
                isin = cells[2].text.strip()
        self.data_.update({
            'value': Decimal(value),
            'currency': currency,
            'change': change,
            'date': datetime.strptime(date, '%d/%m/%Y').date(),
            'ISIN': isin
        })


class StockPage(SecurityPage):
    """Stock report page, e.g.
    http://tools.morningstar.co.uk/uk/stockreport/default.aspx?SecurityToken=...
    """
    def _update_data(self, soup):
        """Pull name, price, change, date, currency and ISIN into ``self.data_``.

        Bug fix: the previous version *returned* the dict instead of
        merging it into ``self.data_``.  ``get_data`` ignores the return
        value of ``_update_data``, so stock pages came back with only the
        base ``type``/``url`` keys (contrast ``FundsPage``, which calls
        ``self.data_.update``).
        """
        title = soup.find_all('span', class_='securityName')[0].text
        value = soup.find_all('span', id='Col0Price')[0].text
        change = soup.find_all('span', id='Col0PriceDetail')[0].text
        change = change.split('|')[1].strip()
        # Col0PriceTime holds both the price date and, after a pipe,
        # the 3-4 letter currency code, e.g. "... | GBX".
        price_time = soup.find_all('p', id='Col0PriceTime')[0].text
        date = price_time[6:16]
        currency = re.search(r'\|\s([A-Z]{3,4})\b', price_time).group(1)
        isin = soup.find_all('td', id='Col0Isin')[0].text
        self.data_.update({
            'name': title,
            'value': Decimal(value),
            'currency': currency,
            'change': change,
            'date': datetime.strptime(date, '%d/%m/%Y').date(),
            'ISIN': isin
        })


class ETFPage(SecurityPage):
    """ETF snapshot page under /uk/etf/."""
    def _update_data(self, soup):
        """Pull name, ticker, exchange, ISIN and currency into ``self.data_``."""
        # The heading reads "<name> | <ticker>".
        heading = soup.find_all('div', class_='snapshotTitleBox')[0].h1.text
        parts = heading.split('|')
        self.data_["name"] = parts[0].strip()
        self.data_["ticker"] = parts[1].strip()
        # Each label's value sits two siblings along from the label node.
        for keyword in ["Exchange", "ISIN"]:
            label_node = soup.find(text=keyword)
            if label_node is None:
                continue
            value_text = label_node.parent.nextSibling.nextSibling.text
            self.data_[keyword] = str(value_text)
        price_label = soup.find(text="Closing Price")
        if price_label is not None:
            cell_text = price_label.parent.nextSibling.nextSibling.text
            # The closing-price cell starts with the currency code.
            self.data_["currency"] = cell_text[:3]
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
url="https://github.com/tobes/MorningScraper",
packages=find_packages(),
long_description=long_desc,
install_requires=['beautifulsoup4'],
install_requires=['beautifulsoup4', 'six'],
classifiers=[
"Development Status :: 3 - Alpha",
"Topic :: Utilities",
Expand Down