From 05111d6c1df962a586cd9ba472eacc72a8a25fa4 Mon Sep 17 00:00:00 2001 From: James Rowe Date: Sat, 12 Oct 2013 18:35:23 +0100 Subject: [PATCH 1/6] Don't install global tests package --- MANIFEST.in | 1 + setup.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 6599213..2f2eec3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include README.rst include MANIFEST.in +recursive-include tests *.py prune *.pyc \ No newline at end of file diff --git a/setup.py b/setup.py index 9bfe4fa..528ba55 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def read(fname): license = "BSD", keywords = "html2data html data xpath crawler transform", url = "https://github.com/dperezrada/html2data", - packages=['html2data', 'tests'], + packages=['html2data', ], long_description=read('README.rst'), include_package_data=True, classifiers=[ From 019aff117709809cef378f4b60a4df2dd33c7286 Mon Sep 17 00:00:00 2001 From: James Rowe Date: Sat, 12 Oct 2013 18:36:43 +0100 Subject: [PATCH 2/6] Fix tree setup when URL is given --- html2data/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html2data/__init__.py b/html2data/__init__.py index bdb8cf6..fb2c121 100644 --- a/html2data/__init__.py +++ b/html2data/__init__.py @@ -11,8 +11,8 @@ def __init__(self, html = None, url = None, tree = None): raise Exception('html or url or tree parameters are required') if url: connection = Http() - header, html = connection.request(url) - elif html: + _, html = connection.request(url) + if html: self.tree = self._get_tree_from_html(html) else: self.tree = tree From 4aa0c3ebbd28ebcda2612c637ccedec452c69cd5 Mon Sep 17 00:00:00 2001 From: James Rowe Date: Sat, 12 Oct 2013 18:37:33 +0100 Subject: [PATCH 3/6] Remove unused imports --- html2data/__init__.py | 1 - tests/test_parse.py | 1 - tests/test_tree.py | 2 -- 3 files changed, 4 deletions(-) diff --git a/html2data/__init__.py b/html2data/__init__.py index fb2c121..01c8e53 
100644 --- a/html2data/__init__.py +++ b/html2data/__init__.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from lxml import etree -from StringIO import StringIO from copy import copy from httplib2 import Http diff --git a/tests/test_parse.py b/tests/test_parse.py index 54141e7..90e9bb8 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -1,5 +1,4 @@ # encoding: utf-8 -import os from unittest import TestCase diff --git a/tests/test_tree.py b/tests/test_tree.py index 8512d8e..d89895d 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -1,8 +1,6 @@ # encoding: utf-8 -import os from unittest import TestCase -from httplib2 import Http from ludibrio import Mock from html2data import HTML2Data From ba08dce226cb195e1979de3f903e603ab91c4d51 Mon Sep 17 00:00:00 2001 From: James Rowe Date: Sat, 12 Oct 2013 18:43:30 +0100 Subject: [PATCH 4/6] Don't use mutable objects in function defaults --- html2data/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/html2data/__init__.py b/html2data/__init__.py index 01c8e53..a95480e 100644 --- a/html2data/__init__.py +++ b/html2data/__init__.py @@ -65,7 +65,9 @@ def _apply_after(self, value, apply_after, multiple, strip, text): value = after(value) return value - def parse_one(self, xpath = None, css = None, multiple = False, apply_after = [], text = True, strip = True): + def parse_one(self, xpath = None, css = None, multiple = False, apply_after = None, text = True, strip = True): + if apply_after is None: + apply_after = [] #TODO: Be able to return elements and text if xpath: value = self.xpath(xpath.replace('/text()', '')) From 82db21b2a95da179227950e2a730aea66c4e130a Mon Sep 17 00:00:00 2001 From: James Rowe Date: Sat, 12 Oct 2013 18:44:14 +0100 Subject: [PATCH 5/6] Remove unused total_added_after variable --- html2data/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/html2data/__init__.py b/html2data/__init__.py index a95480e..5ceb8e9 100644 --- a/html2data/__init__.py +++ 
b/html2data/__init__.py @@ -54,7 +54,6 @@ def _get_one(elements): return (elements or [None])[0] def _apply_after(self, value, apply_after, multiple, strip, text): - total_added_after = 0 if not multiple: apply_after.insert(0, self._get_one) if strip and text: From 8178f8825149fb428ccdb5580f45b3468f23af4a Mon Sep 17 00:00:00 2001 From: James Rowe Date: Sat, 12 Oct 2013 18:47:32 +0100 Subject: [PATCH 6/6] Hyper-trivial typo fix --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index e5b6471..cb15c2e 100644 --- a/README.rst +++ b/README.rst @@ -7,7 +7,7 @@ Welcome to Html2Data Description =========== -A simple way to transform a HTML file or URL to structured data. You only need to define the xpath to the element. Optionaly you can define functions to be applied after. You can easily write XPATH using the firebug extension, copy XPATH (I recommend edit the XPATH given by firebug, making it shorter). +A simple way to transform a HTML file or URL to structured data. You only need to define the xpath to the element. Optionally you can define functions to be applied after. You can easily write XPATH using the firebug extension, copy XPATH (I recommend edit the XPATH given by firebug, making it shorter). Installation ============