diff --git a/.prospector.yaml b/.prospector.yaml
new file mode 100644
index 00000000..7e8efe1a
--- /dev/null
+++ b/.prospector.yaml
@@ -0,0 +1,21 @@
+strictness: veryhigh
+doc-warnings: false
+test-warnings: false
+
+max-line-length: 139
+
+requirements:
+  - requirements.txt
+  - requirements-test.txt
+  - requirements-optional.txt
+
+ignore-paths:
+  - parse.py
+  - utils/
+
+python-targets:
+  - 2
+  - 3
+
+mccabe:
+  run: false
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 00000000..ea74d5db
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,10 @@
+[MASTER]
+ignore=tests
+
+[MESSAGES CONTROL]
+# messages up to fixme should probably be fixed somehow
+disable = redefined-builtin,attribute-defined-outside-init,anomalous-backslash-in-string,no-self-use,redefined-outer-name,bad-continuation,wrong-import-order,superfluous-parens,no-member,duplicate-code,super-init-not-called,abstract-method,property-on-old-class,wrong-import-position,no-name-in-module,no-init,bad-mcs-classmethod-argument,bad-classmethod-argument,fixme,invalid-name,import-error,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-boolean-expressions,too-many-branches,too-many-instance-attributes,too-many-locals,too-many-lines,too-many-public-methods,too-many-return-statements,too-many-statements,missing-docstring,line-too-long,locally-disabled,locally-enabled,bad-builtin,deprecated-lambda
+
+[FORMAT]
+max-line-length=139
+single-line-if-stmt=no
diff --git a/flake8-run.sh b/flake8-run.sh
index 685ec6ab..d9264946 100755
--- a/flake8-run.sh
+++ b/flake8-run.sh
@@ -5,8 +5,5 @@ if [[ ! -x $(which flake8) ]]; then
     exit 1
 fi
 
-find html5lib/ -name '*.py' -and -not -name 'constants.py' -print0 | xargs -0 flake8 --ignore=E501
-flake1=$?
-flake8 --max-line-length=99 --ignore=E126 html5lib/constants.py
-flake2=$?
-exit $[$flake1 || $flake2]
+flake8 `dirname $0`
+exit $?
diff --git a/html5lib/constants.py b/html5lib/constants.py
index 2244933c..df1f061e 100644
--- a/html5lib/constants.py
+++ b/html5lib/constants.py
@@ -2819,7 +2819,6 @@
     0x0d: "\u000D",
     0x80: "\u20AC",
     0x81: "\u0081",
-    0x81: "\u0081",
     0x82: "\u201A",
     0x83: "\u0192",
     0x84: "\u201E",
diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py
index caddd318..7f81c0d1 100644
--- a/html5lib/filters/sanitizer.py
+++ b/html5lib/filters/sanitizer.py
@@ -765,15 +765,15 @@ def sanitize_token(self, token):
             if ((namespace, name) in self.allowed_elements or
                     (namespace is None and
                      (namespaces["html"], name) in self.allowed_elements)):
-                return self.allowed_token(token, token_type)
+                return self.allowed_token(token)
             else:
-                return self.disallowed_token(token, token_type)
+                return self.disallowed_token(token)
         elif token_type == "Comment":
             pass
         else:
             return token
 
-    def allowed_token(self, token, token_type):
+    def allowed_token(self, token):
         if "data" in token:
             attrs = token["data"]
             attr_names = set(attrs.keys())
@@ -823,7 +823,8 @@ def allowed_token(self, token, token_type):
             token["data"] = attrs
         return token
 
-    def disallowed_token(self, token, token_type):
+    def disallowed_token(self, token):
+        token_type = token["type"]
         if token_type == "EndTag":
             token["data"] = "</%s>" % token["name"]
         elif token["data"]:
@@ -862,7 +863,7 @@ def sanitize_css(self, style):
                           'padding']:
                 for keyword in value.split():
                     if keyword not in self.allowed_css_keywords and \
-                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
+                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                         break
                 else:
                     clean.append(prop + ': ' + value + ';')
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index e6808425..331b8fd7 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -121,7 +121,7 @@ def reset(self):
                 self.phase.insertHtmlElement()
                 self.resetInsertionMode()
         else:
-            self.innerHTML = False
+            self.innerHTML = False  # pylint:disable=redefined-variable-type
             self.phase = self.phases["initial"]
 
         self.lastPhase = None
@@ -241,6 +241,7 @@ def parse(self, stream, encoding=None, parseMeta=True,
 
     def parseFragment(self, stream, container="div", encoding=None,
                       parseMeta=False, useChardet=True, scripting=False):
+        # pylint:disable=unused-argument
         """Parse a HTML fragment into a well-formed tree fragment
 
         container - name of the element we're setting the innerHTML property
@@ -259,8 +260,10 @@ def parseFragment(self, stream, container="div", encoding=None,
                     encoding=encoding, scripting=scripting)
         return self.tree.getFragment()
 
-    def parseError(self, errorcode="XXX-undefined-error", datavars={}):
+    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
         # XXX The idea is to make errorcode mandatory.
+        if datavars is None:
+            datavars = {}
         self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
         if self.strict:
             raise ParseError(E[errorcode] % datavars)
@@ -361,6 +364,7 @@ def adjustForeignAttributes(self, token):
                 del token["data"][originalName]
 
     def reparseTokenNormal(self, token):
+        # pylint:disable=unused-argument
         self.parser.phase()
 
     def resetInsertionMode(self):
@@ -458,6 +462,7 @@ def getMetaclass(use_metaclass, metaclass_func):
         else:
             return type
 
+    # pylint:disable=unused-argument
     class Phase(with_metaclass(getMetaclass(debug, log))):
         """Base class for helper object that implements each phase of processing
         """
@@ -948,8 +953,8 @@ class InBodyPhase(Phase):
         def __init__(self, parser, tree):
             Phase.__init__(self, parser, tree)
 
-            # Keep a ref to this for special handling of whitespace in <pre>
- self.processSpaceCharactersNonPre = self.processSpaceCharacters
+ # Set this to the default handler
+ self.processSpaceCharacters = self.processSpaceCharactersNonPre
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
@@ -1082,7 +1087,7 @@ def processCharacters(self, token):
for char in token["data"]])):
self.parser.framesetOK = False
- def processSpaceCharacters(self, token):
+ def processSpaceCharactersNonPre(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"])
@@ -2763,6 +2768,7 @@ def startTagOther(self, token):
def processEndTag(self, token):
self.parser.parseError("expected-eof-but-got-end-tag",
{"name": token["name"]})
+ # pylint:enable=unused-argument
return {
"initial": InitialPhase,
diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py
index 5da5d938..d6d1d6fb 100644
--- a/html5lib/ihatexml.py
+++ b/html5lib/ihatexml.py
@@ -175,9 +175,9 @@ def escapeRegexp(string):
return string
# output from the above
-nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
-nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
@@ -186,7 +186,7 @@ def escapeRegexp(string):
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
- def __init__(self, replaceChars=None,
+ def __init__(self,
dropXmlnsLocalName=False,
dropXmlnsAttrNs=False,
preventDoubleDashComments=False,
@@ -217,7 +217,7 @@ def coerceAttribute(self, name, namespace=None):
else:
return self.toXmlName(name)
- def coerceElement(self, name, namespace=None):
+ def coerceElement(self, name):
return self.toXmlName(name)
def coerceComment(self, data):
@@ -232,7 +232,7 @@ def coerceComment(self, data):
def coerceCharacters(self, data):
if self.replaceFormFeedCharacters:
- for i in range(data.count("\x0C")):
+ for _ in range(data.count("\x0C")):
warnings.warn("Text cannot contain U+000C", DataLossWarning)
data = data.replace("\x0C", " ")
# Other non-xml characters
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 15acba0d..58d626c9 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -19,12 +19,6 @@
except ImportError:
BytesIO = StringIO
-try:
- from io import BufferedIOBase
-except ImportError:
- class BufferedIOBase(object):
- pass
-
# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
@@ -32,15 +26,17 @@ class BufferedIOBase(object):
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
-invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
+invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
if utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with
- # unichr. Not using this indirection would introduce an illegal
+ # eval. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone
# surrogates.
- invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
- eval('"\\uD800-\\uDFFF"'))
+ assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
+ invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
+ eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
+ "]")
else:
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
@@ -296,7 +292,7 @@ def readChunk(self, chunkSize=None):
return True
def characterErrorsUCS4(self, data):
- for i in range(len(invalid_unicode_re.findall(data))):
+ for _ in range(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data):
@@ -453,7 +449,7 @@ def openStream(self, source):
try:
stream.seek(stream.tell())
- except:
+ except: # pylint:disable=bare-except
stream = BufferedStream(stream)
return stream
@@ -571,6 +567,7 @@ def __new__(self, value):
return bytes.__new__(self, value.lower())
def __init__(self, value):
+ # pylint:disable=unused-argument
self._position = -1
def __iter__(self):
@@ -681,7 +678,7 @@ def getEncoding(self):
diff --git a/html5lib/serializer.py b/html5lib/serializer.py
--- a/html5lib/serializer.py
+++ b/html5lib/serializer.py
- encode_entity_map = {}
- is_ucs4 = len("\U0010FFFF") == 1
- for k, v in list(entities.items()):
- # skip multi-character entities
- if ((is_ucs4 and len(v) > 1) or
- (not is_ucs4 and len(v) > 2)):
- continue
- if v != "&":
- if len(v) == 2:
- v = utils.surrogatePairToCodepoint(v)
- else:
- v = ord(v)
- if v not in encode_entity_map or k.islower():
- # prefer < over < and similarly for &, >, etc.
- encode_entity_map[v] = k
-
- def htmlentityreplace_errors(exc):
- if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
- res = []
- codepoints = []
- skip = False
- for i, c in enumerate(exc.object[exc.start:exc.end]):
- if skip:
- skip = False
- continue
- index = i + exc.start
- if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
- codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
- skip = True
- else:
- codepoint = ord(c)
- codepoints.append(codepoint)
- for cp in codepoints:
- e = encode_entity_map.get(cp)
- if e:
- res.append("&")
- res.append(e)
- if not e.endswith(";"):
- res.append(";")
- else:
- res.append("%s;" % (hex(cp)[2:]))
- return ("".join(res), exc.end)
- else:
- return xmlcharrefreplace_errors(exc)
- register_error(unicode_encode_errors, htmlentityreplace_errors)
+encode_entity_map = {}
+is_ucs4 = len("\U0010FFFF") == 1
+for k, v in list(entities.items()):
+ # skip multi-character entities
+ if ((is_ucs4 and len(v) > 1) or
+ (not is_ucs4 and len(v) > 2)):
+ continue
+ if v != "&":
+ if len(v) == 2:
+ v = utils.surrogatePairToCodepoint(v)
+ else:
+ v = ord(v)
+ if v not in encode_entity_map or k.islower():
+ # prefer < over < and similarly for &, >, etc.
+ encode_entity_map[v] = k
+
+
+def htmlentityreplace_errors(exc):
+ if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
+ res = []
+ codepoints = []
+ skip = False
+ for i, c in enumerate(exc.object[exc.start:exc.end]):
+ if skip:
+ skip = False
+ continue
+ index = i + exc.start
+ if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
+ codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
+ skip = True
+ else:
+ codepoint = ord(c)
+ codepoints.append(codepoint)
+ for cp in codepoints:
+ e = encode_entity_map.get(cp)
+ if e:
+ res.append("&")
+ res.append(e)
+ if not e.endswith(";"):
+ res.append(";")
+ else:
+ res.append("%s;" % (hex(cp)[2:]))
+ return ("".join(res), exc.end)
+ else:
+ return xmlcharrefreplace_errors(exc)
- del register_error
+register_error("htmlentityreplace", htmlentityreplace_errors)
class HTMLSerializer(object):
@@ -168,7 +163,7 @@ def __init__(self, **kwargs):
def encode(self, string):
assert(isinstance(string, text_type))
if self.encoding:
- return string.encode(self.encoding, unicode_encode_errors)
+ return string.encode(self.encoding, "htmlentityreplace")
else:
return string
@@ -180,6 +175,7 @@ def encodeStrict(self, string):
return string
def serialize(self, treewalker, encoding=None):
+ # pylint:disable=too-many-nested-blocks
self.encoding = encoding
in_cdata = False
self.errors = []
@@ -241,7 +237,7 @@ def serialize(self, treewalker, encoding=None):
in_cdata = True
elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element")
- for (attr_namespace, attr_name), attr_value in token["data"].items():
+ for (_, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here
k = attr_name
v = attr_value
@@ -328,6 +324,6 @@ def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
raise SerializeError
-def SerializeError(Exception):
+class SerializeError(Exception):
"""Error in serialized tree"""
pass
diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py
index 6e6a916b..6ae09dbe 100644
--- a/html5lib/tests/support.py
+++ b/html5lib/tests/support.py
@@ -1,5 +1,7 @@
from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=wrong-import-position
+
import os
import sys
import codecs
@@ -13,7 +15,7 @@
os.path.pardir,
os.path.pardir)))
-from html5lib import treebuilders, treewalkers, treeadapters
+from html5lib import treebuilders, treewalkers, treeadapters # noqa
del base_path
# Build a dict of available trees
@@ -26,14 +28,14 @@
}
# ElementTree impls
-import xml.etree.ElementTree as ElementTree
+import xml.etree.ElementTree as ElementTree # noqa
treeTypes['ElementTree'] = {
"builder": treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True),
"walker": treewalkers.getTreeWalker("etree", ElementTree)
}
try:
- import xml.etree.cElementTree as cElementTree
+ import xml.etree.cElementTree as cElementTree # noqa
except ImportError:
treeTypes['cElementTree'] = None
else:
@@ -47,7 +49,7 @@
}
try:
- import lxml.etree as lxml # flake8: noqa
+ import lxml.etree as lxml # noqa
except ImportError:
treeTypes['lxml'] = None
else:
@@ -58,7 +60,7 @@
# Genshi impls
try:
- import genshi # flake8: noqa
+ import genshi # noqa
except ImportError:
pass
else:
@@ -68,6 +70,8 @@
"walker": treewalkers.getTreeWalker("genshi")
}
+# pylint:enable=wrong-import-position
+
def get_data_files(subdirectory, files='*.dat', search_dir=test_dir):
return sorted(glob.glob(os.path.join(search_dir, subdirectory, files)))
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
index 09504654..c5d2af12 100644
--- a/html5lib/tests/test_encoding.py
+++ b/html5lib/tests/test_encoding.py
@@ -51,19 +51,21 @@ def runPreScanEncodingTest(data, encoding):
def test_encoding():
for filename in get_data_files("encoding"):
tests = _TestData(filename, b"data", encoding=None)
- for idx, test in enumerate(tests):
+ for test in tests:
yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
+# pylint:disable=wrong-import-position
try:
try:
- import charade # flake8: noqa
+ import charade # noqa
except ImportError:
- import chardet # flake8: noqa
+ import chardet # noqa
except ImportError:
print("charade/chardet not found, skipping chardet tests")
else:
def test_chardet():
- with open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb") as fp:
+ with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp:
encoding = inputstream.HTMLInputStream(fp.read()).charEncoding
assert encoding[0].name == "big5"
+# pylint:enable=wrong-import-position
diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py
index 2f3ba2c8..f8e1ac43 100644
--- a/html5lib/tests/test_parser2.py
+++ b/html5lib/tests/test_parser2.py
@@ -2,10 +2,8 @@
import io
-import pytest
+from . import support # noqa
-from . import support # flake8: noqa
-from html5lib import html5parser
from html5lib.constants import namespaces
from html5lib import parse
@@ -23,29 +21,29 @@ def test_line_counter():
def test_namespace_html_elements_0_dom():
doc = parse("",
- treebuilder="dom",
- namespaceHTMLElements=True)
+ treebuilder="dom",
+ namespaceHTMLElements=True)
assert doc.childNodes[0].namespaceURI == namespaces["html"]
def test_namespace_html_elements_1_dom():
doc = parse("",
- treebuilder="dom",
- namespaceHTMLElements=False)
+ treebuilder="dom",
+ namespaceHTMLElements=False)
assert doc.childNodes[0].namespaceURI is None
def test_namespace_html_elements_0_etree():
doc = parse("",
- treebuilder="etree",
- namespaceHTMLElements=True)
+ treebuilder="etree",
+ namespaceHTMLElements=True)
assert doc.tag == "{%s}html" % (namespaces["html"],)
def test_namespace_html_elements_1_etree():
doc = parse("",
- treebuilder="etree",
- namespaceHTMLElements=False)
+ treebuilder="etree",
+ namespaceHTMLElements=False)
assert doc.tag == "html"
diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py
index 1f8a06f6..e19deea8 100644
--- a/html5lib/tests/test_sanitizer.py
+++ b/html5lib/tests/test_sanitizer.py
@@ -4,7 +4,7 @@
from html5lib.filters import sanitizer
-def runSanitizerTest(name, expected, input):
+def runSanitizerTest(_, expected, input):
parsed = parseFragment(expected)
expected = serialize(parsed,
omit_optional_tags=False,
@@ -63,7 +63,8 @@ def test_sanitizer():
for ns, tag_name in sanitizer.allowed_elements:
if ns != constants.namespaces["html"]:
continue
- if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'select']:
+ if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td',
+ 'tfoot', 'th', 'thead', 'tr', 'select']:
continue # TODO
if tag_name == 'image':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py
index b3ffe0df..b3cda7d7 100644
--- a/html5lib/tests/test_serializer.py
+++ b/html5lib/tests/test_serializer.py
@@ -12,6 +12,7 @@
from html5lib.serializer import HTMLSerializer, serialize
from html5lib.treewalkers._base import TreeWalker
+# pylint:disable=wrong-import-position
optionals_loaded = []
try:
@@ -19,6 +20,7 @@
optionals_loaded.append("lxml")
except ImportError:
pass
+# pylint:enable=wrong-import-position
default_namespace = constants.namespaces["html"]
@@ -219,5 +221,5 @@ def test_serializer():
for filename in get_data_files('serializer-testdata', '*.test', os.path.dirname(__file__)):
with open(filename) as fp:
tests = json.load(fp)
- for index, test in enumerate(tests['tests']):
+ for test in tests['tests']:
yield runSerializerTest, test["input"], test["expected"], test.get("options", {})
diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py
index 3b659fbb..77e411d5 100644
--- a/html5lib/tests/test_stream.py
+++ b/html5lib/tests/test_stream.py
@@ -1,15 +1,20 @@
from __future__ import absolute_import, division, unicode_literals
-from . import support # flake8: noqa
+from . import support # noqa
+
import codecs
-from io import BytesIO
-import socket
+import sys
+from io import BytesIO, StringIO
+
+import pytest
import six
from six.moves import http_client, urllib
from html5lib.inputstream import (BufferedStream, HTMLInputStream,
HTMLUnicodeInputStream, HTMLBinaryInputStream)
+from html5lib.utils import supports_lone_surrogates
+
def test_basic():
s = b"abc"
@@ -17,6 +22,7 @@ def test_basic():
read = fp.read(10)
assert read == s
+
def test_read_length():
fp = BufferedStream(BytesIO(b"abcdef"))
read1 = fp.read(1)
@@ -28,17 +34,23 @@ def test_read_length():
read4 = fp.read(4)
assert read4 == b""
+
def test_tell():
fp = BufferedStream(BytesIO(b"abcdef"))
read1 = fp.read(1)
+ assert read1 == b"a"
assert fp.tell() == 1
read2 = fp.read(2)
+ assert read2 == b"bc"
assert fp.tell() == 3
read3 = fp.read(3)
+ assert read3 == b"def"
assert fp.tell() == 6
read4 = fp.read(4)
+ assert read4 == b""
assert fp.tell() == 6
+
def test_seek():
fp = BufferedStream(BytesIO(b"abcdef"))
read1 = fp.read(1)
@@ -55,20 +67,26 @@ def test_seek():
read5 = fp.read(2)
assert read5 == b"ef"
+
def test_seek_tell():
fp = BufferedStream(BytesIO(b"abcdef"))
read1 = fp.read(1)
+ assert read1 == b"a"
assert fp.tell() == 1
fp.seek(0)
read2 = fp.read(1)
+ assert read2 == b"a"
assert fp.tell() == 1
read3 = fp.read(2)
+ assert read3 == b"bc"
assert fp.tell() == 3
fp.seek(2)
read4 = fp.read(2)
+ assert read4 == b"cd"
assert fp.tell() == 4
fp.seek(4)
read5 = fp.read(2)
+ assert read5 == b"ef"
assert fp.tell() == 6
@@ -85,11 +103,13 @@ def test_char_ascii():
assert stream.charEncoding[0].name == 'windows-1252'
assert stream.char() == "'"
+
def test_char_utf8():
stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8')
assert stream.charEncoding[0].name == 'utf-8'
assert stream.char() == '\u2018'
+
def test_char_win1252():
stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252'))
assert stream.charEncoding[0].name == 'windows-1252'
@@ -97,16 +117,19 @@ def test_char_win1252():
assert stream.char() == "\xf1"
assert stream.char() == "\u2019"
+
def test_bom():
stream = HTMLInputStream(codecs.BOM_UTF8 + b"'")
assert stream.charEncoding[0].name == 'utf-8'
assert stream.char() == "'"
+
def test_utf_16():
stream = HTMLInputStream((' ' * 1025).encode('utf-16'))
assert stream.charEncoding[0].name in ['utf-16le', 'utf-16be']
assert len(stream.charsUntil(' ', True)) == 1025
+
def test_newlines():
stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\r\nccc\rddddxe")
assert stream.position() == (1, 0)
@@ -117,11 +140,13 @@ def test_newlines():
assert stream.charsUntil('e') == "x"
assert stream.position() == (4, 5)
+
def test_newlines2():
size = HTMLUnicodeInputStream._defaultChunkSize
stream = HTMLInputStream("\r" * size + "\n")
assert stream.charsUntil('x') == "\n" * size
+
def test_position():
stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\nccc\nddde\nf\ngh")
assert stream.position() == (1, 0)
@@ -140,6 +165,7 @@ def test_position():
assert stream.charsUntil('h') == "e\nf\ng"
assert stream.position() == (6, 1)
+
def test_position2():
stream = HTMLUnicodeInputStreamShortChunk("abc\nd")
assert stream.position() == (1, 0)
@@ -154,6 +180,7 @@ def test_position2():
assert stream.char() == "d"
assert stream.position() == (2, 1)
+
def test_python_issue_20007():
"""
Make sure we have a work-around for Python bug #20007
@@ -161,6 +188,7 @@ def test_python_issue_20007():
"""
class FakeSocket(object):
def makefile(self, _mode, _bufsize=None):
+ # pylint:disable=unused-argument
return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
source = http_client.HTTPResponse(FakeSocket())
@@ -168,6 +196,7 @@ def makefile(self, _mode, _bufsize=None):
stream = HTMLInputStream(source)
assert stream.charsUntil(" ") == "Text"
+
def test_python_issue_20007_b():
"""
Make sure we have a work-around for Python bug #20007
@@ -178,6 +207,7 @@ def test_python_issue_20007_b():
class FakeSocket(object):
def makefile(self, _mode, _bufsize=None):
+ # pylint:disable=unused-argument
return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
source = http_client.HTTPResponse(FakeSocket())
@@ -185,3 +215,109 @@ def makefile(self, _mode, _bufsize=None):
wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com")
stream = HTMLInputStream(wrapped)
assert stream.charsUntil(" ") == "Text"
+
+
+@pytest.mark.parametrize("inp,num",
+ [("\u0000", 0),
+ ("\u0001", 1),
+ ("\u0008", 1),
+ ("\u0009", 0),
+ ("\u000A", 0),
+ ("\u000B", 1),
+ ("\u000C", 0),
+ ("\u000D", 0),
+ ("\u000E", 1),
+ ("\u001F", 1),
+ ("\u0020", 0),
+ ("\u007E", 0),
+ ("\u007F", 1),
+ ("\u009F", 1),
+ ("\u00A0", 0),
+ ("\uFDCF", 0),
+ ("\uFDD0", 1),
+ ("\uFDEF", 1),
+ ("\uFDF0", 0),
+ ("\uFFFD", 0),
+ ("\uFFFE", 1),
+ ("\uFFFF", 1),
+ ("\U0001FFFD", 0),
+ ("\U0001FFFE", 1),
+ ("\U0001FFFF", 1),
+ ("\U0002FFFD", 0),
+ ("\U0002FFFE", 1),
+ ("\U0002FFFF", 1),
+ ("\U0003FFFD", 0),
+ ("\U0003FFFE", 1),
+ ("\U0003FFFF", 1),
+ ("\U0004FFFD", 0),
+ ("\U0004FFFE", 1),
+ ("\U0004FFFF", 1),
+ ("\U0005FFFD", 0),
+ ("\U0005FFFE", 1),
+ ("\U0005FFFF", 1),
+ ("\U0006FFFD", 0),
+ ("\U0006FFFE", 1),
+ ("\U0006FFFF", 1),
+ ("\U0007FFFD", 0),
+ ("\U0007FFFE", 1),
+ ("\U0007FFFF", 1),
+ ("\U0008FFFD", 0),
+ ("\U0008FFFE", 1),
+ ("\U0008FFFF", 1),
+ ("\U0009FFFD", 0),
+ ("\U0009FFFE", 1),
+ ("\U0009FFFF", 1),
+ ("\U000AFFFD", 0),
+ ("\U000AFFFE", 1),
+ ("\U000AFFFF", 1),
+ ("\U000BFFFD", 0),
+ ("\U000BFFFE", 1),
+ ("\U000BFFFF", 1),
+ ("\U000CFFFD", 0),
+ ("\U000CFFFE", 1),
+ ("\U000CFFFF", 1),
+ ("\U000DFFFD", 0),
+ ("\U000DFFFE", 1),
+ ("\U000DFFFF", 1),
+ ("\U000EFFFD", 0),
+ ("\U000EFFFE", 1),
+ ("\U000EFFFF", 1),
+ ("\U000FFFFD", 0),
+ ("\U000FFFFE", 1),
+ ("\U000FFFFF", 1),
+ ("\U0010FFFD", 0),
+ ("\U0010FFFE", 1),
+ ("\U0010FFFF", 1),
+ ("\x01\x01\x01", 3),
+ ("a\x01a\x01a\x01a", 3)])
+def test_invalid_codepoints(inp, num):
+ stream = HTMLUnicodeInputStream(StringIO(inp))
+ for _i in range(len(inp)):
+ stream.char()
+ assert len(stream.errors) == num
+
+
+@pytest.mark.skipif(not supports_lone_surrogates, reason="doesn't support lone surrogates")
+@pytest.mark.parametrize("inp,num",
+ [("'\\uD7FF'", 0),
+ ("'\\uD800'", 1),
+ ("'\\uDBFF'", 1),
+ ("'\\uDC00'", 1),
+ ("'\\uDFFF'", 1),
+ ("'\\uE000'", 0),
+ ("'\\uD800\\uD800\\uD800'", 3),
+ ("'a\\uD800a\\uD800a\\uD800a'", 3),
+ ("'\\uDFFF\\uDBFF'", 2),
+ pytest.mark.skipif(sys.maxunicode == 0xFFFF,
+ ("'\\uDBFF\\uDFFF'", 2),
+ reason="narrow Python")])
+def test_invalid_codepoints_surrogates(inp, num):
+ inp = eval(inp) # pylint:disable=eval-used
+ fp = StringIO(inp)
+ if ord(max(fp.read())) > 0xFFFF:
+ pytest.skip("StringIO altered string")
+ fp.seek(0)
+ stream = HTMLUnicodeInputStream(fp)
+ for _i in range(len(inp)):
+ stream.char()
+ assert len(stream.errors) == num
diff --git a/html5lib/tests/test_treeadapters.py b/html5lib/tests/test_treeadapters.py
index 5f38b6c3..95e56c00 100644
--- a/html5lib/tests/test_treeadapters.py
+++ b/html5lib/tests/test_treeadapters.py
@@ -1,6 +1,6 @@
from __future__ import absolute_import, division, unicode_literals
-from . import support # flake8: noqa
+from . import support # noqa
import html5lib
from html5lib.treeadapters import sax
@@ -25,7 +25,7 @@ def test_to_sax():
('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
('characters', '\n '),
('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
- ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
+ ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index 045d9d7b..332027ac 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -31,7 +31,7 @@ def test_all_tokens():
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
]
- for treeName, treeCls in sorted(treeTypes.items()):
+ for _, treeCls in sorted(treeTypes.items()):
if treeCls is None:
continue
p = html5parser.HTMLParser(tree=treeCls["builder"])
diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py
index c6163a1f..255c1859 100644
--- a/html5lib/tests/tokenizer.py
+++ b/html5lib/tests/tokenizer.py
@@ -19,6 +19,7 @@ def __init__(self, initialState, lastStartTag=None):
self._lastStartTag = lastStartTag
def parse(self, stream, encoding=None, innerHTML=False):
+ # pylint:disable=unused-argument
tokenizer = self.tokenizer(stream, encoding)
self.outputTokens = []
diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py
index 79774578..dd6ea75f 100644
--- a/html5lib/tokenizer.py
+++ b/html5lib/tokenizer.py
@@ -1,9 +1,6 @@
from __future__ import absolute_import, division, unicode_literals
-try:
- chr = unichr # flake8: noqa
-except NameError:
- pass
+from six import unichr as chr
from collections import deque
@@ -147,8 +144,8 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
output = "&"
charStack = [self.stream.char()]
- if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
- or (allowedChar is not None and allowedChar == charStack[0])):
+ if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
+ (allowedChar is not None and allowedChar == charStack[0])):
self.stream.unget(charStack[0])
elif charStack[0] == "#":
@@ -924,7 +921,7 @@ def attributeNameState(self):
if self.lowercaseAttrName:
self.currentToken["data"][-1][0] = (
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
- for name, value in self.currentToken["data"][:-1]:
+ for name, _ in self.currentToken["data"][:-1]:
if self.currentToken["data"][-1][0] == name:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"duplicate-attribute"})
@@ -1716,11 +1713,11 @@ def cdataSectionState(self):
else:
data.append(char)
- data = "".join(data)
+ data = "".join(data) # pylint:disable=redefined-variable-type
# Deal with null here rather than in the parser
nullCount = data.count("\u0000")
if nullCount > 0:
- for i in range(nullCount):
+ for _ in range(nullCount):
self.tokenQueue.append({"type": tokenTypes["ParseError"],
"data": "invalid-codepoint"})
data = data.replace("\u0000", "\uFFFD")
diff --git a/html5lib/treeadapters/__init__.py b/html5lib/treeadapters/__init__.py
index 57d71304..4f978466 100644
--- a/html5lib/treeadapters/__init__.py
+++ b/html5lib/treeadapters/__init__.py
@@ -5,7 +5,7 @@
__all__ = ["sax"]
try:
- from . import genshi # flake8: noqa
+ from . import genshi # noqa
except ImportError:
pass
else:
diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/_base.py
index 8196f591..900a724c 100644
--- a/html5lib/treebuilders/_base.py
+++ b/html5lib/treebuilders/_base.py
@@ -126,6 +126,7 @@ class TreeBuilder(object):
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
"""
+ # pylint:disable=not-callable
# Document class
documentClass = None
diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py
index 8656244f..b7df74b2 100644
--- a/html5lib/treebuilders/dom.py
+++ b/html5lib/treebuilders/dom.py
@@ -109,7 +109,7 @@ def getNameTuple(self):
nameTuple = property(getNameTuple)
- class TreeBuilder(_base.TreeBuilder):
+ class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable
def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
return weakref.proxy(self)
@@ -158,6 +158,7 @@ def insertText(self, data, parent=None):
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
+ # pylint:disable=protected-access
if Node.TEXT_NODE not in self.dom._child_node_types:
self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py
index 2c8ed19f..d394148d 100644
--- a/html5lib/treebuilders/etree.py
+++ b/html5lib/treebuilders/etree.py
@@ -1,4 +1,6 @@
from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
+
from six import text_type
import re
@@ -253,7 +255,7 @@ def serializeElement(element, indent=0):
return "\n".join(rv)
- def tostring(element):
+ def tostring(element): # pylint:disable=unused-variable
"""Serialize an element and its child nodes to a string"""
rv = []
filter = ihatexml.InfosetFilter()
@@ -307,7 +309,7 @@ def serializeElement(element):
return "".join(rv)
- class TreeBuilder(_base.TreeBuilder):
+ class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py
index 138b30bd..2a69769b 100644
--- a/html5lib/treebuilders/etree_lxml.py
+++ b/html5lib/treebuilders/etree_lxml.py
@@ -10,6 +10,7 @@
"""
from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
import warnings
import re
@@ -53,7 +54,6 @@ def _getChildNodes(self):
def testSerializer(element):
rv = []
- finalText = None
infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
def serializeElement(element, indent=0):
@@ -128,16 +128,12 @@ def serializeElement(element, indent=0):
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0)
- if finalText is not None:
- rv.append("|%s\"%s\"" % (' ' * 2, finalText))
-
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
- finalText = None
def serializeElement(element):
if not hasattr(element, "tag"):
@@ -173,9 +169,6 @@ def serializeElement(element):
serializeElement(element)
- if finalText is not None:
- rv.append("%s\"" % (' ' * 2, finalText))
-
return "".join(rv)
@@ -193,9 +186,11 @@ def __init__(self, namespaceHTMLElements, fullTree=False):
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
- def __init__(self, element, value={}):
+ def __init__(self, element, value=None):
+ if value is None:
+ value = {}
self._element = element
- dict.__init__(self, value)
+ dict.__init__(self, value) # pylint:disable=non-parent-init-called
for key, value in self.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
@@ -303,12 +298,14 @@ def insertDoctype(self, token):
self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
+ assert parent is None or parent is self.document
+ assert self.document._elementTree is None
self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None):
if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type):
- warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
+ warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token):
diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py
index 73c8e26a..d3b0c50e 100644
--- a/html5lib/treewalkers/etree.py
+++ b/html5lib/treewalkers/etree.py
@@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
- class TreeWalker(_base.NonRecursiveTreeWalker):
+ class TreeWalker(_base.NonRecursiveTreeWalker): # pylint:disable=unused-variable
"""Given the particular ElementTree representation, this implementation,
to avoid using recursion, returns "nodes" as tuples with the following
content:
@@ -38,7 +38,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
- elt, key, parents, flag = node
+ elt, _, _, flag = node
if flag in ("text", "tail"):
return _base.TEXT, getattr(elt, flag)
else:
diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py
index 83cd1654..61cbfede 100644
--- a/html5lib/treewalkers/genshistream.py
+++ b/html5lib/treewalkers/genshistream.py
@@ -25,7 +25,7 @@ def __iter__(self):
yield token
def tokens(self, event, next):
- kind, data, pos = event
+ kind, data, _ = event
if kind == START:
tag, attribs = data
name = tag.localname
diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py
index 36850086..7d99adc2 100644
--- a/html5lib/treewalkers/lxmletree.py
+++ b/html5lib/treewalkers/lxmletree.py
@@ -117,6 +117,7 @@ def __len__(self):
class TreeWalker(_base.NonRecursiveTreeWalker):
def __init__(self, tree):
+ # pylint:disable=redefined-variable-type
if hasattr(tree, "getroot"):
self.fragmentChildren = set()
tree = Root(tree)
diff --git a/html5lib/trie/__init__.py b/html5lib/trie/__init__.py
index a8cca8a9..a5ba4bf1 100644
--- a/html5lib/trie/__init__.py
+++ b/html5lib/trie/__init__.py
@@ -4,9 +4,11 @@
Trie = PyTrie
+# pylint:disable=wrong-import-position
try:
from .datrie import Trie as DATrie
except ImportError:
pass
else:
Trie = DATrie
+# pylint:enable=wrong-import-position
diff --git a/html5lib/trie/_base.py b/html5lib/trie/_base.py
index 724486b1..25eece46 100644
--- a/html5lib/trie/_base.py
+++ b/html5lib/trie/_base.py
@@ -7,7 +7,8 @@ class Trie(Mapping):
"""Abstract base class for tries"""
def keys(self, prefix=None):
- keys = super().keys()
+ # pylint:disable=arguments-differ
+ keys = super(Trie, self).keys()
if prefix is None:
return set(keys)
diff --git a/html5lib/utils.py b/html5lib/utils.py
index c70de172..5fe237a0 100644
--- a/html5lib/utils.py
+++ b/html5lib/utils.py
@@ -22,12 +22,12 @@
# surrogates, and there is no mechanism to further escape such
# escapes.
try:
- _x = eval('"\\uD800"')
+ _x = eval('"\\uD800"') # pylint:disable=eval-used
if not isinstance(_x, text_type):
# We need this with u"" because of http://bugs.jython.org/issue2039
- _x = eval('u"\\uD800"')
+ _x = eval('u"\\uD800"') # pylint:disable=eval-used
assert isinstance(_x, text_type)
-except:
+except: # pylint:disable=bare-except
supports_lone_surrogates = False
else:
supports_lone_surrogates = True
@@ -52,7 +52,7 @@ def __init__(self, items=()):
# anything here.
_dictEntries = []
for name, value in items:
- if type(name) in (list, tuple, frozenset, set):
+ if isinstance(name, (list, tuple, frozenset, set)):
for item in name:
_dictEntries.append((item, value))
else:
diff --git a/parse.py b/parse.py
index cceea84d..2ed8f1c2 100755
--- a/parse.py
+++ b/parse.py
@@ -5,7 +5,6 @@
"""
import sys
-import os
import traceback
from optparse import OptionParser
@@ -15,9 +14,10 @@
from html5lib import constants
from html5lib import utils
+
def parse():
optParser = getOptParser()
- opts,args = optParser.parse_args()
+ opts, args = optParser.parse_args()
encoding = "utf8"
try:
@@ -25,7 +25,10 @@ def parse():
# Try opening from the internet
if f.startswith('http://'):
try:
- import urllib.request, urllib.parse, urllib.error, cgi
+ import urllib.request
+ import urllib.parse
+ import urllib.error
+ import cgi
f = urllib.request.urlopen(f)
contentType = f.headers.get('content-type')
if contentType:
@@ -41,7 +44,7 @@ def parse():
try:
# Try opening from file system
f = open(f, "rb")
- except IOError as e:
+ except IOError as e:
sys.stderr.write("Unable to open file: %s\n" % e)
sys.exit(1)
except IndexError:
@@ -82,14 +85,15 @@ def parse():
if document:
printOutput(p, document, opts)
t2 = time.time()
- sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
+ sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
else:
- sys.stderr.write("\n\nRun took: %fs"%(t1-t0))
+ sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
else:
document = run(parseMethod, f, encoding, opts.scripting)
if document:
printOutput(p, document, opts)
+
def run(parseMethod, f, encoding, scripting):
try:
document = parseMethod(f, encoding=encoding, scripting=scripting)
@@ -98,6 +102,7 @@ def run(parseMethod, f, encoding, scripting):
traceback.print_exc()
return document
+
def printOutput(parser, document, opts):
if opts.encoding:
print("Encoding:", parser.tokenizer.stream.charEncoding)
@@ -116,7 +121,7 @@ def printOutput(parser, document, opts):
elif tb == "etree":
sys.stdout.write(utils.default_etree.tostring(document))
elif opts.tree:
- if not hasattr(document,'__getitem__'):
+ if not hasattr(document, '__getitem__'):
document = [document]
for fragment in document:
print(parser.tree.testSerializer(fragment))
@@ -126,7 +131,7 @@ def printOutput(parser, document, opts):
kwargs = {}
for opt in serializer.HTMLSerializer.options:
try:
- kwargs[opt] = getattr(opts,opt)
+ kwargs[opt] = getattr(opts, opt)
except:
pass
if not kwargs['quote_char']:
@@ -142,12 +147,14 @@ def printOutput(parser, document, opts):
encoding = "utf-8"
for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
sys.stdout.write(text)
- if not text.endswith('\n'): sys.stdout.write('\n')
+ if not text.endswith('\n'):
+ sys.stdout.write('\n')
if opts.error:
- errList=[]
+ errList = []
for pos, errorcode, datavars in parser.errors:
- errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
- sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
+ errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
+ sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
+
def getOptParser():
parser = OptionParser(usage=__doc__)
diff --git a/setup.cfg b/setup.cfg
index 2a9acf13..3152ac54 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,11 @@
[bdist_wheel]
universal = 1
+
+[pep8]
+ignore = N
+max-line-length = 139
+exclude = .git,__pycache__,.tox,doc
+
+[flake8]
+ignore = N
+max-line-length = 139
diff --git a/setup.py b/setup.py
index b6ea24af..b42ba400 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
from setuptools import setup
-classifiers=[
+classifiers = [
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'License :: OSI Approved :: MIT License',
@@ -20,9 +20,9 @@
'Programming Language :: Python :: 3.5',
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: Text Processing :: Markup :: HTML'
- ]
+]
-packages = ['html5lib'] + ['html5lib.'+name
+packages = ['html5lib'] + ['html5lib.' + name
for name in os.listdir(os.path.join('html5lib'))
if os.path.isdir(os.path.join('html5lib', name)) and
not name.startswith('.') and name != 'tests']
@@ -39,9 +39,9 @@
assignments = filter(lambda x: isinstance(x, ast.Assign), t.body)
for a in assignments:
if (len(a.targets) == 1 and
- isinstance(a.targets[0], ast.Name) and
- a.targets[0].id == "__version__" and
- isinstance(a.value, ast.Str)):
+ isinstance(a.targets[0], ast.Name) and
+ a.targets[0].id == "__version__" and
+ isinstance(a.value, ast.Str)):
version = a.value.s
setup(name='html5lib',
diff --git a/utils/entities.py b/utils/entities.py
index 116a27cb..6dccf5f0 100644
--- a/utils/entities.py
+++ b/utils/entities.py
@@ -2,50 +2,59 @@
import html5lib
+
def parse(path="html5ents.xml"):
return html5lib.parse(open(path), treebuilder="lxml")
+
def entity_table(tree):
return dict((entity_name("".join(tr[0].xpath(".//text()"))),
entity_characters(tr[1].text))
for tr in tree.xpath("//h:tbody/h:tr",
- namespaces={"h":"http://www.w3.org/1999/xhtml"}))
+ namespaces={"h": "http://www.w3.org/1999/xhtml"}))
+
def entity_name(inp):
return inp.strip()
+
def entity_characters(inp):
return "".join(codepoint_to_character(item)
- for item in inp.split()
- if item)
+ for item in inp.split()
+ if item)
+
def codepoint_to_character(inp):
- return ("\U000"+inp[2:]).decode("unicode-escape")
+ return ("\\U000" + inp[2:]).decode("unicode-escape")
+
def make_tests_json(entities):
test_list = make_test_list(entities)
tests_json = {"tests":
- [make_test(*item) for item in test_list]
+ [make_test(*item) for item in test_list]
}
return tests_json
+
def make_test(name, characters, good):
return {
- "description":test_description(name, good),
- "input":"&%s"%name,
- "output":test_expected(name, characters, good)
- }
+ "description": test_description(name, good),
+ "input": "&%s" % name,
+ "output": test_expected(name, characters, good)
+ }
+
def test_description(name, good):
with_semicolon = name.endswith(";")
- semicolon_text = {True:"with a semi-colon",
- False:"without a semi-colon"}[with_semicolon]
+ semicolon_text = {True: "with a semi-colon",
+ False: "without a semi-colon"}[with_semicolon]
if good:
- text = "Named entity: %s %s"%(name, semicolon_text)
+ text = "Named entity: %s %s" % (name, semicolon_text)
else:
- text = "Bad named entity: %s %s"%(name, semicolon_text)
+ text = "Bad named entity: %s %s" % (name, semicolon_text)
return text
+
def test_expected(name, characters, good):
rv = []
if not good or not name.endswith(";"):
@@ -53,6 +62,7 @@ def test_expected(name, characters, good):
rv.append(["Character", characters])
return rv
+
def make_test_list(entities):
tests = []
for entity_name, characters in entities.items():
@@ -61,20 +71,23 @@ def make_test_list(entities):
tests.append((entity_name, characters, True))
return sorted(tests)
+
def subentity_exists(entity_name, entities):
for i in range(1, len(entity_name)):
if entity_name[:-i] in entities:
return True
return False
+
def make_entities_code(entities):
- entities_text = "\n".join(" \"%s\": u\"%s\","%(
- name, entities[name].encode(
- "unicode-escape").replace("\"", "\\\""))
- for name in sorted(entities.keys()))
+ entities_text = "\n".join(" \"%s\": u\"%s\"," % (
+ name, entities[name].encode(
+ "unicode-escape").replace("\"", "\\\""))
+ for name in sorted(entities.keys()))
return """entities = {
%s
-}"""%entities_text
+}""" % entities_text
+
def main():
entities = entity_table(parse())
@@ -85,4 +98,3 @@ def main():
if __name__ == "__main__":
main()
-
diff --git a/utils/spider.py b/utils/spider.py
index ac5f9fbe..3a325888 100644
--- a/utils/spider.py
+++ b/utils/spider.py
@@ -7,7 +7,9 @@
s.spider("http://www.google.com", maxURLs=100)
"""
-import urllib.request, urllib.error, urllib.parse
+import urllib.request
+import urllib.error
+import urllib.parse
import urllib.robotparser
import md5
@@ -16,11 +18,13 @@
import html5lib
from html5lib.treebuilders import etree
+
class Spider(object):
+
def __init__(self):
self.unvisitedURLs = set()
self.visitedURLs = set()
- self.buggyURLs=set()
+ self.buggyURLs = set()
self.robotParser = urllib.robotparser.RobotFileParser()
self.contentDigest = {}
self.http = httplib2.Http(".cache")
@@ -70,18 +74,18 @@ def updateURLs(self, tree):
update the list of visited and unvisited URLs according to whether we
have seen them before or not"""
urls = set()
- #Remove all links we have already visited
+ # Remove all links we have already visited
for link in tree.findall(".//a"):
- try:
- url = urllib.parse.urldefrag(link.attrib['href'])[0]
- if (url and url not in self.unvisitedURLs and url
+ try:
+ url = urllib.parse.urldefrag(link.attrib['href'])[0]
+ if (url and url not in self.unvisitedURLs and url
not in self.visitedURLs):
- urls.add(url)
- except KeyError:
- pass
+ urls.add(url)
+ except KeyError:
+ pass
- #Remove all non-http URLs and add a suitable base URL where that is
- #missing
+ # Remove all non-http URLs and add a suitable base URL where that is
+ # missing
newUrls = set()
for url in urls:
splitURL = list(urllib.parse.urlsplit(url))
@@ -93,23 +97,22 @@ def updateURLs(self, tree):
urls = newUrls
responseHeaders = {}
- #Now we want to find the content types of the links we haven't visited
+ # Now we want to find the content types of the links we haven't visited
for url in urls:
try:
resp, content = self.http.request(url, "HEAD")
responseHeaders[url] = resp
- except AttributeError as KeyError:
- #Don't know why this happens
+ except AttributeError:
+ # Don't know why this happens
pass
-
- #Remove links not of content-type html or pages not found
- #XXX - need to deal with other status codes?
+ # Remove links not of content-type html or pages not found
+ # XXX - need to deal with other status codes?
toVisit = set([url for url in urls if url in responseHeaders and
- "html" in responseHeaders[url]['content-type'] and
- responseHeaders[url]['status'] == "200"])
+ "html" in responseHeaders[url]['content-type'] and
+ responseHeaders[url]['status'] == "200"])
- #Now check we are allowed to spider the page
+ # Now check we are allowed to spider the page
for url in toVisit:
robotURL = list(urllib.parse.urlsplit(url)[:2])
robotURL.extend(["robots.txt", "", ""])