-
Notifications
You must be signed in to change notification settings - Fork 181
Expand file tree
/
Copy pathverify_html_resources.py
More file actions
80 lines (62 loc) · 2.23 KB
/
verify_html_resources.py
File metadata and controls
80 lines (62 loc) · 2.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python3
import argparse
import os
import sys
import urllib.parse
from lxml import html
# ``rel`` values of a <link> element whose ``href`` is checked as a resource.
# should_check_href() compares against these after stripping and lower-casing
# the attribute, so every entry is written in lower case.
RESOURCE_RELS = {'stylesheet', 'icon', 'shortcut icon', 'apple-touch-icon'}
def is_external(url):
    """Return True when *url* has a scheme or a network location.

    ``https://host/x``, ``//host/x`` and ``mailto:a@b`` all count as external;
    plain relative paths, root-relative paths and bare fragments do not.
    """
    parts = urllib.parse.urlparse(url)
    if parts.scheme:
        return True
    return bool(parts.netloc)
def check_attr(root, html_file, attr, value):
    """Check that the local file referenced by an attribute value exists.

    Args:
        root: Directory the reported target is made relative to.  URLs that
            start with ``/`` are also resolved against this directory.
        html_file: Path of the HTML file that carries the attribute.
        attr: Attribute name; passed through to the result unchanged.
        value: Raw attribute value (a URL reference).

    Returns:
        None when *value* is empty, starts with ``#`` / ``mailto:`` /
        ``javascript:`` / ``data:``, is external, has no path component, or
        resolves to an existing path.  Otherwise the tuple
        ``(html_file, attr, value, target)`` where *target* is the missing
        path expressed relative to *root*.
    """
    if not value or value.startswith(('#', 'mailto:', 'javascript:', 'data:')):
        return None
    if is_external(value):
        return None
    parsed = urllib.parse.urlparse(value)
    if not parsed.path:
        return None
    # Only the path is looked up on disk: query string and fragment are
    # dropped and percent-escapes are decoded.
    decoded = urllib.parse.unquote(parsed.path)
    if decoded.startswith('/'):
        # os.path.join() throws away earlier components when a later one is
        # absolute, which would resolve a root-relative URL from the
        # filesystem root.  Anchor it at *root* instead.
        base = root
        decoded = decoded.lstrip('/')
    else:
        base = os.path.dirname(html_file)
    target = os.path.normpath(os.path.join(base, decoded))
    if os.path.exists(target):
        return None
    return html_file, attr, value, os.path.relpath(target, root)
def should_check_href(element):
    """Return True when *element* is a <link> whose ``rel`` names a resource.

    The ``rel`` attribute is stripped and lower-cased, then matched against
    RESOURCE_RELS both as a whole string (so multi-word entries such as
    ``'shortcut icon'`` keep working) and token by token, because ``rel`` is
    a space-separated list and values like ``'alternate stylesheet'`` still
    reference a stylesheet.  Any element other than <link> yields False.
    """
    if element.tag != 'link':
        return False
    rel = (element.get('rel') or '').strip().lower()
    if rel in RESOURCE_RELS:
        return True
    # split() with no argument also tolerates repeated or mixed whitespace.
    return any(token in RESOURCE_RELS for token in rel.split())
def main():
    """Report missing local resources referenced by HTML files under a root.

    The root directory is the single positional command-line argument.  Every
    file whose name ends in ``.html`` below it is parsed; each ``src``
    attribute, and each ``href`` accepted by should_check_href(), is passed
    to check_attr().  At most the first 200 problems are printed, followed by
    a count of the remainder.

    Returns:
        1 if any missing resource was found, otherwise 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('root')
    args = parser.parse_args()
    root = os.path.abspath(args.root)
    report_limit = 200
    missing = []
    for dirpath, dirnames, filenames in os.walk(root):
        # os.walk() yields entries in arbitrary order.  Sorting the directory
        # list in place (which steers the walk) and the file names makes the
        # report, including which entries survive truncation, reproducible.
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.endswith('.html'):
                continue
            html_file = os.path.join(dirpath, filename)
            doc = html.parse(html_file)
            for element in doc.iter():
                # 'src' is always checked; 'href' only for resource links.
                for attr in ('src', 'href'):
                    value = element.get(attr)
                    if value is None:
                        continue
                    if attr == 'href' and not should_check_href(element):
                        continue
                    item = check_attr(root, html_file, attr, value)
                    if item:
                        missing.append(item)
    for html_file, attr, value, target in missing[:report_limit]:
        print('{}: missing {}={!r} -> {}'.format(
            os.path.relpath(html_file, root), attr, value, target))
    if len(missing) > report_limit:
        print('... {} more missing resources'.format(
            len(missing) - report_limit))
    return 1 if missing else 0
if __name__ == '__main__':
    # Use main()'s return value (0 or 1) as the process exit status.
    exit_status = main()
    sys.exit(exit_status)