-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaudit.py
More file actions
106 lines (92 loc) · 4.39 KB
/
audit.py
File metadata and controls
106 lines (92 loc) · 4.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# audit.py
import xml.etree.cElementTree as ET
import re
""" Finding unique tags and frequency of the tags """
def count_tags(filename):
    """Count how often each element tag occurs in an XML document.

    Args:
        filename: Source handed straight to ``ET.iterparse`` (a path to
            the XML file, or anything else ``iterparse`` accepts).

    Returns:
        dict mapping each tag name to the number of elements carrying it.
    """
    tags = {}
    # Only the tag name is needed, and it is already known when the opening
    # tag has been read, so "start" events are sufficient here.
    for _event, elem in ET.iterparse(filename, events=("start",)):
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
    return tags
""" Finding unique keys of tag element and frequency of the keys """
def count_tag_keys(filename):
    """Count how often each ``k`` attribute value occurs on ``<tag>`` elements.

    Args:
        filename: Source handed straight to ``ET.iterparse`` (a path to
            the XML file, or anything else ``iterparse`` accepts).

    Returns:
        dict mapping each ``k`` value to the number of ``<tag>`` elements
        that carry it. A ``<tag>`` without a ``k`` attribute is counted
        under the key ``None``.
    """
    tag_keys = {}
    for _event, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag != "tag":
            continue
        # Every <tag> gets its own "start" event, so counting the element
        # itself visits each one exactly once. Walking elem.iter('tag') here
        # would depend on whether children happen to be attached yet and
        # could count a nested <tag> twice.
        k = elem.get('k')
        tag_keys[k] = tag_keys.get(k, 0) + 1
    return tag_keys
""" Finding problematic keys """
# Key-classification patterns, compiled once at import time.
# Letters and underscores only (this also matches the empty string).
without_colons = re.compile(r'^([a-z]|_)*$', re.IGNORECASE)
# Letters/underscores followed by exactly one trailing digit, e.g. "name_1".
without_colons_num = re.compile(r'^([a-z]|_)*\d$', re.IGNORECASE)
# Begins with two letter/underscore runs joined by a colon, e.g. "addr:street".
# Only the start is anchored, so anything may follow the second run.
with_colons = re.compile(r'^([a-z]|_)+:([a-z]|_)+', re.IGNORECASE)
# Contains at least one character from the listed punctuation/whitespace set.
problemchars = re.compile(r'[=\+/&<>;\'\"\?%#$@\,\. \t\r\n]')


def _key_category(key):
    """Return the category name for one tag key; the first matching rule wins."""
    if without_colons.search(key) or without_colons_num.search(key):
        return "without_colons"
    if with_colons.search(key):
        return "with_colons"
    if problemchars.search(key):
        return "problemchars"
    return "other"


def count_key_types(filename):
    """Count ``<tag>`` elements per key category.

    Args:
        filename: Source handed straight to ``ET.iterparse`` (a path to
            the XML file, or anything else ``iterparse`` accepts).

    Returns:
        dict with the keys ``"without_colons"``, ``"with_colons"``,
        ``"problemchars"`` and ``"other"``, each mapped to the number of
        ``<tag>`` elements whose ``k`` attribute falls in that category.

    Raises:
        KeyError: If a ``<tag>`` element has no ``k`` attribute.
    """
    count_keys = {"without_colons": 0, "with_colons": 0, "problemchars": 0, "other": 0}
    for _event, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag == "tag":
            # Each <tag> has its own "start" event, so classify the element
            # itself exactly once instead of walking its descendants.
            count_keys[_key_category(elem.attrib['k'])] += 1
    return count_keys


def key_type(filename):
    """Collect the distinct ``<tag>`` keys per key category.

    Args:
        filename: Source handed straight to ``ET.iterparse`` (a path to
            the XML file, or anything else ``iterparse`` accepts).

    Returns:
        dict with the keys ``"without_colons"``, ``"with_colons"``,
        ``"problemchars"`` and ``"other"``, each mapped to the set of
        ``k`` values that fall in that category.

    Raises:
        KeyError: If a ``<tag>`` element has no ``k`` attribute.
    """
    keys = {"without_colons": set(), "with_colons": set(), "problemchars": set(), "other": set()}
    for _event, elem in ET.iterparse(filename, events=('start',)):
        if elem.tag == "tag":
            # Classify and store the same key, so a key can never be filed
            # under a category decided by a different element.
            key = elem.attrib['k']
            keys[_key_category(key)].add(key)
    return keys
""" Finding unique street types """
# Picks the trailing non-whitespace token of a street name, which is normally
# its last word including an optional final period (e.g. "Street", "Ave.").
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


def get_street_types(filename):
    """Collect the distinct street types used in ``addr:street`` values.

    Looks at the ``<tag>`` descendants of ``node`` and ``way`` elements,
    gathers the non-empty ``v`` values of those whose ``k`` is
    ``"addr:street"``, and extracts the trailing token of each with
    ``street_type_re``.

    Args:
        filename: Source handed straight to ``ET.iterparse`` (a path to
            the XML file, or anything else ``iterparse`` accepts).

    Returns:
        set of street-type strings.

    Raises:
        KeyError: If an inspected ``<tag>`` element has no ``k`` attribute.
    """
    street_names = set()
    # "end" events are required: at a "start" event the children of a node
    # or way may not have been parsed yet, so their <tag> elements could be
    # missed.
    for _event, elem in ET.iterparse(filename, events=("end",)):
        if elem.tag not in ('node', 'way'):
            continue
        for tag in elem.iter("tag"):
            if tag.attrib['k'] == "addr:street":
                v = tag.get('v')
                # Skip both an empty and a missing value; None would break
                # the regex search below.
                if v:
                    street_names.add(v)

    street_types = set()
    for street_name in street_names:
        m = street_type_re.search(street_name)
        if m:
            # group() with no argument returns the whole match.
            street_types.add(m.group())
    return street_types
""" Finding unique users """
def get_user(filename):
    """Collect the distinct ``uid`` values of node, way and relation elements.

    Args:
        filename: Source handed straight to ``ET.iterparse`` (a path to
            the XML file, or anything else ``iterparse`` accepts).

    Returns:
        set of ``uid`` attribute values (strings). Elements whose ``uid``
        is missing or empty contribute nothing.
    """
    users = set()
    # Attributes are available as soon as the opening tag has been read, so
    # "start" events are sufficient here.
    for _event, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag in ('node', 'way', 'relation'):
            # .get() instead of attrib[...] so an element without a uid is
            # skipped rather than aborting the whole scan with KeyError.
            uid = elem.get('uid')
            if uid:
                users.add(uid)
    return users