This repository was archived by the owner on Nov 29, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanapyzerparser.py
More file actions
232 lines (192 loc) · 10.3 KB
/
anapyzerparser.py
File metadata and controls
232 lines (192 loc) · 10.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# Import the pathlib library for cross platform file path abstraction
import pathlib
# Import the re library to support regular expressions
import re
# The AnaPyzerParser class contains all methods involved in parsing information from a text or log file.
class AnaPyzerParser:
# Constructor
def __init__(self):
self.DEFAULT_FILE_PATH = pathlib.Path.home()
self._in_file_path = pathlib.Path('')
self._error_listener = None
self._success_listener = None
# parse_common_apache_to_list() parses an apache log that has been exported in the common, default format by an
# apache web server. This method is in need of additional work that will make it functional with logs
# that have custom configurations
# Reference for Common Log Format:
# https://httpd.apache.org/docs/1.3/logs.html#common
@staticmethod
def parse_common_apache_to_list(in_file):
if not in_file:
return None
log_data = {}
# universal_names = ['date', 'timestamp', 'service-name', 'server-name', 'server-ip', 'method', 'uri-stem',
# 'uri-query', 'server-port', 'username', 'client-ip', 'user-agent', 'cookie',
# 'referrer', 'host', 'http-status', 'protocol-substatus', 'win32-status', 'bytes-sent',
# 'bytes-received', 'time-taken'] - Dan unused variable
# initialize placeholder variables in log_data array representing each of the w3c format parameters
i = 0
# as long as there are lines in the log
# Split string into list of individual words with space as delimiter lines in the file, loop:
for line in in_file:
# Use split to cut date/timestamp combined line out of data line
date_ts = line.split('[', 1)
# Use split to separate date and timestamp
date_ts = date_ts[1].split(":", 1)
# Isolate timestamp from remaining information in line
date_ts[1] = date_ts[1].split(' ', 1)[0]
# Create new split line for extracting other data
split_line = line.split(' ')
request_info = line.split('"', 2)[1]
referer = "-"
method = request_info.split('/', 1)[0]
# if the request was a GET method, then uri-stem server-client status and bytes received data should exist
if "GET" in method:
uri_stem = request_info.split(' ')[1]
sc_status = split_line[8]
bytes_received = split_line[9]
else:
uri_stem = '-'
sc_status = '-'
bytes_received = '0'
client_ip = split_line[0]
data = [date_ts[0], date_ts[1], client_ip, method, uri_stem, sc_status, bytes_received, referer]
log_data[i] = data
i += 1
# length represents the number of lines of DATA present in returned parsed list
log_data['length'] = i
log_data['date'] = 0
log_data['timestamp'] = 1
log_data['client-ip'] = 2
log_data['method'] = 3
log_data['uri-stem'] = 4
log_data['sc-status'] = 5
log_data['bytes-sent'] = 6
log_data['referer'] = 7
if log_data['length'] is 0:
log_data = None
# return the list containing data
return log_data
# parse_w3c_to_list will parse all information from an IIS/W3C format log into a list
# With the locations of each field denoted in the parsed_log['parameter'] field
# For instance, if c-ip is parsed into the 2 index of each line, requesting parsed_log['c-ip'] will return 2
# This also works in reverse, so if you need the c-ip from each line, you request parsed_log[parsed_log['c-ip']]
# For information on what each tag means refer to:
# https://stackify.com/how-to-interpret-iis-logs/
@classmethod
def parse_w3c_to_list(cls, in_file):
if not in_file:
return None
log_data = {}
potential_parameters = ['date', 'time', 's-sitename', 's-computername', 's-ip', 'cs-method', 'cs-uri-stem',
'cs-uri-query', 's-port', 'cs-username', 'c-ip', 'cs(UserAgent)', 'cs(Cookie)',
'cs(Referer)', 'cs-host', 'sc-status', 'sc-substatus', 'sc-win32-status', 'sc-bytes',
'cs-bytes', 'time-taken']
universal_names = ['date', 'timestamp', 'service-name', 'server-name', 'server-ip', 'method', 'uri-stem',
'uri-query', 'server-port', 'username', 'client-ip', 'user-agent', 'cookie',
'referer', 'host', 'http-status', 'protocol-substatus', 'win32-status', 'bytes-sent',
'bytes-received', 'time-taken']
log_data['fields'] = -1
# initialize placeholder variables in log_data array representing each of the w3c format parameters
for parameter in potential_parameters:
log_data[parameter] = -1
i = 0
# as long as there are lines in the file, loop:
for line in in_file:
# Split string into list of individual words with space as delimiter
split_line = line.split(' ')
# Every header line at the top of the log will start with a #, making it
# easy to differentiate between data and the header
if '#' in split_line[0]:
log_data[str(split_line[0])] = split_line
if '#Date' in split_line[0] and log_data['date'] == -1:
log_data['date'] = split_line[1]
if '#Fields' in split_line[0]:
# Check the fields line for all available data being logged
log_data['fields'] = 1
j = 0
for element in split_line:
for parameter in potential_parameters:
if element == parameter:
log_data[parameter] = j - 1
j += 1
else:
if log_data['fields'] == -1:
raise IndexError()
log_data[i] = split_line
i += 1
# once log file is parsed, assign the new positions of each requested parameter in the log_data list
# this will prevent issues when using methods that rely on tagged element values representing element
# placement in array
k = 0
for parameter in potential_parameters:
# add an index in the log_data array representing the universal name for each field
log_data[universal_names[k]] = log_data[parameter]
k += 1
# length represents the number of lines of DATA present in returned parsed list
log_data['length'] = i
if log_data['length'] is 0:
log_data = None
# return the list containing CSV data
return log_data
# requested parameters list can consist of the following, using the official IIS naming convention found in header
# For information on what each tag means refer to:
# https://stackify.com/how-to-interpret-iis-logs/
@classmethod
def parse_w3c_requested_to_list(cls, in_file, requested_parameters):
if not in_file:
return None
log_data = {}
potential_parameters = ['date', 'time', 's-sitename', 's-computername', 's-ip', 'cs-method', 'cs-uri-stem',
'cs-uri-query', 's-port', 'cs-username', 'c-ip', 'cs(UserAgent)', 'cs(Cookie)',
'cs(Referer)', 'cs-host', 'sc-status', 'sc-substatus', 'sc-win32-status', 'sc-bytes',
'cs-bytes', 'time-taken']
log_data['header'] = False
# initialize placeholder variables in log_data array representing each of the w3c format parameters
for parameter in potential_parameters:
log_data[parameter] = -1
i = 0
# as long as there are lines in the file, loop:
for line in in_file:
# Split string into list of individual words with space as delimiter
split_line = line.split(' ')
# Every header line at the top of the log will start with a #, making it
# easy to differentiate between data and the header
if '#' in split_line[0]:
log_data[str(split_line[0])] = split_line
log_data['header'] = True
if '#Date' in split_line[0] and log_data['date'] == -1:
log_data['date'] = split_line[1]
if '#Fields' in split_line[0]:
# Check the fields line for all available data being logged
# j keeps count of the placement of each parameter in the split line
j = 0
for element in split_line:
# iterate through list of potential parameters
for parameter in potential_parameters:
# mark the location of each parameter present in relation to the list to be created
if element == parameter:
log_data[parameter] = j
j += 1
else:
if not log_data['header']:
return None
# initialize log_data[i] as a blank list to allow for use of append method
log_data[i] = []
# iterate through array of requested_parameters and add the information requested to the parsed list
for parameter in requested_parameters:
if log_data.get(parameter):
log_data[i].append(split_line[log_data[parameter]])
else:
pass
i += 1
# once log file is parsed, assign the new positions of each requested parameter in the log_data list to
# prevent issues when using methods that rely on tagged element values representing element placement in array
k = 0
for parameter in requested_parameters:
log_data[parameter] = k
k += 1
# length represents the number of lines of DATA present in returned parsed list
log_data['length'] = i
# return the list containing w3c data
return log_data