-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbrowserhandler.py
More file actions
347 lines (291 loc) · 14.3 KB
/
browserhandler.py
File metadata and controls
347 lines (291 loc) · 14.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
import sqlite3
import os.path
import time
import json
import glob
import sys
# Per-user home directory; every browser data location below is built from it.
USER_DIR = os.path.expanduser('~')  # Cross-platform courtesy of Python

# Apple Safari constants
SAFARI_HISTORY_DB = USER_DIR + '/Library/Safari/History.db'
# Safari timestamps count seconds from midnight UTC on 1 January 2001. This is
# that instant expressed as a Unix timestamp (seconds since 1 January 1970).
SAFARI_EPOCH = 978307200
# Seconds between 1/1/1601 (the Chrome/WebKit epoch) and 1/1/2001 (the Safari
# epoch): 11,644,473,600 s from 1601 to 1970, plus SAFARI_EPOCH. This bridges
# Safari and Chrome timestamps.
SAFARI_EPOCH_WEBKIT = 11_644_473_600 + SAFARI_EPOCH

# Mozilla Firefox constants
if sys.platform == 'darwin':
    # Apple Macintosh
    FIREFOX_DIR = USER_DIR + '/Library/Application Support/Firefox'
elif sys.platform == 'win32':
    # Microsoft Windows
    FIREFOX_DIR = USER_DIR + '/AppData/Roaming/Mozilla/Firefox'
else:
    # UNIX arts-and-crafts. The leading slash matters: USER_DIR has no
    # trailing separator, so omitting it yields e.g. '/home/me.mozilla/firefox'.
    FIREFOX_DIR = USER_DIR + '/.mozilla/firefox'

# Vivaldi constants
if sys.platform == 'darwin':
    VIVALDI_DIR = USER_DIR + '/Library/Application Support/Vivaldi'
elif sys.platform == 'win32':
    VIVALDI_DIR = USER_DIR + '/AppData/Local/Vivaldi'
else:
    # Same separator caveat as for FIREFOX_DIR above.
    VIVALDI_DIR = USER_DIR + '/.config/vivaldi'
class VisitContainer:
    """Plain record describing one page visit taken from a browser history."""

    # Class-level fallbacks, visible on the class itself as well as on instances.
    url, title = None, None
    time = counter = duration = 0

    def __init__(self, url, title, time, counter, duration=0):
        """Store the visit's URL, page title, timestamp, visit count and duration.

        `duration` defaults to 0. Only Chrome-based browsers supply it, and
        calculating it from timestamps is error prone, so it is only used
        with those.
        """
        self.url, self.title = url, title
        self.time, self.counter = time, counter
        self.duration = duration
class SafariHistory:
    """Holder for browser-wide Safari attributes alongside the per-domain visits."""

    def __init__(self):
        """Start with no entries and a sentinel maximum below any real count."""
        # Maps a domain key to the list of visits recorded for it.
        self.entries = dict()
        # Highest visit count seen so far; -1 means nothing has been recorded.
        self.maximum_counter = -1
def _safari_domain_from_url(url):
    """Derive a grouping key from a URL for items that have no domain_expansion."""
    # https://www.
    # 012345678901
    # Index 4 is 's' only for https; anything else is treated as having a
    # seven-character 'http://' prefix. Slicing avoids an IndexError on very
    # short strings.
    prefix_length = 8 if url[4:5] == 's' else 7
    host = url[prefix_length:].split('/')[0]
    host = os.path.splitext(host)[0]  # drop the final dotted component, e.g. '.com'
    return host.replace('www.', '')


def get_all_safari_data():
    """Collect Safari visits from the last 14 days, grouped by domain.

    Returns a dict mapping a domain key to a list of VisitContainer objects.
    The key is the item's domain_expansion, or a key derived from the URL when
    that column is NULL. Every non-NULL domain_expansion in the database gets
    a key, even when it has no recent visits. Each visit's `time` is the
    number of seconds between the start of the 14-day window and the visit.

    Raises sqlite3.Error if the history database cannot be opened or queried.
    """
    # This was written before Safari had profiles, and they are thus not supported.
    # SQLITE3 Database Structures:
    # history_items table:
    # index 0: ID, an integer
    # index 1: URL, a TEXT
    # index 2: domain_expansion, a TEXT
    # index 3: visit_count, an INTEGER
    # index 4: daily_visit_counters, a BLOB
    # index 5: weekly_visit_counts, a BLOB
    # index 6: autocomplete_triggers, a BLOB
    # index 7: should_recompute_derived_visit_counts, an INTEGER
    # index 8: visit_count_score, an INTEGER
    # index 9: status_code, an INTEGER
    # history_visits table:
    # index 0: id, an INTEGER (NB that IDs are not shared across tables)
    # index 1: history_item, an INTEGER
    # index 2: visit_time, an INTEGER (expressed in seconds elapsed since midnight UTC on 1 January 2001)
    # index 3: title, a TEXT
    # index 4: load_successful, a BOOLEAN
    # index 5: http_non_get, a BOOLEAN
    # index 6: synthesized, a BOOLEAN
    # index 7: redirect_source, an INTEGER (presumably, an ID)
    # index 8: redirect_destination, an INTEGER (also presumably an ID)
    # index 9: origin, an INTEGER (also also presumably an ID)
    # index 10: generation, an INTEGER
    # index 11: attributes, an INTEGER
    # index 12: score, an INTEGER
    connection = sqlite3.connect(SAFARI_HISTORY_DB)
    try:
        cur = connection.cursor()
        # Start of the 14-day window, expressed in Safari time.
        cutoff = (time.time() - 86400 * 14) - SAFARI_EPOCH
        visits = cur.execute('select * from history_visits where visit_time > ?', [cutoff]).fetchall()
        domains = cur.execute('select distinct domain_expansion from history_items').fetchall()
        results = SafariHistory()
        for (domain_expansion,) in domains:
            # Rows are one-element tuples; test the value, not the row, so
            # that the key None cannot creep in.
            if domain_expansion is not None:
                results.entries[domain_expansion] = []
        for visit in visits:
            history_item = cur.execute('select * from history_items where id = ?', [visit[1]]).fetchone()
            if history_item is None:
                # Orphaned visit: nothing to describe it with, so skip it.
                continue
            domain = history_item[2]
            if domain is None:
                domain = _safari_domain_from_url(history_item[1])
            data = VisitContainer(
                history_item[1],
                visit[3],
                visit[2] - cutoff,
                history_item[3]
            )
            if data.counter > results.maximum_counter:
                results.maximum_counter = data.counter
            results.entries.setdefault(domain, []).append(data)
    finally:
        connection.close()
    return results.entries
class VivaldiProfile:
    """History of a single Vivaldi profile, loaded from its 'History' database.

    Attributes:
        maximum_counter: largest visit count among loaded visits, or -1 if none.
        maximum_duration: largest visit duration among loaded visits, or -1 if none.
        cursor: sqlite3 cursor on the profile's History database.
        entries: dict mapping url_for_deduping to a list of VisitContainer objects.
    """
    # Various bits of data are stored in ~/Library/Application Support/Vivaldi/
    # Separate directories are used for each profile
    # The salient pieces are as follows
    # The file "Preferences" (no extension) stores user settings, including the profile name
    # It is a vast JSON object; name is stored in the "name" key of the object under
    # the "profile" key. Thus, using the python dictionary model, it may be accessed like so:
    # preferences_json_object_name['profile']['name']
    # The file "History" (again, with no extension) is an SQLite3 database with the following structure
    # (I have only described the relevant portions here; the overall structure is considerably more intricate)
    #
    # The clusters_and_visits table is structured as follows:
    # 0: cluster_id, an INTEGER (this refers, presumably, to an item in the "clusters" table)
    # 1: visit_id, an INTEGER (this likewise presumably refers to an item in the "visits" table)
    # 2: score, a NUMERIC (something, perhaps, to do with Vivaldi's history UI?)
    # 3: engagement_score, a NUMERIC (will precipitate the rise of Skynet)
    # 4: url_for_deduping, a LONGVARCHAR - this is the domain, with protocol, for a particular history entry
    # 5: url_for_display, a LONGVARCHAR - this contains the website's full URL.
    # The visits table is structured as follows:
    # 0: id, an INTEGER (I suspect that it is relatively self-explanatory)
    # 1: url, an INTEGER (purpose TBD)
    # 2: visit_time, an INTEGER (the time of the visit, expressed using the Chrome timestamp format)
    # 3: from_visit, an INTEGER (presumably some form of cross-referencing)
    # 4: visit_duration, an INTEGER
    # 5: none of the other fields seem to be of relevance
    # The urls table is structured as follows:
    # 0: id, an INTEGER, a unique intra-table ID.
    # 1: url, a LONGVARCHAR, the URL in question
    # 2: title, a LONGVARCHAR, the page's title
    # 3: visit_count, an INTEGER, the number of visits to the URL in question
    # 4: typed_count, an INTEGER, the number of times the user has typed this URL into the address bar.
    # 5: last_visit_time, an INTEGER, the time of the most recent visit (expressed in Chrome/WebKit time)
    # 6: hidden, an INTEGER, which serves a purpose that I have not discerned.

    @staticmethod
    def get_all_profile_names():
        """Return the display name of every 'Profile*' directory, sorted by directory path.

        Raises OSError if a Preferences file cannot be read, and KeyError if
        it has no ['profile']['name'] entry.
        """
        profile_names = []
        # glob order is unspecified; sort so the result is stable between runs.
        for profile_dir in sorted(glob.glob(VIVALDI_DIR + '/Profile*')):
            with open(profile_dir + '/Preferences', encoding='utf-8') as f:
                profile_names.append(json.load(f)['profile']['name'])
        return profile_names

    @staticmethod
    def __chrome_to_safari_time(t):
        """Convert Chrome/WebKit microseconds since 1601 to Safari seconds since 2001."""
        seconds = t // 1_000_000  # microseconds to seconds
        return seconds - SAFARI_EPOCH_WEBKIT  # Webkit to Safari/Unix

    def __init__(self, path):
        """Open the History database in the profile directory `path` and load its visits."""
        self.maximum_counter = -1
        self.maximum_duration = -1
        self.cursor = sqlite3.connect(path + '/History').cursor()
        self.entries = None
        self.get_visits()  # 3/13/24: this is an expensive operation, but necessary to do anything with this object. Include it here.

    def get_visits(self):
        """Load every clustered visit, store the result in self.entries and return it.

        Returns a dict mapping url_for_deduping to a list of VisitContainer
        objects. Rows whose visit or URL record cannot be found are skipped,
        although their key is still created. maximum_counter and
        maximum_duration are updated along the way.
        """
        clustered_visits = self.cursor.execute('select * from clusters_and_visits').fetchall()
        results = {}
        for row in clustered_visits:
            # Make sure the domain has a list before anything is appended to it.
            domain_visits = results.setdefault(row[4], [])
            # Because the relevant data are stored across three different tables, cross-referencing is necessary
            # This must, unfortunately, be done for each and every object - a real nuisance for performance.
            visit = self.cursor.execute('select * from visits where id = ?', [row[1]]).fetchone()
            # NOTE(review): the lookup is by url_for_display; visits.url (index 1) may be
            # the urls.id foreign key and a more reliable join - confirm against the schema.
            url_object = self.cursor.execute('select * from urls where url = ?', [row[5]]).fetchone()
            if visit is None or url_object is None:
                # A required record is missing from the database, so the
                # visit may as well not exist. Pass it by accordingly.
                continue
            visit_time = VivaldiProfile.__chrome_to_safari_time(visit[2])
            data = VisitContainer(
                row[5],
                url_object[2],
                visit_time,
                url_object[3],
                visit[4]  # Duration, because I have that here firsthand.
            )
            if data.counter > self.maximum_counter:
                self.maximum_counter = data.counter
            if data.duration > self.maximum_duration:
                self.maximum_duration = data.duration
            domain_visits.append(data)
        self.entries = results
        return results
def get_all_vivaldi_data():
    """Load every Vivaldi profile found on disk.

    Returns a dict mapping each profile's display name (read from its
    Preferences file) to a VivaldiProfile built from the same directory.
    If two profiles share a display name, the later directory wins.

    Timestamps, as mentioned, use the Chrome/Webkit format. This means that
    they represent microseconds elapsed since midnight UTC on January 1, 1601.
    """
    profiles = {}
    # Walk the real directories instead of assuming that the n-th name belongs
    # to 'Profile n': glob order is unspecified and numbering can have gaps.
    # NOTE(review): only 'Profile*' directories are considered; confirm whether
    # a first profile stored elsewhere (e.g. 'Default') should be included.
    for profile_dir in sorted(glob.glob(VIVALDI_DIR + '/Profile*')):
        with open(profile_dir + '/Preferences', encoding='utf-8') as f:
            name = json.load(f)['profile']['name']
        profiles[name] = VivaldiProfile(profile_dir)
    return profiles
class FirefoxProfile:
    """History of a single Firefox profile, loaded from its places.sqlite database.

    Attributes:
        maximum_counter: largest visit count among loaded visits, or -1 if none.
        cursor: sqlite3 cursor on the profile's places.sqlite database.
        entries: dict mapping a host (rev_host reversed) to a list of VisitContainer objects.
    """
    # Partial documentation on the Firefox history database format:
    # Times (called "dates") are expressed in microseconds since midnight UTC on 1 January 1970
    # The moz_historyvisits table is structured as follows:
    # index 0: id, an INTEGER - a unique intra-table ID
    # index 1: from_visit, an INTEGER - purpose obscure
    # index 2: place_id, an INTEGER - this references the relevant row in the moz_places table, described below
    # index 3: visit_date, an INTEGER - presumably, the date
    # index 4: visit_type, an INTEGER - purpose obscure
    # index 5: session, an INTEGER - purpose obscure
    # index 6: source, an INTEGER - purpose obscure
    # index 7: triggeringPlaceId, an INTEGER - purpose obscure
    # The moz_places table describes each website
    # index 0: id, an INTEGER - an intra-table id.
    # index 1: url, a LONGVARCHAR - contains the page's URL
    # index 2: title, a LONGVARCHAR - contains the page's title
    # index 3: rev_host, a LONGVARCHAR - contains a reversed version of the website's host.
    # index 4: visit_count, an INTEGER - the number of visits to a particular site
    # index 5: hidden, an INTEGER - purpose obscure
    # index 6: typed, an INTEGER - indicates whether the user directly typed the URL.
    # index 7: frecency, an INTEGER - purpose obscure.
    # index 8: last_visit_date, an INTEGER - time of the most recent visit
    # index 9: guid, a TEXT - purpose obscure
    # index 10: foreign_count, an INTEGER - purpose obscure
    # index 11: url_hash - a hash, using an unknown algorithm, of the URL.
    # index 12: description, a TEXT - purpose obscure
    # index 13: preview_image_url, a TEXT - purpose obscure
    # index 14: site_name, a TEXT - seemingly a repeat of the URL
    # index 15: origin_id, an INTEGER - presumably, this is an ID in some other table
    # index 16: recalc_frecency - the "frecency" of something or other.
    # index 17: alt_frecency - purpose obscure
    # index 18: recalc_alt_frecency - purpose obscure.

    @staticmethod
    def __mozilla_to_safari_time(t):
        """Convert Mozilla microseconds since 1970 to Safari seconds since 2001."""
        return (t / 1_000_000) - SAFARI_EPOCH

    def __init__(self, path):
        """Open places.sqlite in the profile directory `path` and load its visits."""
        # Must exist before get_visits() runs, since it compares against it.
        self.maximum_counter = -1
        self.cursor = sqlite3.connect(path + '/places.sqlite').cursor()
        self.entries = {}
        self.get_visits()  # 3/13/24: you'll have to do this anyway, so do it here.

    def get_visits(self):
        """Load every recorded visit, store the result in self.entries and return it.

        Returns a dict mapping a host to a list of VisitContainer objects, or
        an empty dict when the history table cannot be queried. The mapping is
        rebuilt from scratch on every call, so calling this repeatedly does
        not duplicate visits. Visits whose moz_places row is missing are skipped.
        """
        try:
            visits = self.cursor.execute('select * from moz_historyvisits').fetchall()
        except sqlite3.OperationalError:
            # no such table, which indicates that no history exists
            return {}
        entries = {}
        for visit in visits:
            metadata = self.cursor.execute('select * from moz_places where id = ?', [visit[2]]).fetchone()
            if metadata is None:
                # No place row for this visit; skip it rather than abandon the rest.
                continue
            # rev_host holds the host reversed; flip it back. Guard against NULL.
            # NOTE(review): the reversed value may carry a leading '.'; confirm
            # whether the keys should be stripped.
            domain = (metadata[3] or '')[::-1]
            visit_time = self.__mozilla_to_safari_time(visit[3])
            data = VisitContainer(
                metadata[1],  # the URL, as the other browsers provide it
                metadata[2],
                visit_time,
                metadata[4]  # visit_count from moz_places, not visit_type from the visit row
            )
            if data.counter > self.maximum_counter:
                self.maximum_counter = data.counter
            entries.setdefault(domain, []).append(data)
        self.entries = entries
        return self.entries
def get_all_firefox_data():
    """Load every default Firefox profile found on disk.

    Returns a dict mapping each profile directory's name to a FirefoxProfile.
    Directories that contain no places.sqlite are left out.
    """
    profiles = {}
    # '*default*' also matches the '<salt>.default-release' directories that
    # a pattern ending in 'default' would miss.
    for profile_dir in glob.glob(FIREFOX_DIR + '/Profiles/*default*'):
        # sqlite3.connect() creates the file when it is absent; never litter a
        # profile directory with an empty database.
        if not os.path.isfile(os.path.join(profile_dir, 'places.sqlite')):
            continue
        # The constructor already loads the visits; loading them a second
        # time here is unnecessary. basename() copes with either separator.
        profiles[os.path.basename(profile_dir)] = FirefoxProfile(profile_dir)
    return profiles