From db775840f0e87494ad65e66dc5c1efdba37c5201 Mon Sep 17 00:00:00 2001 From: Karl Beecken Date: Tue, 27 Sep 2022 16:28:19 +0200 Subject: [PATCH 1/4] switch to maxmind geoip2 --- parse3.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/parse3.py b/parse3.py index f34062a..3d41808 100644 --- a/parse3.py +++ b/parse3.py @@ -2,20 +2,20 @@ import stem import urllib.request import time -import pygeoip +import geoip2.database import tarfile import shutil from stem.descriptor import DocumentHandler, parse_file -GEOIP_FILENAME = "GeoLiteCity.dat" -geoip_db = None +GEOIP_FILENAME = "./GeoLite2-City.mmdb" def geo_ip_lookup(ip_address): - record = geoip_db.record_by_addr(ip_address) - if record is None: - return (False, False) - return (record['longitude'], record['latitude']) + with geoip2.database.Reader(GEOIP_FILENAME) as reader: + response = reader.city(ip_address) + if response is None: + return (False, False) + return (response.location.longitude, response.location.latitude) def dl_server_descriptors(year, month): """ Download server descriptors from CollecTor. """ @@ -320,11 +320,9 @@ def usage(): if not os.path.isfile(GEOIP_FILENAME): print("%s not found. It must be in the same directory as this script." % \ GEOIP_FILENAME) - print("Get the Maxmind city database here:") - print("-> https://dev.maxmind.com/geoip/legacy/geolite") + print("Get the Maxmind GeoIP2 city database here:") + print("-> https://dev.maxmind.com/geoip/geolite2-free-geolocation-data") sys.exit(1) - # Open GeoIP database. - geoip_db = pygeoip.GeoIP(GEOIP_FILENAME) month = day = 0 try: From 9ae950b061a70fc68b5660cf2efd8020f7fbfd7c Mon Sep 17 00:00:00 2001 From: Karl Beecken Date: Tue, 27 Sep 2022 16:31:07 +0200 Subject: [PATCH 2/4] update README --- README.md | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 9c2678a..3445dbf 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ -Tor Consensus and Server Descriptor Parser ------------------------------------------- +## Tor Consensus and Server Descriptor Parser This is the version of the script that formats the data in a way that it is compatible with [Torflow](https://github.com/unchartedsoftware/torflow) @@ -15,32 +14,30 @@ existent). Example: - $ python3 parse3.py 2010 07 09 - > Only July 9th, 2010 will be processed. + $ python3 parse3.py 2010 07 09 + > Only July 9th, 2010 will be processed. - $ python3 parse3.py 2010 08 - > August of 2010 will be processed. + $ python3 parse3.py 2010 08 + > August of 2010 will be processed. - $ python3 parse3.py 2010 - > All 2010 will be processed. + $ python3 parse3.py 2010 + > All 2010 will be processed. -Note ----- +## Note Decompression of lzma file (.xz) is not yet supported for Python 2. You'll have to uncompress them yourself for now. -Requirements ------------- +## Requirements - - Maxmind Geo IP city database in binary format (GeoLiteCity.dat). - https://dev.maxmind.com/geoip/legacy/geolite + - Maxmind GeoIP2 city database in binary format (GeoLite2-City.mmdb). + https://dev.maxmind.com/geoip/geolite2-free-geolocation-data - - pygeoip - $ pip install pygeoip + - geoip2 + $ pip install geoip2 - - tarfile (Only for parse3.py) - $ pip install tarfile + - tarfile (Only for parse3.py) + $ pip install tarfile - - Stem library - https://stem.torproject.org/ - $ pip install stem + - Stem library - https://stem.torproject.org/ + $ pip install stem From c1d2226b9b5a51e37ea5fc55cc14ad95d6cfdc34 Mon Sep 17 00:00:00 2001 From: Karl Beecken Date: Tue, 27 Sep 2022 20:34:08 +0200 Subject: [PATCH 3/4] fix exception handling --- parse3.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/parse3.py b/parse3.py index 3d41808..476b5ba 100644 --- a/parse3.py +++ b/parse3.py @@ -12,10 +12,15 @@ def geo_ip_lookup(ip_address): with geoip2.database.Reader(GEOIP_FILENAME) as reader: - response = reader.city(ip_address) - if response is None: + try: + response = reader.city(ip_address) + except geoip2.errors.AddressNotFoundError: + return (False, False) + except Exception as e: + print(" [-] Error while looking up %s" % (ip_address)) return (False, False) - return (response.location.longitude, response.location.latitude) + else: + return (response.location.longitude, response.location.latitude) def dl_server_descriptors(year, month): """ Download server descriptors from CollecTor. """ From 4cbe3c24a1c7f2bbfc647a80cba17fae208137fb Mon Sep 17 00:00:00 2001 From: Karl Beecken Date: Tue, 27 Sep 2022 20:34:24 +0200 Subject: [PATCH 4/4] format --- parse3.py | 68 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/parse3.py b/parse3.py index 476b5ba..0db2a9e 100644 --- a/parse3.py +++ b/parse3.py @@ -1,4 +1,5 @@ -import os, sys +import os +import sys import stem import urllib.request import time @@ -10,6 +11,7 @@ GEOIP_FILENAME = "./GeoLite2-City.mmdb" + def geo_ip_lookup(ip_address): with geoip2.database.Reader(GEOIP_FILENAME) as reader: try: @@ -22,6 +24,7 @@ def geo_ip_lookup(ip_address): else: return (response.location.longitude, response.location.latitude) + def dl_server_descriptors(year, month): """ Download server descriptors from CollecTor. """ url = "https://collector.torproject.org/archive/relay-descriptors/server-descriptors" @@ -43,7 +46,7 @@ def dl_server_descriptors(year, month): try: request = urllib.request.urlopen("%s/%s" % (url, filename)) if request.code != 200: - print(" [-] Unable to fetch server descriptors %s at %s" % \ + print(" [-] Unable to fetch server descriptors %s at %s" % (filename, url)) return None except Exception as e: @@ -54,6 +57,7 @@ def dl_server_descriptors(year, month): fp.close() return save_path + def dl_consensus(year, month): """ Download consensus from CollecTor. """ url = "https://collector.torproject.org/archive/relay-descriptors/consensuses" @@ -86,6 +90,7 @@ def dl_consensus(year, month): fp.close() return save_path + def dl_extra_infos(year, month): """ Download extra infos from CollecTor. """ url = "https://collector.torproject.org/archive/relay-descriptors/extra-infos" @@ -107,7 +112,8 @@ def dl_extra_infos(year, month): try: request = urllib.request.urlopen("%s/%s" % (url, filename)) if request.code != 200: - print(" [-] Unable to fetch extra infos %s at %s" % (filename, url)) + print(" [-] Unable to fetch extra infos %s at %s" % + (filename, url)) return None except Exception as e: print(" [-] Unable to fetch %s/%s" % (url, filename)) @@ -117,6 +123,7 @@ def dl_extra_infos(year, month): fp.close() return save_path + def uncompress(path, dst): # Remove .tar.xz dirname = path[:-7] @@ -126,6 +133,7 @@ def uncompress(path, dst): with tarfile.open(path) as f: f.extractall(dst) + def get_previous_data(year, month, day): # If day is undefined or if day is 1, we have to get the previous month # server descriptors data to get the descriptors. @@ -145,24 +153,28 @@ def get_previous_data(year, month, day): prev_ei_path = dl_extra_infos(prev_year, str_month) return prev_sd_path, prev_ei_path + def create_csv_file(year, month, day): # Process the consensuses that we are interested in. csv_filename = 'data/relays-%s-%s-%s-00-00-00.csv' % \ - (year, month, day) + (year, month, day) if os.path.exists(csv_filename): print(" [+] CSV %s exists, skipping!" % (csv_filename)) return None csv = open(csv_filename, 'w+') print(" [+] Creating CSV file %s" % (csv_filename)) - csv.write('Name,Fingerprint,Flags,IP,OrPort,ObservedBW,GuardClients,DirClients,Uptime,Longitude,Latitude)\n') + csv.write( + 'Name,Fingerprint,Flags,IP,OrPort,ObservedBW,GuardClients,DirClients,Uptime,Longitude,Latitude)\n') return csv + def client_ips_to_string(ei_dict, sep): l = [] for key, value in ei_dict.items(): l.append('%s:%s' % (key, value)) return sep.join(l) + def write_csv_data(consensus, sd_path, prev_sd_path, ei_path, prev_ei_path, year, month, day): """ Write data from consensus to CSV file """ csv_fp = create_csv_file(year, month, day) @@ -178,13 +190,15 @@ def write_csv_data(consensus, sd_path, prev_sd_path, ei_path, prev_ei_path, year fp = desc.fingerprint digest = desc.digest.lower() - sd_filename = "%s/%s/%s/%s" % (sd_path[:-7], digest[0], digest[1], digest) + sd_filename = "%s/%s/%s/%s" % (sd_path[:-7], + digest[0], digest[1], digest) try: sd = next(parse_file(sd_filename)) except Exception as e: if prev_sd_path is None: continue - sd_filename = "%s/%s/%s/%s" % (prev_sd_path[:-7], digest[0], digest[1], digest) + sd_filename = "%s/%s/%s/%s" % ( + prev_sd_path[:-7], digest[0], digest[1], digest) try: sd = next(parse_file(sd_filename)) except Exception as e: @@ -196,13 +210,15 @@ def write_csv_data(consensus, sd_path, prev_sd_path, ei_path, prev_ei_path, year dir_ips = "" if sd.extra_info_digest is not None: digest = sd.extra_info_digest.lower() - ei_filename = "%s/%s/%s/%s" % (ei_path[:-7], digest[0], digest[1], digest) + ei_filename = "%s/%s/%s/%s" % (ei_path[:-7], + digest[0], digest[1], digest) try: ei = next(parse_file(ei_filename)) except Exception as e: if prev_ei_path is None: continue - ei_filename = "%s/%s/%s/%s" % (prev_ei_path[:-7], digest[0], digest[1], digest) + ei_filename = "%s/%s/%s/%s" % ( + prev_ei_path[:-7], digest[0], digest[1], digest) try: ei = next(parse_file(ei_filename)) except Exception as e: @@ -231,11 +247,13 @@ def write_csv_data(consensus, sd_path, prev_sd_path, ei_path, prev_ei_path, year flag += "H" csv_fp.write("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" % (desc.nickname, - desc.fingerprint, flag, desc.address, desc.or_port, - float(sd.observed_bandwidth/1000.0/1000.0), entry_ips, - dir_ips, sd.uptime, lon, lat)) + desc.fingerprint, flag, desc.address, desc.or_port, + float( + sd.observed_bandwidth/1000.0/1000.0), entry_ips, + dir_ips, sd.uptime, lon, lat)) csv_fp.close() + def make_monthly_csv(year, month, day): """ Create the CSV files for the given year/month. If day is defined, only @@ -270,12 +288,14 @@ def make_monthly_csv(year, month, day): match_found = True consensus_pathname = \ "./consensuses/consensuses-%s-%s/%s/%s-%s-%s-00-00-00-consensus" % \ - (year, str_month, dir_day, year, str_month, dir_day) + (year, str_month, dir_day, year, str_month, dir_day) print(" [+] Reading consensus %s" % (consensus_pathname)) try: - consensus = next(parse_file(consensus_pathname, document_handler = DocumentHandler.DOCUMENT)) + consensus = next(parse_file(consensus_pathname, + document_handler=DocumentHandler.DOCUMENT)) except Exception as e: - print(" [-] Consensus %s not found. Skipping!" % (consensus_pathname)) + print(" [-] Consensus %s not found. Skipping!" % + (consensus_pathname)) continue # Nullify the previous path if we aren't the first of the month. @@ -283,21 +303,23 @@ def make_monthly_csv(year, month, day): prev_ei_path = None prev_sd_path = None write_csv_data(consensus, sd_path, prev_sd_path, ei_path, prev_ei_path, - str(year), str_month, dir_day) + str(year), str_month, dir_day) if match_found is False: print(" [-] Date not found in consensus") # Cleanup consensus and server descriptors for this month. - #shutil.rmtree(consensus_path) - #shutil.rmtree(sd_path) - #if prev_sd_path is not None: + # shutil.rmtree(consensus_path) + # shutil.rmtree(sd_path) + # if prev_sd_path is not None: # shutil.rmtree(prev_sd_path) + def make_yearly_csv(year): """ DOC DOC """ for month in range(1, 12): make_monthly_csv(year, month, 0) + def run(year, month, day): """ Using the given date, download the needed files and create the csv file(s). @@ -308,22 +330,24 @@ def run(year, month, day): make_yearly_csv(year) # Cleanup what's left if any. - #for dirname in os.listdir('./consensuses'): + # for dirname in os.listdir('./consensuses'): # shutil.rmtree(dirname) - #for dirname in os.listdir('./server-descriptors'): + # for dirname in os.listdir('./server-descriptors'): # shutil.rmtree(dirname) + def usage(): print("Usage: %s [ []]" % (sys.argv[0])) sys.exit(1) + if __name__ == '__main__': if len(sys.argv) == 1: usage() # Make sure we have a GeoIP database (maxmind) if not os.path.isfile(GEOIP_FILENAME): - print("%s not found. It must be in the same directory as this script." % \ + print("%s not found. It must be in the same directory as this script." % GEOIP_FILENAME) print("Get the Maxmind GeoIP2 city database here:") print("-> https://dev.maxmind.com/geoip/geolite2-free-geolocation-data")