diff --git a/expand_urls.py b/expand_urls.py new file mode 100644 index 0000000..c3ae105 --- /dev/null +++ b/expand_urls.py @@ -0,0 +1,105 @@ +import configparser +import glob +import os +import re +import requests +import time +from urllib.parse import urlparse +from parser import read_json_from_js_file + +class URLExpander: + def __init__(self): + self.config = configparser.ConfigParser(allow_no_value=True, interpolation=None, strict=False) + self.config.optionxform = str + self.config.read('expand_urls.ini') + try: + self.shorteners = self.config.options('shorteners') + except configparser.Error: + print('No configuration found, using default configuration') + self.config['shorteners'] = {} + self.config['mappings'] = {} + self.shorteners = ['t.co', '7ax.de', 'bit.ly', 'buff.ly', 'cnn.it', 'ct.de', 'flic.kr', 'go.shr.lc', 'ift.tt', 'instagr.am', 'is.gd', 'j.mp', 'ku-rz.de', 'p.dw.com', 'pl0p.de', 'spon.de', 'sz.de', 'tiny.cc', 'tinyurl.com', 'trib.al', 'wp.me', 'www.sz.de', 'yfrog.com'] + [self.config.set('shorteners', x) for x in self.shorteners] + with open('expand_urls.ini', 'w') as inifile: + self.config.write(inifile) + + def get_input_filenames(self): + input_folder = '.' + + # Identify the file and folder names - they change slightly depending on the archive size it seems + data_folder = os.path.join(input_folder, 'data') + tweet_js_filename_templates = ['tweet.js', 'tweets.js', 'tweets-part*.js'] + input_filenames = [] + for tweet_js_filename_template in tweet_js_filename_templates: + input_filenames += glob.glob(os.path.join(data_folder, tweet_js_filename_template)) + if len(input_filenames)==0: + print(f'Error: no files matching {tweet_js_filename_templates} in {data_folder}') + exit() + return input_filenames + + def process_tweets(self): + for tweets_js_filename in self.get_input_filenames(): + print(f'Parsing {tweets_js_filename}...') + json = read_json_from_js_file(tweets_js_filename) + [self.parse_tweet(tweet) for tweet in json] + + def save_mapping(self, original_url, expanded_url): + self.config['mappings'][original_url] = expanded_url + with open('expand_urls.ini', 'w') as inifile: + self.config.write(inifile) + + def mapping_exists(self, original_url): + try: + tmp = self.config['mappings'][original_url] + except KeyError: # TODO: this fails always + return False + return True + + def parse_tweet(self, tweet): + tweet = tweet['tweet'] + if 'entities' in tweet and 'urls' in tweet['entities'] and len(tweet['entities']['urls']) > 0: + for url in tweet['entities']['urls']: + if 'url' in url and 'expanded_url' in url: + original_url = url['expanded_url'] + if not self.mapping_exists(original_url): + expanded_url = self.expand_short_url(original_url) + if expanded_url != original_url: + self.save_mapping(original_url, expanded_url) + else: + # really old tweets may contain URLs as plain text in the body + possible_urls = re.finditer(r"https?://[a-z0-9\.]+/[a-z0-9?]{10}", tweet['full_text'], re.MULTILINE | re.IGNORECASE) + for (_, match) in enumerate(possible_urls): + matched_url = match.group(0) + if not self.mapping_exists(matched_url): + expanded_url = self.expand_short_url(matched_url) + if (expanded_url != matched_url): + self.save_mapping(matched_url, expanded_url) + + def is_short_url(self, url): + hostname = urlparse(url).hostname + if any(shortener == hostname for shortener in self.shorteners): + return True + return False + + def expand_short_url(self, url): + if self.is_short_url(url): + try: + request = requests.head(url) + time.sleep(0.75) + except: + pass + if request.ok == False: + return url + try: + url_from_location_header = request.headers['location'] + except KeyError: + return url + if not url_from_location_header.startswith('http'): + return url + elif ':443' in url_from_location_header or self.is_short_url(url_from_location_header): + url_from_location_header = self.expand_short_url(url_from_location_header.replace('http:', 'https:')) + url = url_from_location_header + return url + +if __name__ == '__main__': + URLExpander().process_tweets()