From 23dba29e8791ae8822384ebfa57e30dddca18d79 Mon Sep 17 00:00:00 2001 From: Lena Schimmel Date: Tue, 22 Nov 2022 23:10:01 +0100 Subject: [PATCH 01/46] WIP: save known tweets to file and re-use them, to avoid re-downloads. See TODO comments in this commit. --- parser.py | 96 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 33 deletions(-) diff --git a/parser.py b/parser.py index c935512..d9b8e0d 100755 --- a/parser.py +++ b/parser.py @@ -171,8 +171,23 @@ def collect_tweet_id(tweet): tweet = tweet['tweet'] return tweet['id_str'] - -def collect_tweet_references(tweet, known_tweet_ids, counts): +def add_known_tweet(known_tweets, new_tweet): + if 'tweet' in new_tweet.keys(): + new_tweet = new_tweet['tweet'] + tweet_id = new_tweet['id_str'] + if tweet_id in known_tweets: + if known_tweets[tweet_id] == new_tweet: + pass + #print(f"Tweet {tweet_id} was already known with identical contents") + else: + print(f"Tweet {tweet_id} redefined with new contents, NEEDS MERGE") + # TODO add recursive dict merging. Try this one: https://stackoverflow.com/a/7205107/39946 + known_tweets[tweet_id] = new_tweet + else: + #print(f"Tweet {tweet_id} is new") + known_tweets[tweet_id] = new_tweet + +def collect_tweet_references(tweet, known_tweets, counts): if 'tweet' in tweet.keys(): tweet = tweet['tweet'] tweet_ids = set() @@ -188,8 +203,8 @@ def collect_tweet_references(tweet, known_tweet_ids, counts): counts['quote'] += 1 # Collect previous tweets in conversation - if 'in_reply_to_status_id_str' in tweet: - if (tweet['in_reply_to_status_id_str'] in known_tweet_ids): + if 'in_reply_to_status_id_str' in tweet and tweet['in_reply_to_status_id_str'] is not None: + if (tweet['in_reply_to_status_id_str'] in known_tweets): counts['known_reply'] += 1 else: tweet_ids.add(tweet['in_reply_to_status_id_str']) @@ -206,6 +221,9 @@ def collect_tweet_references(tweet, known_tweet_ids, counts): tweet_ids.add(tweet['id_str']) counts['media'] += 1 + if None in tweet_ids: + raise Exception(f"Tweet has id None: {tweet}") + return tweet_ids def convert_tweet(tweet, username, archive_media_folder, output_media_folder_name, @@ -231,7 +249,7 @@ def convert_tweet(tweet, username, archive_media_folder, output_media_folder_nam header_markdown = '' header_html = '' if 'in_reply_to_status_id' in tweet: - # match and remove all occurences of '@username ' at the start of the body + # match and remove all occurrences of '@username ' at the start of the body replying_to = re.match(r'^(@[0-9A-Za-z_]* )*', body_markdown)[0] if replying_to: body_markdown = body_markdown[len(replying_to):] @@ -313,7 +331,7 @@ def convert_tweet(tweet, username, archive_media_folder, output_media_folder_nam # extract user_id:handle connections if 'in_reply_to_user_id' in tweet and 'in_reply_to_screen_name' in tweet: id = tweet['in_reply_to_user_id'] - if int(id) >= 0: # some ids are -1, not sure why + if id is not None and int(id) >= 0: # some ids are -1, not sure why handle = tweet['in_reply_to_screen_name'] users[id] = UserData(id=id, handle=handle) if 'entities' in tweet and 'user_mentions' in tweet['entities']: @@ -463,27 +481,36 @@ def parse_tweets(input_filenames, username, users, html_template, archive_media_ Copy the media used to output_media_folder_name. Collect user_id:user_handle mappings for later use, in 'users'. Returns the mapping from media filename to best-quality URL. 
- """ - tweets = [] + """ + converted_tweets = [] media_sources = [] counts = defaultdict(int) - known_tweet_ids = set() - - # TODO Load tweets that we saved in an earlier run between pass 2 and 3 + known_tweets = {} + + # TODO If we run this tool mutliple times, in `known_tweets` we will have our own tweets as + # well as related tweets by others. With each run, the tweet graph is expanded. We probably do + # not want this. To stop it, implement one of these: + # 1. keep own tweets and other tweets in different dicts + # 2. put them all in one dict, but mark the tweets by others, so that certain steps will ignore them + # 3. use the data that is already present in a tweet to distinguish own tweets from others + + # Load tweets that we saved in an earlier run between pass 2 and 3 + tweet_dict_filename = 'known_tweets.json' + if os.path.exists(tweet_dict_filename): + with open(tweet_dict_filename, 'r', encoding='utf8') as f: + known_tweets = json.load(f) - # First pass: collect IDs of known tweets + # Fist pass: Load tweets from all archive files and add them to known_tweets for tweets_js_filename in input_filenames: - json = read_json_from_js_file(tweets_js_filename) - print (f"Processing {len(json)} tweets in {tweets_js_filename}...") - for tweet in json: - known_tweet_ids.add(collect_tweet_id(tweet)) + json_result = read_json_from_js_file(tweets_js_filename) + for tweet in json_result: + add_known_tweet(known_tweets, tweet) - # Second pass: collect IDs of references tweets, excluding known tweets from pass 1 tweet_ids_to_download = set() - for tweets_js_filename in input_filenames: - json = read_json_from_js_file(tweets_js_filename) - for tweet in json: - tweet_ids_to_download.update(collect_tweet_references(tweet, known_tweet_ids, counts)) + + # Second pass: Iterate through all those tweets + for tweet in known_tweets.values(): + tweet_ids_to_download.update(collect_tweet_references(tweet, known_tweets, counts)) # Download referenced tweets referenced_tweets = [] @@ -500,24 +527,27 @@ def parse_tweets(input_filenames, username, users, html_template, archive_media_ with requests.Session() as session: bearer_token = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' guest_token = get_twitter_api_guest_token(session, bearer_token) - referenced_tweets = get_tweets(session, bearer_token, guest_token, list(tweet_ids_to_download), False) - # TODO Save tweets to a file, merging with contents of existing file if present # TODO We could download user data together with the tweets, because we will need it anyway. But we might download the data for each user multiple times then. 
+ downloaded_tweets = get_tweets(session, bearer_token, guest_token, list(tweet_ids_to_download), False) + for downloaded_tweet in downloaded_tweets.values(): + add_known_tweet(known_tweets, downloaded_tweet) + with open(tweet_dict_filename, "w") as outfile: + json.dump(known_tweets, outfile, indent=2) + print(f"Saved {len(known_tweets)} tweets to '{tweet_dict_filename}'.") + except Exception as err: print(f'Failed to download tweets: {err}') # Third pass: convert tweets, using the downloaded references from pass 2 - for tweets_js_filename in input_filenames: - json = read_json_from_js_file(tweets_js_filename) - for tweet in json: - tweets.append(convert_tweet(tweet, username, archive_media_folder, - output_media_folder_name, tweet_icon_path, - media_sources, users, referenced_tweets)) - tweets.sort(key=lambda tup: tup[0]) # oldest first + for tweet in known_tweets.values(): + converted_tweets.append(convert_tweet(tweet, username, archive_media_folder, + output_media_folder_name, tweet_icon_path, + media_sources, users, referenced_tweets)) + converted_tweets.sort(key=lambda tup: tup[0]) # oldest first # Group tweets by month (for markdown) grouped_tweets_markdown = defaultdict(list) - for timestamp, md, _ in tweets: + for timestamp, md, _ in converted_tweets: # Use a markdown filename that can be imported into Jekyll: YYYY-MM-DD-your-title-here.md dt = datetime.datetime.fromtimestamp(timestamp) markdown_filename = f'{dt.year}-{dt.month:02}-01-Tweet-Archive-{dt.year}-{dt.month:02}.md' # change to group by day or year or timestamp @@ -530,11 +560,11 @@ def parse_tweets(input_filenames, username, users, html_template, archive_media_ f.write(md_string) # Write into html file - all_html_string = '
<hr>\n'.join(html for _, _, html in tweets)
+    all_html_string = '<hr>
\n'.join(html for _, _, html in converted_tweets)
     with open(output_html_filename, 'w', encoding='utf-8') as f:
         f.write(html_template.format(all_html_string))

-    print(f'Wrote {len(tweets)} tweets to *.md and {output_html_filename}, with images and video embedded from {output_media_folder_name}')
+    print(f'Wrote {len(converted_tweets)} tweets to *.md and {output_html_filename}, with images and video embedded from {output_media_folder_name}')

     return media_sources

From d1fb57ce12be653d6ed2c1df396e12c91d3d65c6 Mon Sep 17 00:00:00 2001
From: Lena Schimmel
Date: Wed, 23 Nov 2022 19:00:06 +0100
Subject: [PATCH 02/46] Tweet merging and a lot of error handling. Reduce
 number of useless re-downloads, but they still happen.

---
 parser.py | 197 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 147 insertions(+), 50 deletions(-)

diff --git a/parser.py b/parser.py
index d9b8e0d..87707c9 100755
--- a/parser.py
+++ b/parser.py
@@ -29,6 +29,7 @@
 import subprocess
 import sys
 import time
+import traceback
 # hot-loaded if needed, see import_module():
 # imagesize
 # requests
@@ -70,6 +71,8 @@ def get_twitter_api_guest_token(session, bearer_token):
     return guest_token


+# TODO if downloading fails within the for loop, we should be able to return the already
+# fetched users, but also make it clear that it is incomplete. Maybe do it like in get_tweets.
 def get_twitter_users(session, bearer_token, guest_token, user_ids):
     """Asks Twitter for all metadata associated with user_ids."""
     users = {}
@@ -92,32 +95,44 @@ def get_twitter_users(session, bearer_token, guest_token, user_ids):

 def get_tweets(session, bearer_token, guest_token, tweet_ids, include_user=True, include_alt_text=True):
     """ Get the json metadata for multiple tweets.
-    If include_user is False, you will only get a numerical id for the user."""
+    If include_user is False, you will only get a numerical id for the user.
+    Returns `tweets, remaining_tweet_ids`. If all goes well, `tweets` will contain all
+    tweets, and `remaining_tweet_ids` is empty. If something goes wrong, downloading is stopped
+    and only the tweets we got until then are returned.
+    
TODO In some cases, up to 100 tweets may be both in `tweets` and `remaining_tweet_ids`."""
     tweets = {}
     remaining_tweet_ids = tweet_ids.copy()
-    while remaining_tweet_ids:
-        max_batch = 100
-        tweet_id_batch = remaining_tweet_ids[:max_batch]
-        tweet_id_list = ",".join(tweet_id_batch)
-        print(f"Download {len(tweet_id_batch)} tweets of {len(remaining_tweet_ids)} remaining...")
-        query_url = f"https://api.twitter.com/1.1/statuses/lookup.json?id={tweet_id_list}&tweet_mode=extended"
-        if not include_user:
-            query_url += "&trim_user=1"
-        if include_alt_text:
-            query_url += "&include_ext_alt_text=1"
-        response = session.get(query_url,
-                               headers={'authorization': f'Bearer {bearer_token}', 'x-guest-token': guest_token})
-        if response.status_code == 429:
-            # Rate limit exceeded - get a new token
-            guest_token = get_twitter_api_guest_token(session, bearer_token)
-            continue
-        if not response.status_code == 200:
-            raise Exception(f'Failed to get tweets: {response}')
-        response_json = json.loads(response.content)
-        for tweet in response_json:
-            tweets[tweet["id_str"]] = tweet
-        remaining_tweet_ids = remaining_tweet_ids[max_batch:]
-    return tweets
+    try:
+        while remaining_tweet_ids:
+            max_batch = 100
+            tweet_id_batch = remaining_tweet_ids[:max_batch]
+            tweet_id_list = ",".join(map(str, tweet_id_batch))
+            print(f"Download {len(tweet_id_batch)} tweets of {len(remaining_tweet_ids)} remaining...")
+            query_url = f"https://api.twitter.com/1.1/statuses/lookup.json?id={tweet_id_list}&tweet_mode=extended"
+            if not include_user:
+                query_url += "&trim_user=1"
+            if include_alt_text:
+                query_url += "&include_ext_alt_text=1"
+            response = session.get(query_url,
+                                   headers={'authorization': f'Bearer {bearer_token}', 'x-guest-token': guest_token}, timeout=5)
+            if response.status_code == 429:
+                # Rate limit exceeded - get a new token
+                guest_token = get_twitter_api_guest_token(session, bearer_token)
+                continue
+            if not response.status_code == 200:
+                raise Exception(f'Failed to get tweets: {response}')
+            response_json = json.loads(response.content)
+            for tweet in response_json:
+                if "id_str" in tweet:
+                    tweets[tweet["id_str"]] = tweet
+                else:
+                    print(f"Tweet could not be returned because it has no id: {tweet}")
+            remaining_tweet_ids = remaining_tweet_ids[max_batch:]
+    except Exception as err:
+        traceback.print_exc()
+        print(f"Exception during batch download of tweets: {err}")
+        print("Trying to work with the tweets we got so far.")
+    return tweets, remaining_tweet_ids

 def lookup_users(user_ids, users):
     """Fill the users dictionary with data from Twitter"""
@@ -171,8 +186,13 @@ def collect_tweet_id(tweet):
     tweet = tweet['tweet']
     return tweet['id_str']

+# returns an int if you give it either an int or a str that can be parsed as
+# an int. Otherwise, returns None.
+def parse_as_number(str_or_number):
+    if isinstance(str_or_number, str):
+        if str_or_number.isnumeric():
+            return int(str_or_number)
+        else:
+            return None
+    elif isinstance(str_or_number, int):
+        return str_or_number
+    else:
+        return None
+
+
+# Taken from https://stackoverflow.com/a/7205107/39946, then adapted to
+# some commonly observed twitter specifics.
+def merge(a, b, path=None):
+    "merges b into a"
+    if path is None: path = []
+    for key in b:
+        if key in a:
+            if isinstance(a[key], dict) and isinstance(b[key], dict):
+                merge(a[key], b[key], path + [str(key)])
+            elif a[key] == b[key]:
+                pass # same leaf value
+            elif key == 'retweet_count' or key == 'favorite_count':
+                a[key] = max(parse_as_number(a[key]), parse_as_number(b[key]))
+            elif key in ['possibly_sensitive']:
+                # ignore conflicts in unimportant fields that tend to differ
+                pass
+            elif parse_as_number(a[key]) == parse_as_number(b[key]):
+                # Twitter sometimes puts numbers into strings, so that the same number might be 3 or '3'
+                a[key] = parse_as_number(a[key])
+            elif a[key] is None and b[key] is not None:
+                # just as if `not key in a`
+                a[key] = b[key]
+            elif a[key] is not None and b[key] is None:
+                # Nothing to update
+                pass
+            else:
+                raise Exception(f"Conflict at {'.'.join(path + [str(key)])}, value '{a[key]}' vs. '{b[key]}'")
+        else:
+            a[key] = b[key]
+    return a
+
+def unwrap_tweet(tweet):
+    if 'tweet' in tweet.keys():
+        return tweet['tweet']
+    else:
+        return tweet
+
 def add_known_tweet(known_tweets, new_tweet):
-    if 'tweet' in new_tweet.keys():
-        new_tweet = new_tweet['tweet']
     tweet_id = new_tweet['id_str']
     if tweet_id in known_tweets:
         if known_tweets[tweet_id] == new_tweet:
             pass
             #print(f"Tweet {tweet_id} was already known with identical contents")
         else:
-            print(f"Tweet {tweet_id} redefined with new contents, NEEDS MERGE")
-            # TODO add recursive dict merging. Try this one: https://stackoverflow.com/a/7205107/39946
-            known_tweets[tweet_id] = new_tweet
+            try:
+                merge(known_tweets[tweet_id], new_tweet)
+            except Exception as err:
+                print(f"Tweet {tweet_id} could not be merged: {err}")
+
     else:
         #print(f"Tweet {tweet_id} is new")
         known_tweets[tweet_id] = new_tweet

 def collect_tweet_references(tweet, known_tweets, counts):
-    if 'tweet' in tweet.keys():
-        tweet = tweet['tweet']
+    tweet = unwrap_tweet(tweet)
     tweet_ids = set()
+
     # Collect quoted tweets
-    if 'entities' in tweet and 'urls' in tweet['entities']:
+    if has_path(tweet, ['entities', 'urls']):
         for url in tweet['entities']['urls']:
             if 'url' in url and 'expanded_url' in url:
                 expanded_url = url['expanded_url']
                 matches = re.match(r'^https://twitter.com/([0-9A-Za-z_]*)/status/(\d+)$', expanded_url)
                 if (matches):
                     #user_handle = matches[1]
-                    tweet_ids.add(matches[2])
-                    counts['quote'] += 1
+                    quoted_id = matches[2]
+                    if (quoted_id in known_tweets):
+                        counts['known_quote'] += 1
+                    else:
+                        tweet_ids.add(quoted_id)
+                        print(f"Need to download tweet {quoted_id} because it is quoted")
+                        counts['quote'] += 1

     # Collect previous tweets in conversation
-    if 'in_reply_to_status_id_str' in tweet and tweet['in_reply_to_status_id_str'] is not None:
-        if (tweet['in_reply_to_status_id_str'] in known_tweets):
+    # Only do this for tweets from our original archive
+    if 'from_archive' in tweet and has_path(tweet, ['in_reply_to_status_id_str']):
+        prev_tweet_id = parse_as_number(tweet['in_reply_to_status_id_str'])
+        if (prev_tweet_id in known_tweets):
             counts['known_reply'] += 1
         else:
-            tweet_ids.add(tweet['in_reply_to_status_id_str'])
+            tweet_ids.add(prev_tweet_id)
+            print(f"Need to download tweet {prev_tweet_id} because it is replied to")
             counts['reply'] += 1

     # Collect retweets
-    if 'full_text' in tweet and tweet['full_text'].startswith('RT @'):
+    # Don't do this if we already re-downloaded this tweet
+    if not 'from_api' in tweet and 'full_text' in tweet and tweet['full_text'].startswith('RT @'):
         tweet_ids.add(tweet['id_str'])
+        print(f"Need to 
download tweet {tweet['id_str']} because of retweet") counts['retweet'] += 1 # Collect tweets with media, which might lack alt text # TODO we might filter for media which has "type" : "photo" because there is no alt text for videos - if 'entities' in tweet and 'media' in tweet['entities']: + # Don't do this if we already re-downloaded this tweet with alt texts enabled + if not 'download_with_alt_text' in tweet and has_path(tweet, ['entities', 'media']): tweet_ids.add(tweet['id_str']) + print(f"Need to download tweet {tweet['id_str']} because of contained media") counts['media'] += 1 if None in tweet_ids: @@ -226,19 +304,28 @@ def collect_tweet_references(tweet, known_tweets, counts): return tweet_ids +# Walks a path through nested dicts or lists, and returns True if all the keys are present, and all of the values are not None +def has_path(dict, index_path): + for index in index_path: + if not index in dict: + return False + dict = dict[index] + if dict is None: + return False + return True + def convert_tweet(tweet, username, archive_media_folder, output_media_folder_name, tweet_icon_path, media_sources, users, referenced_tweets): """Converts a JSON-format tweet. Returns tuple of timestamp, markdown and HTML.""" # TODO actually use `referenced_tweets` - if 'tweet' in tweet.keys(): - tweet = tweet['tweet'] + tweet = unwrap_tweet(tweet) timestamp_str = tweet['created_at'] timestamp = int(round(datetime.datetime.strptime(timestamp_str, '%a %b %d %X %z %Y').timestamp())) # Example: Tue Mar 19 14:05:17 +0000 2019 body_markdown = tweet['full_text'] body_html = tweet['full_text'] tweet_id_str = tweet['id_str'] # replace t.co URLs with their original versions - if 'entities' in tweet and 'urls' in tweet['entities']: + if has_path(tweet, ['entities', 'urls']): for url in tweet['entities']['urls']: if 'url' in url and 'expanded_url' in url: expanded_url = url['expanded_url'] @@ -248,7 +335,7 @@ def convert_tweet(tweet, username, archive_media_folder, output_media_folder_nam # if the tweet is a reply, construct a header that links the names of the accounts being replied to the tweet being replied to header_markdown = '' header_html = '' - if 'in_reply_to_status_id' in tweet: + if has_path(tweet, ['in_reply_to_status_id']): # match and remove all occurrences of '@username ' at the start of the body replying_to = re.match(r'^(@[0-9A-Za-z_]* )*', body_markdown)[0] if replying_to: @@ -267,7 +354,7 @@ def convert_tweet(tweet, username, archive_media_folder, output_media_folder_nam header_markdown += f'Replying to [{name_list}]({replying_to_url})\n\n' header_html += f'Replying to {name_list}
' # replace image URLs with image links to local files - if 'entities' in tweet and 'media' in tweet['entities'] and 'extended_entities' in tweet and 'media' in tweet['extended_entities']: + if has_path(tweet, ['entities', 'media', 0, 'url']) and has_path(tweet, ['extended_entities', 'media']): original_url = tweet['entities']['media'][0]['url'] markdown = '' html = '' @@ -334,7 +421,7 @@ def convert_tweet(tweet, username, archive_media_folder, output_media_folder_nam if id is not None and int(id) >= 0: # some ids are -1, not sure why handle = tweet['in_reply_to_screen_name'] users[id] = UserData(id=id, handle=handle) - if 'entities' in tweet and 'user_mentions' in tweet['entities']: + if 'entities' in tweet and 'user_mentions' in tweet['entities'] and tweet['entities']['user_mentions'] is not None: for mention in tweet['entities']['user_mentions']: id = mention['id'] if int(id) >= 0: # some ids are -1, not sure why @@ -487,7 +574,7 @@ def parse_tweets(input_filenames, username, users, html_template, archive_media_ counts = defaultdict(int) known_tweets = {} - # TODO If we run this tool mutliple times, in `known_tweets` we will have our own tweets as + # TODO If we run this tool multiple times, in `known_tweets` we will have our own tweets as # well as related tweets by others. With each run, the tweet graph is expanded. We probably do # not want this. To stop it, implement one of these: # 1. keep own tweets and other tweets in different dicts @@ -504,6 +591,8 @@ def parse_tweets(input_filenames, username, users, html_template, archive_media_ for tweets_js_filename in input_filenames: json_result = read_json_from_js_file(tweets_js_filename) for tweet in json_result: + tweet = unwrap_tweet(tweet) + tweet['from_archive'] = True add_known_tweet(known_tweets, tweet) tweet_ids_to_download = set() @@ -528,8 +617,13 @@ def parse_tweets(input_filenames, username, users, html_template, archive_media_ bearer_token = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' guest_token = get_twitter_api_guest_token(session, bearer_token) # TODO We could download user data together with the tweets, because we will need it anyway. But we might download the data for each user multiple times then. 
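To make the conflict rules of the merge() introduced above concrete, a small worked example, using only the merge() and parse_as_number() defined in this patch (the two tweet dicts are toy data):

    # An archive copy and an API copy of the same tweet:
    archive_copy = {'id_str': '123', 'retweet_count': '4', 'possibly_sensitive': False}
    api_copy = {'id_str': '123', 'retweet_count': 7, 'possibly_sensitive': True, 'full_text': 'example text'}

    merged = merge(archive_copy, api_copy)
    assert merged['retweet_count'] == 7           # conflicting counts resolve to max(), '4' parsed as 4
    assert merged['possibly_sensitive'] is False  # conflict in an 'unimportant' field: existing value wins
    assert merged['full_text'] == 'example text'  # keys missing on one side are simply copied over

Any conflict that none of the rules cover raises, and add_known_tweet() catches that and keeps the previously known copy.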
- downloaded_tweets = get_tweets(session, bearer_token, guest_token, list(tweet_ids_to_download), False) + downloaded_tweets, remaining_tweet_ids = get_tweets(session, bearer_token, guest_token, list(tweet_ids_to_download), False) + # TODO maybe react if remaining_tweet_ids contains tweets for downloaded_tweet in downloaded_tweets.values(): + downloaded_tweet = unwrap_tweet(downloaded_tweet) + downloaded_tweet['from_api'] = True + downloaded_tweet['download_with_user'] = False + downloaded_tweet['download_with_alt_text'] = True add_known_tweet(known_tweets, downloaded_tweet) with open(tweet_dict_filename, "w") as outfile: json.dump(known_tweets, outfile, indent=2) @@ -540,9 +634,12 @@ def parse_tweets(input_filenames, username, users, html_template, archive_media_ # Third pass: convert tweets, using the downloaded references from pass 2 for tweet in known_tweets.values(): - converted_tweets.append(convert_tweet(tweet, username, archive_media_folder, - output_media_folder_name, tweet_icon_path, - media_sources, users, referenced_tweets)) + try: + converted_tweets.append(convert_tweet(tweet, username, archive_media_folder, + output_media_folder_name, tweet_icon_path, + media_sources, users, referenced_tweets)) + except Exception as err: + print(f"Could not convert tweet {tweet['id_str']} because: {err}") converted_tweets.sort(key=lambda tup: tup[0]) # oldest first # Group tweets by month (for markdown) From c7de3750b22a24195a59e23343241baedd662122 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Wed, 23 Nov 2022 20:50:50 +0100 Subject: [PATCH 03/46] replace image URLs in DMs with links to local files --- parser.py | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/parser.py b/parser.py index 2e86bf2..b2fc1c3 100755 --- a/parser.py +++ b/parser.py @@ -474,7 +474,7 @@ def chunks(lst: list, n: int): yield lst[i:i + n] -def parse_direct_messages(data_folder, username, users, user_id_url_template, dm_output_filename_template): +def parse_direct_messages(data_folder, output_media_folder_name, username, users, user_id_url_template, dm_output_filename_template): """Parse data_folder/direct-messages.js, write to one markdown file per conversation. Query Twitter API for the missing user handles, if the user agrees. 
""" @@ -507,11 +507,49 @@ def parse_direct_messages(data_folder, username, users, user_id_url_template, dm to_id = message_create['recipientId'] body = message_create['text'] # replace t.co URLs with their original versions - if 'urls' in message_create: + if 'urls' in message_create and len(message_create['urls']) > 0: for url in message_create['urls']: if 'url' in url and 'expanded' in url: expanded_url = url['expanded'] body = body.replace(url['url'], expanded_url) + # replace image URLs with image links to local files + if 'mediaUrls' in message_create \ + and len(message_create['mediaUrls']) == 1 \ + and 'urls' in message_create: + original_expanded_url = message_create['urls'][0]['expanded'] + message_id = message_create['id'] + media_hash_and_type = message_create['mediaUrls'][0].split('/')[-1] + archive_media_filename = f'{message_id}-{media_hash_and_type}' + new_url = output_media_folder_name + archive_media_filename + archive_media_path = \ + os.path.join(data_folder, 'direct_messages_media', archive_media_filename) + if os.path.isfile(archive_media_path): + # found a matching image, use this one + if not os.path.isfile(new_url): + shutil.copy(archive_media_path, new_url) + image_markdown = f'\n![]({new_url})\n' + body = body.replace(original_expanded_url, image_markdown) + + # TODO: Save the online location of the best-quality version of this file, + # for later upgrading if wanted (see convert_tweet method, but probably + # with a different url scheme) + else: + archive_media_paths = glob.glob( + os.path.join(data_folder, 'direct_messages_media', message_id + '*')) + if len(archive_media_paths) > 0: + for archive_media_path in archive_media_paths: + archive_media_filename = os.path.split(archive_media_path)[-1] + media_url = f'{output_media_folder_name}{archive_media_filename}' + if not os.path.isfile(media_url): + shutil.copy(archive_media_path, media_url) + video_markdown = f'\n\n' + body = body.replace(original_expanded_url, video_markdown) + # TODO: save the online location of the best-quality version (see above) + else: + print(f'Warning: missing local file: {archive_media_path}. ' + f'Using original link instead: {original_expanded_url})') + created_at = message_create['createdAt'] # example: 2022-01-27T15:58:52.744Z timestamp = \ int(round(datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp())) @@ -625,7 +663,7 @@ def main(): output_media_folder_name, tweet_icon_path, output_html_filename) parse_followings(data_folder, users, user_id_URL_template, output_following_filename) parse_followers(data_folder, users, user_id_URL_template, output_followers_filename) - parse_direct_messages(data_folder, username, users, user_id_URL_template, dm_output_filename_template) + parse_direct_messages(data_folder, output_media_folder_name, username, users, user_id_URL_template, dm_output_filename_template) # Download larger images, if the user agrees print(f"\nThe archive doesn't contain the original-size images. 
We can attempt to download them from twimg.com.") From f4e628b394fc7a18d69ce71664ecfdbe28ee169d Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Thu, 24 Nov 2022 01:27:55 +0100 Subject: [PATCH 04/46] url format for original size version of DM images, and comments explaining why it's not used --- parser.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/parser.py b/parser.py index b2fc1c3..20f19e3 100755 --- a/parser.py +++ b/parser.py @@ -519,6 +519,7 @@ def parse_direct_messages(data_folder, output_media_folder_name, username, users original_expanded_url = message_create['urls'][0]['expanded'] message_id = message_create['id'] media_hash_and_type = message_create['mediaUrls'][0].split('/')[-1] + media_id = message_create['mediaUrls'][0].split('/')[-2] archive_media_filename = f'{message_id}-{media_hash_and_type}' new_url = output_media_folder_name + archive_media_filename archive_media_path = \ @@ -530,9 +531,24 @@ def parse_direct_messages(data_folder, output_media_folder_name, username, users image_markdown = f'\n![]({new_url})\n' body = body.replace(original_expanded_url, image_markdown) - # TODO: Save the online location of the best-quality version of this file, - # for later upgrading if wanted (see convert_tweet method, but probably - # with a different url scheme) + # Save the online location of the best-quality version of this file, + # for later upgrading if wanted + best_quality_url = \ + f'https://ton.twitter.com/i//ton/data/dm/' \ + f'{message_id}/{media_id}/{media_hash_and_type}' + # there is no ':orig' here, the url without any suffix has the original size + + # TODO: a cookie (and a 'Referer: https://twitter.com' header) + # is needed to retrieve it, so the url might be useless anyway... + + # WARNING: Do not uncomment the statement below until the cookie problem is solved! + # media_sources.append( + # ( + # os.path.join(output_media_folder_name, archive_media_filename), + # best_quality_url + # ) + # ) + else: archive_media_paths = glob.glob( os.path.join(data_folder, 'direct_messages_media', message_id + '*')) @@ -545,7 +561,10 @@ def parse_direct_messages(data_folder, output_media_folder_name, username, users video_markdown = f'\n\n' body = body.replace(original_expanded_url, video_markdown) - # TODO: save the online location of the best-quality version (see above) + + # TODO: maybe also save the online location of the best-quality version for videos? + # (see above) + else: print(f'Warning: missing local file: {archive_media_path}. 
' f'Using original link instead: {original_expanded_url})')
                             created_at = message_create['createdAt']  # example: 2022-01-27T15:58:52.744Z
                             timestamp = \
                                 int(round(datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp()))

From 38f232aae3cca2bfd224e21f3e62d012a22a6ef3 Mon Sep 17 00:00:00 2001
From: Kirstin Rohwer
Date: Thu, 24 Nov 2022 03:09:16 +0100
Subject: [PATCH 05/46] parse group DMs and output them as markdown

---
 parser.py | 248 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 248 insertions(+)

diff --git a/parser.py b/parser.py
index 8ae3902..146e71f 100755
--- a/parser.py
+++ b/parser.py
@@ -589,6 +589,252 @@ def parse_direct_messages(data_folder, username, users, user_id_url_template, dm
             f"({num_written_messages} total messages) to {num_written_files} markdown files\n")


+def make_conversation_name_safe_for_filename(conversation_name: str) -> str:
+    """
+    Remove/replace characters that could be unsafe in filenames
+    """
+    forbidden_chars = \
+        ['"', "'", '*', '/', '\\', ':', '<', '>', '?', '|', '!', '@', ';', ',', '=', '.', '\n', '\r', '\t']
+    new_conversation_name = ''
+    for char in conversation_name:
+        if char in forbidden_chars:
+            new_conversation_name = new_conversation_name + '_'
+        elif char.isspace():
+            # replace spaces with underscores
+            new_conversation_name = new_conversation_name + '_'
+        elif ord(char) == 0x7F or (0x1F >= ord(char) >= 0x00):
+            # 0x00 - 0x1F and 0x7F are also forbidden, just discard them
+            continue
+        else:
+            new_conversation_name = new_conversation_name + char
+
+    return new_conversation_name
+
+
+def find_group_direct_message_participants(conversation: dict) -> set:
+    """
+    Find IDs of all participating Users in a group direct message
+    """
+    group_user_ids = set()
+    if 'dmConversation' in conversation and 'conversationId' in conversation['dmConversation']:
+        dm_conversation = conversation['dmConversation']
+        if 'messages' in dm_conversation:
+            for message in dm_conversation['messages']:
+                if 'messageCreate' in message:
+                    group_user_ids.add(message['messageCreate']['senderId'])
+                elif 'joinConversation' in message:
+                    group_user_ids.add(message['joinConversation']['initiatingUserId'])
+                    for participant_id in message['joinConversation']['participantsSnapshot']:
+                        group_user_ids.add(participant_id)
+    return group_user_ids
+
+
+def parse_group_direct_messages(data_folder, username, users, user_id_url_template, group_dm_output_filename_template):
+    """Parse data_folder/direct-messages-group.js, write to one markdown file per conversation.
+    Query Twitter API for the missing user handles, if the user agrees.
+    
+ """ + # Scan the group DMs for missing user handles + group_dms_json = read_json_from_js_file(os.path.join(data_folder, 'direct-messages-group.js')) + dm_user_ids = set() + for conversation in group_dms_json: + participants = find_group_direct_message_participants(conversation) + for participant_id in participants: + dm_user_ids.add(participant_id) + lookup_users(list(dm_user_ids), users) + + # Parse the group DMs, store messages and metadata in a dict + conversations_messages = defaultdict(list) + conversations_metadata = defaultdict(dict) + for conversation in group_dms_json: + if 'dmConversation' in conversation and 'conversationId' in conversation['dmConversation']: + dm_conversation = conversation['dmConversation'] + conversation_id = dm_conversation['conversationId'] + participants = find_group_direct_message_participants(conversation) + participant_names = [] + for participant_id in participants: + if participant_id in users: + participant_names.append(users[participant_id].handle) + else: + participant_names.append(user_id_url_template.format(participant_id)) + + # save names in metadata + conversations_metadata[conversation_id]['participants'] = participants + conversations_metadata[conversation_id]['participant_names'] = participant_names + conversations_metadata[conversation_id]['conversation_names'] = [(0, conversation_id)] + messages = [] + if 'messages' in dm_conversation: + for message in dm_conversation['messages']: + if 'messageCreate' in message: + message_create = message['messageCreate'] + if all(tag in message_create for tag in ['senderId', 'text', 'createdAt']): + from_id = message_create['senderId'] + body = message_create['text'] + # replace t.co URLs with their original versions + if 'urls' in message_create: + for url in message_create['urls']: + if 'url' in url and 'expanded' in url: + expanded_url = url['expanded'] + body = body.replace(url['url'], expanded_url) + # TODO: image links + created_at = message_create['createdAt'] # example: 2022-01-27T15:58:52.744Z + timestamp = int(round( + datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() + )) + from_handle = users[from_id].handle.replace('_', '\\_') if from_id in users \ + else user_id_url_template.format(from_id) + message_markdown = f'\n\n### {from_handle}: ({created_at}) ###\n```\n{body}\n```' + messages.append((timestamp, message_markdown)) + elif "conversationNameUpdate" in message: + conversation_name_update = message['conversationNameUpdate'] + if all(tag in conversation_name_update for tag in ['initiatingUserId', 'name', 'createdAt']): + from_id = conversation_name_update['initiatingUserId'] + body = f"_changed group name to: {conversation_name_update['name']}_" + created_at = conversation_name_update['createdAt'] # example: 2022-01-27T15:58:52.744Z + timestamp = int(round( + datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() + )) + from_handle = users[from_id].handle.replace('_', '\\_') if from_id in users \ + else user_id_url_template.format(from_id) + message_markdown = f'\n\n### {from_handle}: ({created_at}) ###\n\n{body}\n' + messages.append((timestamp, message_markdown)) + # save metadata about name change: + conversations_metadata[conversation_id]['conversation_names'].append( + (timestamp, conversation_name_update['name']) + ) + elif "joinConversation" in message: + join_conversation = message['joinConversation'] + if all(tag in join_conversation for tag in ['initiatingUserId', 'createdAt']): + from_id = join_conversation['initiatingUserId'] + created_at = 
join_conversation['createdAt'] # example: 2022-01-27T15:58:52.744Z + timestamp = int(round( + datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() + )) + from_handle = users[from_id].handle.replace('_', '\\_') if from_id in users \ + else user_id_url_template.format(from_id) + escaped_username = username.replace('_', '\\_') + body = f'_{from_handle} added {escaped_username} to the group_' + message_markdown = f'\n\n### {from_handle}: ({created_at}) ###\n\n{body}\n' + messages.append((timestamp, message_markdown)) + elif "participantsJoin" in message: + participants_join = message['participantsJoin'] + if all(tag in participants_join for tag in ['initiatingUserId', 'userIds', 'createdAt']): + from_id = participants_join['initiatingUserId'] + created_at = participants_join['createdAt'] # example: 2022-01-27T15:58:52.744Z + timestamp = int(round( + datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() + )) + from_handle = users[from_id].handle.replace('_', '\\_') if from_id in users \ + else user_id_url_template.format(from_id) + joined_ids = participants_join['userIds'] + joined_handles = [users[joined_id].handle.replace('_', '\\_') if joined_id in users + else user_id_url_template.format(joined_id) for joined_id in joined_ids] + name_list = ', '.join(joined_handles[:-1]) + \ + (f' and {joined_handles[-1]}' if len(joined_handles) > 1 else + joined_handles[0]) + body = f'_{from_handle} added {name_list} to the group_' + message_markdown = f'\n\n### {from_handle}: ({created_at}) ###\n\n{body}\n' + messages.append((timestamp, message_markdown)) + elif "participantsLeave" in message: + participants_leave = message['participantsLeave'] + if all(tag in participants_leave for tag in ['userIds', 'createdAt']): + created_at = participants_leave['createdAt'] # example: 2022-01-27T15:58:52.744Z + timestamp = int(round( + datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() + )) + left_ids = participants_leave['userIds'] + left_handles = [users[left_id].handle.replace('_', '\\_') if left_id in users + else user_id_url_template.format(left_id) for left_id in left_ids] + name_list = ', '.join(left_handles[:-1]) + \ + (f' and {left_handles[-1]}' if len(left_handles) > 1 else + left_handles[0]) + body = f'_{name_list} left the group_' + message_markdown = f'\n\n### {name_list}: ({created_at}) ###\n\n{body}\n' + messages.append((timestamp, message_markdown)) + + # collect messages per conversation in conversations_messages dict + conversations_messages[conversation_id].extend(messages) + + # output as one file per conversation (or part of long conversation) + num_written_messages = 0 + num_written_files = 0 + for conversation_id, messages in conversations_messages.items(): + # sort messages by timestamp + messages.sort(key=lambda tup: tup[0]) + # create conversation name for use in filename: + # first, try to find an official name in the parsed conversation data + conversations_metadata[conversation_id]['conversation_names'].sort(key=lambda tup: tup[0], reverse=True) + official_name = conversations_metadata[conversation_id]['conversation_names'][0][1] + safe_group_name = make_conversation_name_safe_for_filename(official_name) + if len(safe_group_name) < 2: + # discard name if it's too short (because of collision risk) + group_name = conversation_id + else: + group_name = safe_group_name + + if group_name == conversation_id: + # try to make a nice list of participant handles for the conversation name + handles = [] + for participant_id in 
conversations_metadata[conversation_id]['participants']: + if participant_id in users: + participant_handle = users[participant_id].handle + if participant_handle != username: + handles.append(participant_handle) + if len(handles) == 1: + group_name = \ + f'{handles[0]}_and_{len(conversations_metadata[conversation_id]["participants"]) - 1}_more' + elif len(handles) == 2 and len(conversations_metadata[conversation_id]["participants"]) == 3: + group_name = f'{handles[0]}_and_{handles[1]}_and_{username}' + elif len(handles) >= 2: + group_name = \ + f'{handles[0]}_and_{handles[1]}_and' \ + f'_{len(conversations_metadata[conversation_id]["participants"]) - 2}_more' + else: + # just use the conversation id + group_name = conversation_id + + # create a list of names of the form '@name1, @name2 and @name3' + # to use as a headline in the output file + escaped_participant_names = [ + participant_name.replace('_', '\\_') + for participant_name in conversations_metadata[conversation_id]['participant_names'] + ] + name_list = ', '.join(escaped_participant_names[:-1]) + \ + (f' and {escaped_participant_names[-1]}' + if len(escaped_participant_names) > 1 + else escaped_participant_names[0]) + + if len(messages) > 1000: + for chunk_index, chunk in enumerate(chunks(messages, 1000)): + markdown = '' + markdown += f'# {official_name}\n' + markdown += f'## Group conversation between {name_list}, part {chunk_index + 1}: ##\n' + markdown += ''.join(md for _, md in chunk) + conversation_output_filename = \ + group_dm_output_filename_template.format(f'{group_name}_part{chunk_index + 1:03}') + + # write part to a markdown file + with open(conversation_output_filename, 'w', encoding='utf8') as f: + f.write(markdown) + print(f'Wrote {len(chunk)} messages to {conversation_output_filename}') + num_written_files += 1 + else: + markdown = '' + markdown += f'# {official_name}\n' + markdown += f'## Group conversation between {name_list}: ##\n' + markdown += ''.join(md for _, md in messages) + conversation_output_filename = group_dm_output_filename_template.format(group_name) + + with open(conversation_output_filename, 'w', encoding='utf8') as f: + f.write(markdown) + print(f'Wrote {len(messages)} messages to {conversation_output_filename}') + num_written_files += 1 + + num_written_messages += len(messages) + + print(f"\nWrote {len(conversations_messages)} direct message group conversations " + f"({num_written_messages} total messages) to {num_written_files} markdown files") + + def main(): input_folder = '.' @@ -601,6 +847,7 @@ def main(): output_followers_filename = 'followers.txt' user_id_URL_template = 'https://twitter.com/i/user/{}' dm_output_filename_template = 'DMs-Archive-{}.md' + group_dm_output_filename_template = 'DMs-Group-Archive-{}.md' html_template = """\ @@ -641,6 +888,7 @@ def main(): parse_followings(data_folder, users, user_id_URL_template, output_following_filename) parse_followers(data_folder, users, user_id_URL_template, output_followers_filename) parse_direct_messages(data_folder, username, users, user_id_URL_template, dm_output_filename_template) + parse_group_direct_messages(data_folder, username, users, user_id_URL_template, group_dm_output_filename_template) # Download larger images, if the user agrees print(f"\nThe archive doesn't contain the original-size images. We can attempt to download them from twimg.com.") From f52dfa30121ec8c9a6f6e38201d2570e2f92d481 Mon Sep 17 00:00:00 2001 From: Lena Schimmel Date: Thu, 24 Nov 2022 23:34:36 +0100 Subject: [PATCH 06/46] Reorganize file output paths. 
This is mainly achieved by adding `create_path_for_file_` methods to `PathConfig` and using
them, and using `rel_url` to link to media files.
---
 parser.py | 137 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 93 insertions(+), 44 deletions(-)

diff --git a/parser.py b/parser.py
index c16c25a..11e804c 100755
--- a/parser.py
+++ b/parser.py
@@ -45,6 +45,46 @@ def __init__(self, id, handle = None):
         self.id = id
         self.handle = handle

+class PathConfig:
+    """
+    Helper class containing constants for various directories and files.
+
+    The script will only add / change / delete content in its own directories, which start with `parser-`.
+    Files within `parser-output` are the end result that the user is probably interested in.
+    Files within `parser-cache` are temporary working files, which improve the efficiency if you run
+    this script multiple times. They can safely be removed without harming the consistency of the
+    files within `parser-output`.
+    """
+    def __init__(self, dir_archive):
+        self.dir_archive = dir_archive
+        self.dir_input_data = os.path.join(self.dir_archive, 'data')
+        self.dir_input_media = find_dir_input_media(self.dir_input_data)
+        self.dir_output = os.path.join(dir_archive, 'parser-output')
+        self.dir_output_media = os.path.join(self.dir_output, 'media')
+        self.dir_output_cache = os.path.join(self.dir_archive, 'parser-cache')
+        self.file_account_js = os.path.join(self.dir_input_data, 'account.js')
+        self.file_download_log = os.path.join(self.dir_output_cache, 'download_log.txt')
+        self.file_tweet_icon = os.path.join(self.dir_output_media, 'tweet.ico')
+        self.files_input_tweets = find_files_input_tweets(self.dir_input_data)
+
+        # structured like an actual tweet output file, can be used to compute relative urls to a media file
+        self.example_file_output_tweets = self.create_path_for_file_output_tweets(year=2020, month=12)
+
+    def create_path_for_file_output_tweets(self, year, month, format="html", kind="tweets")->str:
+        """Builds the path for a tweet-archive file based on some properties."""
+        return os.path.join(self.dir_output, f"{kind}-{format}", f"{year:04}", f"{year:04}-{month:02}-01-{kind}.{format}")
+
+    def create_path_for_file_output_dms(self, name, index=None, format="html", kind="DMs")->str:
+        """Builds the path for a dm-archive file based on some properties."""
+        index_suffix = ""
+        if (index):
+            index_suffix = f"-part{index:02}"
+        return os.path.join(self.dir_output, kind, f"{kind}-{name}{index_suffix}.{format}")
+
+    def create_path_for_file_output_single(self, format: str, kind: str)->str:
+        """Builds the path for a single output file, i.e. one that is not part of a larger group or sequence."""
+        return os.path.join(self.dir_output, f"{kind}.{format}")
+

 def import_module(module):
     """Imports a module specified by a string. Example: requests = import_module('requests')"""
@@ -59,6 +99,24 @@ def import_module(module):
     return importlib.import_module(module)

+def open_and_mkdirs(path_file):
+    """Opens a file for writing. If the parent directory does not exist yet, it is created first."""
+    mkdirs_for_file(path_file)
+    return open(path_file, 'w', encoding='utf-8')
+
+
+def mkdirs_for_file(path_file):
+    """Creates the parent directory of the given file, if it does not exist yet."""
+    path_dir = os.path.split(path_file)[0]
+    os.makedirs(path_dir, exist_ok=True)
+
+
+def rel_url(media_path, document_path):
+    """Computes the relative URL needed to link from `document_path` to `media_path`.
+    Assumes that `document_path` points to a file (e.g. 
`.md` or `.html`), not a directory.""" + return os.path.relpath(media_path, os.path.split(document_path)[0]).replace("\\", "/") + + def get_twitter_api_guest_token(session, bearer_token): """Returns a Twitter API guest token for the current session.""" guest_token_response = session.post("https://api.twitter.com/1.1/guest/activate.json", @@ -134,13 +192,13 @@ def read_json_from_js_file(filename): return json.loads(data) -def extract_username(paths): +def extract_username(paths: PathConfig): """Returns the user's Twitter username from account.js.""" account = read_json_from_js_file(paths.file_account_js) return account[0]['account']['username'] -def convert_tweet(tweet, username, media_sources, users, paths): +def convert_tweet(tweet, username, media_sources, users, paths: PathConfig): """Converts a JSON-format tweet. Returns tuple of timestamp, markdown and HTML.""" if 'tweet' in tweet.keys(): tweet = tweet['tweet'] @@ -176,7 +234,7 @@ def convert_tweet(tweet, username, media_sources, users, paths): header_markdown = '' header_html = '' if 'in_reply_to_status_id' in tweet: - # match and remove all occurences of '@username ' at the start of the body + # match and remove all occurrences of '@username ' at the start of the body replying_to = re.match(r'^(@[0-9A-Za-z_]* )*', body_markdown)[0] if replying_to: body_markdown = body_markdown[len(replying_to):] @@ -205,7 +263,7 @@ def convert_tweet(tweet, username, media_sources, users, paths): archive_media_filename = tweet_id_str + '-' + original_filename archive_media_path = os.path.join(paths.dir_input_media, archive_media_filename) file_output_media = os.path.join(paths.dir_output_media, archive_media_filename) - media_url = f'{os.path.split(paths.dir_output_media)[1]}/{archive_media_filename}' + media_url = rel_url(file_output_media, paths.example_file_output_tweets) markdown += '' if not markdown and body_markdown == original_url else '\n\n' html += '' if not html and body_html == original_url else '
' if os.path.isfile(archive_media_path): @@ -224,7 +282,7 @@ def convert_tweet(tweet, username, media_sources, users, paths): for archive_media_path in archive_media_paths: archive_media_filename = os.path.split(archive_media_path)[-1] file_output_media = os.path.join(paths.dir_output_media, archive_media_filename) - media_url = f'{os.path.split(paths.dir_output_media)[1]}/{archive_media_filename}' + media_url = rel_url(file_output_media, paths.example_file_output_tweets) if not os.path.isfile(file_output_media): shutil.copy(archive_media_path, file_output_media) markdown += f'\n' @@ -255,8 +313,9 @@ def convert_tweet(tweet, username, media_sources, users, paths): body_html = '
<p><br>' + '<br>\n'.join(body_html.splitlines()) + '</p>'
     # append the original Twitter URL as a link
     original_tweet_url = f'https://twitter.com/{username}/status/{tweet_id_str}'
-    body_markdown = header_markdown + body_markdown + f'\n\n<img src="{paths.file_tweet_icon}" width="12" /> [{timestamp_str}]({original_tweet_url})'
-    body_html = header_html + body_html + f'<a href="{original_tweet_url}"><img src="{paths.file_tweet_icon}" width="12" />&nbsp;{timestamp_str}</a></p>'
+    icon_url = rel_url(paths.file_tweet_icon, paths.example_file_output_tweets)
+    body_markdown = header_markdown + body_markdown + f'\n\n<img src="{icon_url}" width="12" /> [{timestamp_str}]({original_tweet_url})'
+    body_html = header_html + body_html + f'<a href="{original_tweet_url}"><img src="{icon_url}" width="12" />&nbsp;{timestamp_str}</a></p>
' # extract user_id:handle connections if 'in_reply_to_user_id' in tweet and 'in_reply_to_screen_name' in tweet: id = tweet['in_reply_to_user_id'] @@ -366,13 +425,14 @@ def download_file_if_larger(url, filename, index, count, sleep_time): return False, 0 -def download_larger_media(media_sources, paths): +def download_larger_media(media_sources, paths: PathConfig): """Uses (filename, URL) tuples in media_sources to download files from remote storage. Aborts downloads if the remote file is the same size or smaller than the existing local version. Retries the failed downloads several times, with increasing pauses between each to avoid being blocked. """ # Log to file as well as the console logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') + mkdirs_for_file(paths.file_download_log) logfile_handler = logging.FileHandler(filename=paths.file_download_log, mode='w') logfile_handler.setLevel(logging.INFO) logging.getLogger().addHandler(logfile_handler) @@ -407,7 +467,7 @@ def download_larger_media(media_sources, paths): print(f'Wrote log to {paths.file_download_log}') -def parse_tweets(username, users, html_template, paths): +def parse_tweets(username, users, html_template, paths: PathConfig): """Read tweets from paths.files_input_tweets, write to *.md and *.html. Copy the media used to paths.dir_output_media. Collect user_id:user_handle mappings for later use, in 'users'. @@ -426,18 +486,19 @@ def parse_tweets(username, users, html_template, paths): for timestamp, md, html in tweets: # Use a (markdown) filename that can be imported into Jekyll: YYYY-MM-DD-your-title-here.md dt = datetime.datetime.fromtimestamp(timestamp) - filename = f'{dt.year}-{dt.month:02}-01-Tweet-Archive-{dt.year}-{dt.month:02}' # change to group by day or year or timestamp - grouped_tweets[filename].append((md, html)) + grouped_tweets[(dt.year, dt.month)].append((md, html)) - for filename, content in grouped_tweets.items(): + for (year, month), content in grouped_tweets.items(): # Write into *.md files md_string = '\n\n----\n\n'.join(md for md, _ in content) - with open(f'{filename}.md', 'w', encoding='utf-8') as f: + md_path = paths.create_path_for_file_output_tweets(year, month, format="md") + with open_and_mkdirs(md_path) as f: f.write(md_string) # Write into *.html files html_string = '
\n'.join(html for _, html in content) - with open(f'{filename}.html', 'w', encoding='utf-8') as f: + html_path = paths.create_path_for_file_output_tweets(year, month, format="html") + with open_and_mkdirs(html_path) as f: f.write(html_template.format(html_string)) print(f'Wrote {len(tweets)} tweets to *.md and *.html, with images and video embedded from {paths.dir_output_media}') @@ -445,7 +506,7 @@ def parse_tweets(username, users, html_template, paths): return media_sources -def parse_followings(users, URL_template_user_id, paths): +def parse_followings(users, URL_template_user_id, paths: PathConfig): """Parse paths.dir_input_data/following.js, write to paths.file_output_following. Query Twitter API for the missing user handles, if the user agrees. """ @@ -460,12 +521,13 @@ def parse_followings(users, URL_template_user_id, paths): handle = users[id].handle if id in users else '~unknown~handle~' following.append(handle + ' ' + URL_template_user_id.format(id)) following.sort() - with open(paths.file_output_following, 'w', encoding='utf8') as f: + following_output_path = paths.create_path_for_file_output_single(format="txt", kind="following") + with open_and_mkdirs(following_output_path) as f: f.write('\n'.join(following)) - print(f"Wrote {len(following)} accounts to {paths.file_output_following}") + print(f"Wrote {len(following)} accounts to {following_output_path}") -def parse_followers(users, URL_template_user_id, paths): +def parse_followers(users, URL_template_user_id, paths: PathConfig): """Parse paths.dir_input_data/followers.js, write to paths.file_output_followers. Query Twitter API for the missing user handles, if the user agrees. """ @@ -480,9 +542,10 @@ def parse_followers(users, URL_template_user_id, paths): handle = users[id].handle if id in users else '~unknown~handle~' followers.append(handle + ' ' + URL_template_user_id.format(id)) followers.sort() - with open(paths.file_output_followers, 'w', encoding='utf8') as f: + followers_output_path = paths.create_path_for_file_output_single(format="txt", kind="followers") + with open_and_mkdirs(followers_output_path) as f: f.write('\n'.join(followers)) - print(f"Wrote {len(followers)} accounts to {paths.file_output_followers}") + print(f"Wrote {len(followers)} accounts to {followers_output_path}") def chunks(lst: list, n: int): @@ -491,7 +554,7 @@ def chunks(lst: list, n: int): yield lst[i:i + n] -def parse_direct_messages(username, users, URL_template_user_id, paths): +def parse_direct_messages(username, users, URL_template_user_id, paths: PathConfig): """Parse paths.dir_input_data/direct-messages.js, write to one markdown file per conversation. Query Twitter API for the missing user handles, if the user agrees. 
""" @@ -564,24 +627,23 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): markdown = '' markdown += f'## Conversation between {username} and {other_user_name}, part {chunk_index+1}: ##\n' markdown += ''.join(md for _, md in chunk) - conversation_output_filename = \ - paths.file_template_dm_output.format(f'{other_user_short_name}_part{chunk_index+1:03}') + conversation_output_path = paths.create_path_for_file_output_dms(name=other_user_short_name, index=(chunk_index + 1), format="md") # write part to a markdown file - with open(conversation_output_filename, 'w', encoding='utf8') as f: + with open_and_mkdirs(conversation_output_path) as f: f.write(markdown) - print(f'Wrote {len(chunk)} messages to {conversation_output_filename}') + print(f'Wrote {len(chunk)} messages to {conversation_output_path}') num_written_files += 1 else: markdown = '' markdown += f'## Conversation between {username} and {other_user_name}: ##\n' markdown += ''.join(md for _, md in messages) - conversation_output_filename = paths.file_template_dm_output.format(other_user_short_name) + conversation_output_path = paths.create_path_for_file_output_dms(name=other_user_short_name, format="md") - with open(conversation_output_filename, 'w', encoding='utf8') as f: + with open_and_mkdirs(conversation_output_path) as f: f.write(markdown) - print(f'Wrote {len(messages)} messages to {conversation_output_filename}') + print(f'Wrote {len(messages)} messages to {conversation_output_path}') num_written_files += 1 num_written_messages += len(messages) @@ -590,23 +652,8 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): f"({num_written_messages} total messages) to {num_written_files} markdown files\n") -class PathConfig: - """Helper class containing constants for various directories and files.""" - def __init__(self, dir_archive, dir_output): - self.dir_input_data = os.path.join(dir_archive, 'data') - self.dir_input_media = find_dir_input_media(self.dir_input_data) - self.dir_output_media = os.path.join(dir_output, 'media') - self.file_output_following = os.path.join(dir_output, 'following.txt') - self.file_output_followers = os.path.join(dir_output, 'followers.txt') - self.file_template_dm_output = os.path.join(dir_output, 'DMs-Archive-{}.md') - self.file_account_js = os.path.join(self.dir_input_data, 'account.js') - self.file_download_log = os.path.join(self.dir_output_media, 'download_log.txt') - self.file_tweet_icon = os.path.join(self.dir_output_media, 'tweet.ico') - self.files_input_tweets = find_files_input_tweets(self.dir_input_data) - - def main(): - paths = PathConfig(dir_archive='.', dir_output='.') + paths = PathConfig(dir_archive='.') # Extract the username from data/account.js if not os.path.isfile(paths.file_account_js): @@ -642,6 +689,8 @@ def main(): if not os.path.isfile(paths.file_tweet_icon): shutil.copy('assets/images/favicon.ico', paths.file_tweet_icon); + # TODO move files from older top-level folders, if they have been written by an older version of this script + media_sources = parse_tweets(username, users, html_template, paths) parse_followings(users, URL_template_user_id, paths) parse_followers(users, URL_template_user_id, paths) From 3efe1b7260f381ecc2cbb46912941a47b4c333f1 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Fri, 25 Nov 2022 00:29:18 +0100 Subject: [PATCH 07/46] user handles in group dms output filename sorted by activity (for reproducible, deterministic filenames) --- parser.py | 63 +++++++++++++++++++++++++++++++++++-------------------- 1 
file changed, 40 insertions(+), 23 deletions(-) diff --git a/parser.py b/parser.py index 2cb6bca..6c28bf7 100755 --- a/parser.py +++ b/parser.py @@ -612,9 +612,9 @@ def make_conversation_name_safe_for_filename(conversation_name: str) -> str: return new_conversation_name -def find_group_direct_message_participants(conversation: dict) -> set: +def find_group_dm_conversation_participant_ids(conversation: dict) -> set: """ - Find IDs of all participating Users in a group direct message + Find IDs of all participating Users in a group direct message conversation """ group_user_ids = set() if 'dmConversation' in conversation and 'conversationId' in conversation['dmConversation']: @@ -627,6 +627,10 @@ def find_group_direct_message_participants(conversation: dict) -> set: group_user_ids.add(message['joinConversation']['initiatingUserId']) for participant_id in message['joinConversation']['participantsSnapshot']: group_user_ids.add(participant_id) + elif "participantsJoin" in message: + group_user_ids.add(message['participantsJoin']['initiatingUserId']) + for participant_id in message['participantsJoin']['userIds']: + group_user_ids.add(participant_id) return group_user_ids @@ -644,13 +648,13 @@ def parse_group_direct_messages(data_folder, username, users, user_id_url_templa lookup_users(list(dm_user_ids), users) # Parse the group DMs, store messages and metadata in a dict - conversations_messages = defaultdict(list) - conversations_metadata = defaultdict(dict) + group_conversations_messages = defaultdict(list) + group_conversations_metadata = defaultdict(dict) for conversation in group_dms_json: if 'dmConversation' in conversation and 'conversationId' in conversation['dmConversation']: dm_conversation = conversation['dmConversation'] conversation_id = dm_conversation['conversationId'] - participants = find_group_direct_message_participants(conversation) + participants = find_group_dm_conversation_participant_ids(conversation) participant_names = [] for participant_id in participants: if participant_id in users: @@ -659,9 +663,10 @@ def parse_group_direct_messages(data_folder, username, users, user_id_url_templa participant_names.append(user_id_url_template.format(participant_id)) # save names in metadata - conversations_metadata[conversation_id]['participants'] = participants - conversations_metadata[conversation_id]['participant_names'] = participant_names - conversations_metadata[conversation_id]['conversation_names'] = [(0, conversation_id)] + group_conversations_metadata[conversation_id]['participants'] = participants + group_conversations_metadata[conversation_id]['participant_names'] = participant_names + group_conversations_metadata[conversation_id]['conversation_names'] = [(0, conversation_id)] + group_conversations_metadata[conversation_id]['participant_message_count'] = defaultdict(int) messages = [] if 'messages' in dm_conversation: for message in dm_conversation['messages']: @@ -669,6 +674,8 @@ def parse_group_direct_messages(data_folder, username, users, user_id_url_templa message_create = message['messageCreate'] if all(tag in message_create for tag in ['senderId', 'text', 'createdAt']): from_id = message_create['senderId'] + # count how many messages this user has sent to the group + group_conversations_metadata[conversation_id]['participant_message_count'][from_id] += 1 body = message_create['text'] # replace t.co URLs with their original versions if 'urls' in message_create: @@ -699,7 +706,7 @@ def parse_group_direct_messages(data_folder, username, users, user_id_url_templa 
message_markdown = f'\n\n### {from_handle}: ({created_at}) ###\n\n{body}\n' messages.append((timestamp, message_markdown)) # save metadata about name change: - conversations_metadata[conversation_id]['conversation_names'].append( + group_conversations_metadata[conversation_id]['conversation_names'].append( (timestamp, conversation_name_update['name']) ) elif "joinConversation" in message: @@ -752,19 +759,26 @@ def parse_group_direct_messages(data_folder, username, users, user_id_url_templa message_markdown = f'\n\n### {name_list}: ({created_at}) ###\n\n{body}\n' messages.append((timestamp, message_markdown)) - # collect messages per conversation in conversations_messages dict - conversations_messages[conversation_id].extend(messages) + # collect messages per conversation in group_conversations_messages dict + group_conversations_messages[conversation_id].extend(messages) # output as one file per conversation (or part of long conversation) num_written_messages = 0 num_written_files = 0 - for conversation_id, messages in conversations_messages.items(): + for conversation_id, messages in group_conversations_messages.items(): # sort messages by timestamp messages.sort(key=lambda tup: tup[0]) # create conversation name for use in filename: # first, try to find an official name in the parsed conversation data - conversations_metadata[conversation_id]['conversation_names'].sort(key=lambda tup: tup[0], reverse=True) - official_name = conversations_metadata[conversation_id]['conversation_names'][0][1] + + # Not-so-fun fact: + # If the name was set before the archive's owner joined the group, the name is not included + # in the archive data and can't be found anywhere (except by looking it up from twitter, + # and that would probably need a cookie). So there are many groups that do actually have a name, + # but it can't be used here because we don't know it. 
+ + group_conversations_metadata[conversation_id]['conversation_names'].sort(key=lambda tup: tup[0], reverse=True) + official_name = group_conversations_metadata[conversation_id]['conversation_names'][0][1] safe_group_name = make_conversation_name_safe_for_filename(official_name) if len(safe_group_name) < 2: # discard name if it's too short (because of collision risk) @@ -775,20 +789,23 @@ def parse_group_direct_messages(data_folder, username, users, user_id_url_templa if group_name == conversation_id: # try to make a nice list of participant handles for the conversation name handles = [] - for participant_id in conversations_metadata[conversation_id]['participants']: + for participant_id, message_count in \ + group_conversations_metadata[conversation_id]['participant_message_count'].items(): if participant_id in users: participant_handle = users[participant_id].handle if participant_handle != username: - handles.append(participant_handle) + handles.append((participant_handle, message_count)) + # sort so that the most active users are at the start of the list + handles.sort(key=lambda tup: tup[1], reverse=True) if len(handles) == 1: group_name = \ - f'{handles[0]}_and_{len(conversations_metadata[conversation_id]["participants"]) - 1}_more' - elif len(handles) == 2 and len(conversations_metadata[conversation_id]["participants"]) == 3: - group_name = f'{handles[0]}_and_{handles[1]}_and_{username}' + f'{handles[0][0]}_and_{len(group_conversations_metadata[conversation_id]["participants"]) - 1}_more' + elif len(handles) == 2 and len(group_conversations_metadata[conversation_id]["participants"]) == 3: + group_name = f'{handles[0][0]}_and_{handles[1][0]}_and_{username}' elif len(handles) >= 2: group_name = \ - f'{handles[0]}_and_{handles[1]}_and' \ - f'_{len(conversations_metadata[conversation_id]["participants"]) - 2}_more' + f'{handles[0][0]}_and_{handles[1][0]}_and' \ + f'_{len(group_conversations_metadata[conversation_id]["participants"]) - 2}_more' else: # just use the conversation id group_name = conversation_id @@ -797,7 +814,7 @@ def parse_group_direct_messages(data_folder, username, users, user_id_url_templa # to use as a headline in the output file escaped_participant_names = [ participant_name.replace('_', '\\_') - for participant_name in conversations_metadata[conversation_id]['participant_names'] + for participant_name in group_conversations_metadata[conversation_id]['participant_names'] ] name_list = ', '.join(escaped_participant_names[:-1]) + \ (f' and {escaped_participant_names[-1]}' @@ -832,7 +849,7 @@ def parse_group_direct_messages(data_folder, username, users, user_id_url_templa num_written_messages += len(messages) - print(f"\nWrote {len(conversations_messages)} direct message group conversations " + print(f"\nWrote {len(group_conversations_messages)} direct message group conversations " f"({num_written_messages} total messages) to {num_written_files} markdown files") From 9155e3651c5b91b2ebea69962606fb8faec69601 Mon Sep 17 00:00:00 2001 From: Lena Schimmel Date: Fri, 25 Nov 2022 00:47:33 +0100 Subject: [PATCH 08/46] Bugfix: merge lists inside merged dicts, prevents None in known_tweets. 
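For reference, a minimal sketch of the whole merge() helper after this fix
(adapted from the Stack Overflow answer linked in the earlier TODO; the exact
handling of the count leaves is an assumption here, since the surrounding hunk
only shows that 'retweet_count' and 'favorite_count' get special treatment):

    def merge(a: dict, b: dict, path=None):
        """Recursively merge dict b into dict a, modifying a in place."""
        if path is None:
            path = []
        for key in b:
            if key in a:
                if isinstance(a[key], dict) and isinstance(b[key], dict):
                    merge(a[key], b[key], path + [str(key)])
                elif isinstance(a[key], list) and isinstance(b[key], list):
                    # new in this commit: merge lists element-wise instead of
                    # treating differing lists as a conflict
                    for item_b in b[key]:
                        if item_b not in a[key]:
                            a[key].append(item_b)
                elif a[key] == b[key]:
                    pass  # same leaf value, nothing to do
                elif key == 'retweet_count' or key == 'favorite_count':
                    # assumption: counts may differ between archive files;
                    # keep the larger one
                    a[key] = max(int(a[key]), int(b[key]))
                else:
                    raise Exception(f"Conflict at {'.'.join(path + [str(key)])}")
            else:
                a[key] = b[key]
        return a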
--- parser.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/parser.py b/parser.py index aaa0e50..1831ac2 100755 --- a/parser.py +++ b/parser.py @@ -210,6 +210,10 @@ def merge(a, b, path=None): if key in a: if isinstance(a[key], dict) and isinstance(b[key], dict): merge(a[key], b[key], path + [str(key)]) + elif isinstance(a[key], list) and isinstance(b[key], list): + for item_b in b[key]: + if item_b not in a[key]: + a[key].append(item_b) elif a[key] == b[key]: pass # same leaf value elif key == 'retweet_count' or key == 'favorite_count': From ba62af04b48c5628203a1e9fc25a27932f9716a9 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Fri, 25 Nov 2022 01:05:37 +0100 Subject: [PATCH 09/46] use pathconfig for group dm output filenames --- parser.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/parser.py b/parser.py index 6c28bf7..3cbfa38 100755 --- a/parser.py +++ b/parser.py @@ -634,18 +634,18 @@ def find_group_dm_conversation_participant_ids(conversation: dict) -> set: return group_user_ids -def parse_group_direct_messages(data_folder, username, users, user_id_url_template, group_dm_output_filename_template): """Parse data_folder/direct-messages-group.js, write to one markdown file per conversation. Query Twitter API for the missing user handles, if the user agrees. """ # Scan the group DMs for missing user handles - group_dms_json = read_json_from_js_file(os.path.join(data_folder, 'direct-messages-group.js')) dm_user_ids = set() for conversation in group_dms_json: participants = find_group_direct_message_participants(conversation) for participant_id in participants: dm_user_ids.add(participant_id) lookup_users(list(dm_user_ids), users) +def parse_group_direct_messages(username, users, user_id_url_template, paths): + group_dms_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'direct-messages-group.js')) # Parse the group DMs, store messages and metadata in a dict group_conversations_messages = defaultdict(list) @@ -828,7 +828,7 @@ def parse_group_direct_messages(data_folder, username, users, user_id_url_templa markdown += f'## Group conversation between {name_list}, part {chunk_index + 1}: ##\n' markdown += ''.join(md for _, md in chunk) conversation_output_filename = \ - group_dm_output_filename_template.format(f'{group_name}_part{chunk_index + 1:03}') + paths.file_template_group_dm_output.format(f'{group_name}_part{chunk_index + 1:03}') # write part to a markdown file with open(conversation_output_filename, 'w', encoding='utf8') as f: @@ -840,7 +840,7 @@ def parse_group_direct_messages(data_folder, username, users, user_id_url_templa markdown += f'# {official_name}\n' markdown += f'## Group conversation between {name_list}: ##\n' markdown += ''.join(md for _, md in messages) - conversation_output_filename = group_dm_output_filename_template.format(group_name) + conversation_output_filename = paths.file_template_group_dm_output.format(group_name) with open(conversation_output_filename, 'w', encoding='utf8') as f: f.write(markdown) @@ -856,21 +856,21 @@ def parse_group_direct_messages(data_folder, username, users, user_id_url_templa class PathConfig: """Helper class containing constants for various directories and files.""" def __init__(self, dir_archive, dir_output): - self.dir_input_data = os.path.join(dir_archive, 'data') - self.dir_input_media = find_dir_input_media(self.dir_input_data) - self.dir_output_media = os.path.join(dir_output, 'media') - self.file_output_following = os.path.join(dir_output, 
'following.txt') - self.file_output_followers = os.path.join(dir_output, 'followers.txt') - self.file_template_dm_output = os.path.join(dir_output, 'DMs-Archive-{}.md') - self.file_account_js = os.path.join(self.dir_input_data, 'account.js') - self.file_download_log = os.path.join(self.dir_output_media, 'download_log.txt') - self.file_tweet_icon = os.path.join(self.dir_output_media, 'tweet.ico') - self.files_input_tweets = find_files_input_tweets(self.dir_input_data) + self.dir_input_data = os.path.join(dir_archive, 'data') + self.dir_input_media = find_dir_input_media(self.dir_input_data) + self.dir_output_media = os.path.join(dir_output, 'media') + self.file_output_following = os.path.join(dir_output, 'following.txt') + self.file_output_followers = os.path.join(dir_output, 'followers.txt') + self.file_template_dm_output = os.path.join(dir_output, 'DMs-Archive-{}.md') + self.file_template_group_dm_output = os.path.join(dir_output, 'DMs-Group-Archive-{}.md') + self.file_account_js = os.path.join(self.dir_input_data, 'account.js') + self.file_download_log = os.path.join(self.dir_output_media, 'download_log.txt') + self.file_tweet_icon = os.path.join(self.dir_output_media, 'tweet.ico') + self.files_input_tweets = find_files_input_tweets(self.dir_input_data) def main(): paths = PathConfig(dir_archive='.', dir_output='.') - group_dm_output_filename_template = 'DMs-Group-Archive-{}.md' # Extract the username from data/account.js if not os.path.isfile(paths.file_account_js): @@ -910,8 +910,8 @@ def main(): parse_followings(users, URL_template_user_id, paths) parse_followers(users, URL_template_user_id, paths) parse_direct_messages(username, users, URL_template_user_id, paths) - parse_group_direct_messages(paths.dir_input_data, username, users, URL_template_user_id, group_dm_output_filename_template) + parse_group_direct_messages(username, users, URL_template_user_id, paths) # Download larger images, if the user agrees print(f"\nThe archive doesn't contain the original-size images. We can attempt to download them from twimg.com.") print(f'Please be aware that this script may download a lot of data, which will cost you money if you are') From 75fc713d93a6691139112fdf1bfc9075d9083f36 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Fri, 25 Nov 2022 01:08:09 +0100 Subject: [PATCH 10/46] separate collection of user ids from content parsing and output generation (for group dms) --- parser.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/parser.py b/parser.py index 3cbfa38..2602380 100755 --- a/parser.py +++ b/parser.py @@ -634,17 +634,26 @@ def find_group_dm_conversation_participant_ids(conversation: dict) -> set: return group_user_ids - """Parse data_folder/direct-messages-group.js, write to one markdown file per conversation. - Query Twitter API for the missing user handles, if the user agrees. +def collect_user_ids_from_group_direct_messages(paths) -> list: """ - # Scan the group DMs for missing user handles - dm_user_ids = set() + Collect all user ids that appear in the group direct messages archive data. + (For use in bulk online lookup from Twitter.) 
+ """ + # read JSON file from archive + group_dms_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'direct-messages-group.js')) + # collect all user ids in a set + group_dms_user_ids = set() for conversation in group_dms_json: - participants = find_group_direct_message_participants(conversation) + participants = find_group_dm_conversation_participant_ids(conversation) for participant_id in participants: - dm_user_ids.add(participant_id) - lookup_users(list(dm_user_ids), users) + group_dms_user_ids.add(participant_id) + return list(group_dms_user_ids) + + def parse_group_direct_messages(username, users, user_id_url_template, paths): + """Parse data_folder/direct-messages-group.js, write to one markdown file per conversation. + """ + # read JSON file from archive group_dms_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'direct-messages-group.js')) # Parse the group DMs, store messages and metadata in a dict @@ -911,7 +920,16 @@ def main(): parse_followers(users, URL_template_user_id, paths) parse_direct_messages(username, users, URL_template_user_id, paths) + # find user ids to look up from group dms + group_dms_user_ids = collect_user_ids_from_group_direct_messages(paths) + # TODO: separate the collecting of user ids out of the other parse* functions in the same way + # and pool the lookups together before all of the other parsing & output generation + # look them up + lookup_users(group_dms_user_ids, users) + + # parse the content of group dms and write to output files parse_group_direct_messages(username, users, URL_template_user_id, paths) + # Download larger images, if the user agrees print(f"\nThe archive doesn't contain the original-size images. We can attempt to download them from twimg.com.") print(f'Please be aware that this script may download a lot of data, which will cost you money if you are') From feef67907dade53be71c81083af19f130d694d5f Mon Sep 17 00:00:00 2001 From: Lena Schimmel Date: Fri, 25 Nov 2022 01:30:07 +0100 Subject: [PATCH 11/46] Make a minimal commit to test if GitHub is still broken... --- parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser.py b/parser.py index 1831ac2..ffea0aa 100755 --- a/parser.py +++ b/parser.py @@ -187,7 +187,7 @@ def collect_tweet_id(tweet): tweet = tweet['tweet'] return tweet['id_str'] -# returns an it if you give it either an int or a str that can be parsed as +# returns an int if you give it either an int or a str that can be parsed as # an int. Otherwise, returns None. 
def parse_as_number(str_or_number): if isinstance(str_or_number, str): From 04e3a75f96f7117dbf7cec7ef1db99de9643a7f4 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Fri, 25 Nov 2022 01:49:18 +0100 Subject: [PATCH 12/46] copy image files from group dms to output media dir and embed images in markdown output --- parser.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/parser.py b/parser.py index 2602380..d9b38f6 100755 --- a/parser.py +++ b/parser.py @@ -692,7 +692,65 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths): if 'url' in url and 'expanded' in url: expanded_url = url['expanded'] body = body.replace(url['url'], expanded_url) - # TODO: image links + # replace image URLs with image links to local files + if 'mediaUrls' in message_create \ + and len(message_create['mediaUrls']) == 1 \ + and 'urls' in message_create: + original_expanded_url = message_create['urls'][0]['expanded'] + message_id = message_create['id'] + media_hash_and_type = message_create['mediaUrls'][0].split('/')[-1] + media_id = message_create['mediaUrls'][0].split('/')[-2] + archive_media_filename = f'{message_id}-{media_hash_and_type}' + new_url = os.path.join(paths.dir_output_media, archive_media_filename) + archive_media_path = \ + os.path.join(paths.dir_input_data, 'direct_messages_group_media', + archive_media_filename) + if os.path.isfile(archive_media_path): + # found a matching image, use this one + if not os.path.isfile(new_url): + shutil.copy(archive_media_path, new_url) + image_markdown = f'\n![]({new_url})\n' + body = body.replace(original_expanded_url, image_markdown) + + # Save the online location of the best-quality version of this file, + # for later upgrading if wanted + best_quality_url = \ + f'https://ton.twitter.com/i//ton/data/dm/' \ + f'{message_id}/{media_id}/{media_hash_and_type}' + # there is no ':orig' here, the url without any suffix has the original size + + # TODO: a cookie (and a 'Referer: https://twitter.com' header) + # is needed to retrieve it, so the url might be useless anyway... + + # WARNING: Do not uncomment the statement below until the cookie problem is solved! + # media_sources.append( + # ( + # os.path.join(output_media_folder_name, archive_media_filename), + # best_quality_url + # ) + # ) + + else: + archive_media_paths = glob.glob( + os.path.join(paths.dir_input_data, 'direct_messages_group_media', + message_id + '*')) + if len(archive_media_paths) > 0: + for archive_media_path in archive_media_paths: + archive_media_filename = os.path.split(archive_media_path)[-1] + media_url = os.path.join(paths.dir_output_media, + archive_media_filename) + if not os.path.isfile(media_url): + shutil.copy(archive_media_path, media_url) + video_markdown = f'\n\n' + body = body.replace(original_expanded_url, video_markdown) + + # TODO: maybe also save the online location of the best-quality version for videos? + # (see above) + + else: + print(f'Warning: missing local file: {archive_media_path}. 
' + f'Using original link instead: {original_expanded_url})') created_at = message_create['createdAt'] # example: 2022-01-27T15:58:52.744Z timestamp = int(round( datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() From fc6abe641e9b2336ddde5bea60331c3a03d8861e Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Fri, 25 Nov 2022 11:33:39 +0100 Subject: [PATCH 13/46] refactored: collect user ids from dms (separately) and do lookup before parsing --- parser.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/parser.py b/parser.py index f649743..ba189fe 100755 --- a/parser.py +++ b/parser.py @@ -491,21 +491,30 @@ def chunks(lst: list, n: int): yield lst[i:i + n] -def parse_direct_messages(username, users, URL_template_user_id, paths): - """Parse paths.dir_input_data/direct-messages.js, write to one markdown file per conversation. - Query Twitter API for the missing user handles, if the user agrees. +def collect_user_ids_from_direct_messages(paths) -> list: """ - # Scan the DMs for missing user handles + Collect all user ids that appear in the direct messages archive data. + (For use in bulk online lookup from Twitter.) + """ + # read JSON file from archive dms_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'direct-messages.js')) - dm_user_ids = set() + # collect all user ids in a set + dms_user_ids = set() for conversation in dms_json: if 'dmConversation' in conversation and 'conversationId' in conversation['dmConversation']: dm_conversation = conversation['dmConversation'] conversation_id = dm_conversation['conversationId'] user1_id, user2_id = conversation_id.split('-') - dm_user_ids.add(user1_id) - dm_user_ids.add(user2_id) - lookup_users(list(dm_user_ids), users) + dms_user_ids.add(user1_id) + dms_user_ids.add(user2_id) + return list(dms_user_ids) + + +def parse_direct_messages(username, users, URL_template_user_id, paths): + """Parse paths.dir_input_data/direct-messages.js, write to one markdown file per conversation. + """ + # read JSON file + dms_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'direct-messages.js')) # Parse the DMs and store the messages in a dict conversations_messages = defaultdict(list) @@ -652,6 +661,10 @@ def main(): media_sources = parse_tweets(username, users, html_template, paths) parse_followings(users, URL_template_user_id, paths) parse_followers(users, URL_template_user_id, paths) + + dms_user_ids = collect_user_ids_from_direct_messages(paths) + print(f'found {len(dms_user_ids)} user IDs in direct messages.') + lookup_users(dms_user_ids, users) parse_direct_messages(username, users, URL_template_user_id, paths) # Download larger images, if the user agrees From 4a76d17b7b8d13cf6b59388d752f333f764a7bc6 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Fri, 25 Nov 2022 11:40:20 +0100 Subject: [PATCH 14/46] refactored: collect user ids from followers (separately) and do lookup before parsing --- parser.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/parser.py b/parser.py index ba189fe..318f646 100755 --- a/parser.py +++ b/parser.py @@ -465,9 +465,23 @@ def parse_followings(users, URL_template_user_id, paths): print(f"Wrote {len(following)} accounts to {paths.file_output_following}") +def collect_user_ids_from_followers(paths) -> list: + """ + Collect all user ids that appear in the followers archive data. + (For use in bulk online lookup from Twitter.) 
+ """ + # read JSON file from archive + follower_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'follower.js')) + # collect all user ids in a list + follower_ids = [] + for follower in follower_json: + if 'follower' in follower and 'accountId' in follower['follower']: + follower_ids.append(follower['follower']['accountId']) + return follower_ids + + def parse_followers(users, URL_template_user_id, paths): """Parse paths.dir_input_data/followers.js, write to paths.file_output_followers. - Query Twitter API for the missing user handles, if the user agrees. """ followers = [] follower_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'follower.js')) @@ -475,10 +489,9 @@ def parse_followers(users, URL_template_user_id, paths): for follower in follower_json: if 'follower' in follower and 'accountId' in follower['follower']: follower_ids.append(follower['follower']['accountId']) - lookup_users(follower_ids, users) - for id in follower_ids: - handle = users[id].handle if id in users else '~unknown~handle~' - followers.append(handle + ' ' + URL_template_user_id.format(id)) + for follower_id in follower_ids: + handle = users[follower_id].handle if follower_id in users else '~unknown~handle~' + followers.append(handle + ' ' + URL_template_user_id.format(follower_id)) followers.sort() with open(paths.file_output_followers, 'w', encoding='utf8') as f: f.write('\n'.join(followers)) @@ -660,6 +673,10 @@ def main(): media_sources = parse_tweets(username, users, html_template, paths) parse_followings(users, URL_template_user_id, paths) + + follower_ids = collect_user_ids_from_followers(paths) + print(f'found {len(follower_ids)} user IDs in followers.') + lookup_users(follower_ids, users) parse_followers(users, URL_template_user_id, paths) dms_user_ids = collect_user_ids_from_direct_messages(paths) From d1bfca84128c893f1de5acce25ecfc7ef6594b70 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Fri, 25 Nov 2022 11:43:53 +0100 Subject: [PATCH 15/46] refactored: collect user ids from followings (separately) and do lookup before parsing --- parser.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/parser.py b/parser.py index 318f646..c6f4554 100755 --- a/parser.py +++ b/parser.py @@ -445,9 +445,23 @@ def parse_tweets(username, users, html_template, paths): return media_sources +def collect_user_ids_from_followings(paths) -> list: + """ + Collect all user ids that appear in the followings archive data. + (For use in bulk online lookup from Twitter.) + """ + # read JSON file from archive + following_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'following.js')) + # collect all user ids in a list + following_ids = [] + for follow in following_json: + if 'following' in follow and 'accountId' in follow['following']: + following_ids.append(follow['following']['accountId']) + return following_ids + + def parse_followings(users, URL_template_user_id, paths): """Parse paths.dir_input_data/following.js, write to paths.file_output_following. - Query Twitter API for the missing user handles, if the user agrees. 
""" following = [] following_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'following.js')) @@ -455,10 +469,9 @@ def parse_followings(users, URL_template_user_id, paths): for follow in following_json: if 'following' in follow and 'accountId' in follow['following']: following_ids.append(follow['following']['accountId']) - lookup_users(following_ids, users) - for id in following_ids: - handle = users[id].handle if id in users else '~unknown~handle~' - following.append(handle + ' ' + URL_template_user_id.format(id)) + for following_id in following_ids: + handle = users[following_id].handle if following_id in users else '~unknown~handle~' + following.append(handle + ' ' + URL_template_user_id.format(following_id)) following.sort() with open(paths.file_output_following, 'w', encoding='utf8') as f: f.write('\n'.join(following)) @@ -672,6 +685,10 @@ def main(): shutil.copy('assets/images/favicon.ico', paths.file_tweet_icon); media_sources = parse_tweets(username, users, html_template, paths) + + following_ids = collect_user_ids_from_followings(paths) + print(f'found {len(following_ids)} user IDs in followings.') + lookup_users(following_ids, users) parse_followings(users, URL_template_user_id, paths) follower_ids = collect_user_ids_from_followers(paths) From 1609d98cef827c654138aaa769af33ecb190961a Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Fri, 25 Nov 2022 11:54:38 +0100 Subject: [PATCH 16/46] bundle the lookup of user handles (from followings, followers and direct messages) together --- parser.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/parser.py b/parser.py index c6f4554..bf86d4b 100755 --- a/parser.py +++ b/parser.py @@ -116,6 +116,7 @@ def lookup_users(user_ids, users): except Exception as err: print(f'Failed to download user data: {err}') + def read_json_from_js_file(filename): """Reads the contents of a Twitter-produced .js file into a dictionary.""" print(f'Parsing {filename}...') @@ -688,17 +689,17 @@ def main(): following_ids = collect_user_ids_from_followings(paths) print(f'found {len(following_ids)} user IDs in followings.') - lookup_users(following_ids, users) - parse_followings(users, URL_template_user_id, paths) - follower_ids = collect_user_ids_from_followers(paths) print(f'found {len(follower_ids)} user IDs in followers.') - lookup_users(follower_ids, users) - parse_followers(users, URL_template_user_id, paths) - dms_user_ids = collect_user_ids_from_direct_messages(paths) print(f'found {len(dms_user_ids)} user IDs in direct messages.') - lookup_users(dms_user_ids, users) + + # bulk lookup for user handles from followers, followings and direct messages + collected_user_ids = list(set(following_ids).union(set(follower_ids)).union(set(dms_user_ids))) + lookup_users(collected_user_ids, users) + + parse_followings(users, URL_template_user_id, paths) + parse_followers(users, URL_template_user_id, paths) parse_direct_messages(username, users, URL_template_user_id, paths) # Download larger images, if the user agrees From c61911eeddc2dd7b11d77287be81ba6483820e49 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Fri, 25 Nov 2022 12:15:20 +0100 Subject: [PATCH 17/46] add empty lines to the output for better readability --- parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parser.py b/parser.py index bf86d4b..e71c782 100755 --- a/parser.py +++ b/parser.py @@ -101,7 +101,7 @@ def lookup_users(user_ids, users): return # Account metadata observed at ~2.1KB on average. 
estimated_size = int(2.1 * len(filtered_user_ids)) - print(f'{len(filtered_user_ids)} users are unknown.') + print(f'\n{len(filtered_user_ids)} users are unknown.') user_input = input(f'Download user data from Twitter (approx {estimated_size:,}KB)? [y/n]') if user_input.lower() not in ('y', 'yes'): return @@ -113,6 +113,7 @@ def lookup_users(user_ids, users): retrieved_users = get_twitter_users(session, bearer_token, guest_token, filtered_user_ids) for user_id, user in retrieved_users.items(): users[user_id] = UserData(user_id, user["screen_name"]) + print() # empty line for better readability of output except Exception as err: print(f'Failed to download user data: {err}') From 373198c7214aa40747aa4f38fd15a8fb63700f25 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Fri, 25 Nov 2022 17:16:02 +0100 Subject: [PATCH 18/46] escape md control chars in md output of tweet text body --- parser.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/parser.py b/parser.py index f649743..51e796b 100755 --- a/parser.py +++ b/parser.py @@ -140,6 +140,25 @@ def extract_username(paths): return account[0]['account']['username'] +def escape_markdown(input_text: str) -> str: + """ + Escape markdown control characters from input text so that the text will not break in rendered markdown. + (Only use on unformatted text parts that do not yet have any markdown control characters added on purpose!) + """ + characters_to_escape: str = r"\_*[]()~`>#+-=|{}.!" + output_text: str = '' + for char in input_text: + if char in characters_to_escape: + # add backslash before control char + output_text = output_text + "\\" + char + elif char == '\n': + # add double space before line break + output_text = output_text + " " + char + else: + output_text = output_text + char + return output_text + + def convert_tweet(tweet, username, media_sources, users, paths): """Converts a JSON-format tweet. Returns tuple of timestamp, markdown and HTML.""" if 'tweet' in tweet.keys(): @@ -172,11 +191,12 @@ def convert_tweet(tweet, username, media_sources, users, paths): body_markdown = body_markdown.replace(url['url'], expanded_url) expanded_url_html = f'{expanded_url}' body_html = body_html.replace(url['url'], expanded_url_html) - # if the tweet is a reply, construct a header that links the names of the accounts being replied to the tweet being replied to + # if the tweet is a reply, construct a header that links the names + # of the accounts being replied to the tweet being replied to header_markdown = '' header_html = '' if 'in_reply_to_status_id' in tweet: - # match and remove all occurences of '@username ' at the start of the body + # match and remove all occurrences of '@username ' at the start of the body replying_to = re.match(r'^(@[0-9A-Za-z_]* )*', body_markdown)[0] if replying_to: body_markdown = body_markdown[len(replying_to):] @@ -191,8 +211,10 @@ def convert_tweet(tweet, username, media_sources, users, paths): name_list = ', '.join(names[:-1]) + (f' and {names[-1]}' if len(names) > 1 else names[0]) in_reply_to_status_id = tweet['in_reply_to_status_id'] replying_to_url = f'https://twitter.com/{in_reply_to_screen_name}/status/{in_reply_to_status_id}' - header_markdown += f'Replying to [{name_list}]({replying_to_url})\n\n' + header_markdown += f'Replying to [{escape_markdown(name_list)}]({replying_to_url})\n\n' header_html += f'Replying to {name_list}
' + # escape tweet body for markdown rendering: + body_markdown = escape_markdown(body_markdown) # replace image URLs with image links to local files if 'entities' in tweet and 'media' in tweet['entities'] and 'extended_entities' in tweet and 'media' in tweet['extended_entities']: original_url = tweet['entities']['media'][0]['url'] @@ -206,7 +228,7 @@ def convert_tweet(tweet, username, media_sources, users, paths): archive_media_path = os.path.join(paths.dir_input_media, archive_media_filename) file_output_media = os.path.join(paths.dir_output_media, archive_media_filename) media_url = f'{os.path.split(paths.dir_output_media)[1]}/{archive_media_filename}' - markdown += '' if not markdown and body_markdown == original_url else '\n\n' + markdown += '' if not markdown and body_markdown == escape_markdown(original_url) else '\n\n' html += '' if not html and body_html == original_url else '
' if os.path.isfile(archive_media_path): # Found a matching image, use this one @@ -248,7 +270,7 @@ def convert_tweet(tweet, username, media_sources, users, paths): print(f'Warning: missing local file: {archive_media_path}. Using original link instead: {original_url} (expands to {original_expanded_url})') markdown += f'![]({original_url})' html += f'{original_url}' - body_markdown = body_markdown.replace(original_url, markdown) + body_markdown = body_markdown.replace(escape_markdown(original_url), markdown) body_html = body_html.replace(original_url, html) # make the body a quote body_markdown = '> ' + '\n> '.join(body_markdown.splitlines()) From a8d0e05b64461a60f53d88106fc13e27795cd358 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Fri, 25 Nov 2022 19:49:47 +0100 Subject: [PATCH 19/46] check if user is in the correct folder before init of the other paths --- parser.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/parser.py b/parser.py index f649743..8d54eca 100755 --- a/parser.py +++ b/parser.py @@ -599,26 +599,32 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): class PathConfig: """Helper class containing constants for various directories and files.""" + def __init__(self, dir_archive, dir_output): - self.dir_input_data = os.path.join(dir_archive, 'data') - self.dir_input_media = find_dir_input_media(self.dir_input_data) - self.dir_output_media = os.path.join(dir_output, 'media') - self.file_output_following = os.path.join(dir_output, 'following.txt') - self.file_output_followers = os.path.join(dir_output, 'followers.txt') - self.file_template_dm_output = os.path.join(dir_output, 'DMs-Archive-{}.md') - self.file_account_js = os.path.join(self.dir_input_data, 'account.js') - self.file_download_log = os.path.join(self.dir_output_media, 'download_log.txt') - self.file_tweet_icon = os.path.join(self.dir_output_media, 'tweet.ico') - self.files_input_tweets = find_files_input_tweets(self.dir_input_data) + self.dir_input_data = os.path.join(dir_archive, 'data') + self.file_account_js = os.path.join(self.dir_input_data, 'account.js') + + # check if user is in correct folder + if not os.path.isfile(self.file_account_js): + print( + f'Error: Failed to load {self.file_account_js}. Start this script in the root folder of your Twitter archive.') + exit() + + self.dir_input_media = find_dir_input_media(self.dir_input_data) + self.dir_output_media = os.path.join(dir_output, 'media') + self.file_output_following = os.path.join(dir_output, 'following.txt') + self.file_output_followers = os.path.join(dir_output, 'followers.txt') + self.file_template_dm_output = os.path.join(dir_output, 'DMs-Archive-{}.md') + + self.file_download_log = os.path.join(self.dir_output_media, 'download_log.txt') + self.file_tweet_icon = os.path.join(self.dir_output_media, 'tweet.ico') + self.files_input_tweets = find_files_input_tweets(self.dir_input_data) def main(): paths = PathConfig(dir_archive='.', dir_output='.') - # Extract the username from data/account.js - if not os.path.isfile(paths.file_account_js): - print(f'Error: Failed to load {paths.file_account_js}. 
Start this script in the root folder of your Twitter archive.') - exit() + # Extract the archive owner's username from data/account.js username = extract_username(paths) # URL config From 0810231cb5cd48867297b27418e2383eb81bc996 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Sat, 26 Nov 2022 17:44:13 +0100 Subject: [PATCH 20/46] include users from group DMs in bulk handle lookup, extra prompt if there are many unknown follower handles (and some improved code formatting) --- parser.py | 100 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 35 deletions(-) diff --git a/parser.py b/parser.py index 1798919..cdd9a40 100755 --- a/parser.py +++ b/parser.py @@ -102,7 +102,7 @@ def lookup_users(user_ids, users): # Account metadata observed at ~2.1KB on average. estimated_size = int(2.1 * len(filtered_user_ids)) print(f'\n{len(filtered_user_ids)} users are unknown.') - user_input = input(f'Download user data from Twitter (approx {estimated_size:,}KB)? [y/n]') + user_input = input(f'Download user data from Twitter (approx {estimated_size:,} KB)? [y/N]') if user_input.lower() not in ('y', 'yes'): return requests = import_module('requests') @@ -174,7 +174,8 @@ def convert_tweet(tweet, username, media_sources, users, paths): body_markdown = body_markdown.replace(url['url'], expanded_url) expanded_url_html = f'{expanded_url}' body_html = body_html.replace(url['url'], expanded_url_html) - # if the tweet is a reply, construct a header that links the names of the accounts being replied to the tweet being replied to + # if the tweet is a reply, construct a header that links the names of the accounts being replied to + # to the tweet being replied to header_markdown = '' header_html = '' if 'in_reply_to_status_id' in tweet: @@ -196,7 +197,8 @@ def convert_tweet(tweet, username, media_sources, users, paths): header_markdown += f'Replying to [{name_list}]({replying_to_url})\n\n' header_html += f'Replying to {name_list}
' # replace image URLs with image links to local files - if 'entities' in tweet and 'media' in tweet['entities'] and 'extended_entities' in tweet and 'media' in tweet['extended_entities']: + if 'entities' in tweet and 'media' in tweet['entities'] and 'extended_entities' in tweet \ + and 'media' in tweet['extended_entities']: original_url = tweet['entities']['media'][0]['url'] markdown = '' html = '' @@ -229,9 +231,12 @@ def convert_tweet(tweet, username, media_sources, users, paths): media_url = f'{os.path.split(paths.dir_output_media)[1]}/{archive_media_filename}' if not os.path.isfile(file_output_media): shutil.copy(archive_media_path, file_output_media) - markdown += f'\n' - html += f'\n' - # Save the online location of the best-quality version of this file, for later upgrading if wanted + markdown += f'\n' + html += f'\n' + # Save the online location of the best-quality version of this file, + # for later upgrading if wanted if 'video_info' in media and 'variants' in media['video_info']: best_quality_url = '' best_bitrate = -1 # some valid videos are marked with bitrate=0 in the JSON @@ -257,8 +262,10 @@ def convert_tweet(tweet, username, media_sources, users, paths): body_html = '
<p><blockquote>' + '<br>\n'.join(body_html.splitlines()) + '</blockquote>
' # append the original Twitter URL as a link original_tweet_url = f'https://twitter.com/{username}/status/{tweet_id_str}' - body_markdown = header_markdown + body_markdown + f'\n\n [{timestamp_str}]({original_tweet_url})' - body_html = header_html + body_html + f' {timestamp_str}
</a></p>' + body_markdown = header_markdown + body_markdown + f'\n\n ' \ + f'[{timestamp_str}]({original_tweet_url})' + body_html = header_html + body_html + f'<a href="{original_tweet_url}">{timestamp_str}</a></p>
' # extract user_id:handle connections if 'in_reply_to_user_id' in tweet and 'in_reply_to_screen_name' in tweet: id = tweet['in_reply_to_user_id'] @@ -276,7 +283,8 @@ def convert_tweet(tweet, username, media_sources, users, paths): def find_files_input_tweets(dir_path_input_data): - """Identify the tweet archive's file and folder names - they change slightly depending on the archive size it seems.""" + """Identify the tweet archive's file and folder names - + they change slightly depending on the archive size it seems.""" input_tweets_file_templates = ['tweet.js', 'tweets.js', 'tweets-part*.js'] files_paths_input_tweets = [] for input_tweets_file_template in input_tweets_file_templates: @@ -319,11 +327,12 @@ def download_file_if_larger(url, filename, index, count, sleep_time): try: with requests.get(url, stream=True, timeout=2) as res: if not res.status_code == 200: - # Try to get content of response as `res.text`. For twitter.com, this will be empty in most (all?) cases. + # Try to get content of response as `res.text`. + # For twitter.com, this will be empty in most (all?) cases. # It is successfully tested with error responses from other domains. raise Exception(f'Download failed with status "{res.status_code} {res.reason}". Response content: "{res.text}"') byte_size_after = int(res.headers['content-length']) - if (byte_size_after != byte_size_before): + if byte_size_after != byte_size_before: # Proceed with the full download tmp_filename = filename+'.tmp' print(f'{pref}Downloading {url}... ', end='\r') @@ -335,30 +344,32 @@ def download_file_if_larger(url, filename, index, count, sleep_time): pixels_before, pixels_after = width_before * height_before, width_after * height_after pixels_percentage_increase = 100.0 * (pixels_after - pixels_before) / pixels_before - if (width_before == -1 and height_before == -1 and width_after == -1 and height_after == -1): + if width_before == -1 and height_before == -1 and width_after == -1 and height_after == -1: # could not check size of both versions, probably a video or unsupported image format os.replace(tmp_filename, filename) bytes_percentage_increase = 100.0 * (byte_size_after - byte_size_before) / byte_size_before logging.info(f'{pref}SUCCESS. New version is {bytes_percentage_increase:3.0f}% ' f'larger in bytes (pixel comparison not possible). {post}') return True, byte_size_after - elif (width_before == -1 or height_before == -1 or width_after == -1 or height_after == -1): + elif width_before == -1 or height_before == -1 or width_after == -1 or height_after == -1: # could not check size of one version, this should not happen (corrupted download?) logging.info(f'{pref}SKIPPED. Pixel size comparison inconclusive: ' f'{width_before}*{height_before}px vs. {width_after}*{height_after}px. {post}') return False, byte_size_after - elif (pixels_after >= pixels_before): + elif pixels_after >= pixels_before: os.replace(tmp_filename, filename) bytes_percentage_increase = 100.0 * (byte_size_after - byte_size_before) / byte_size_before - if (bytes_percentage_increase >= 0): + if bytes_percentage_increase >= 0: logging.info(f'{pref}SUCCESS. New version is {bytes_percentage_increase:3.0f}% larger in bytes ' - f'and {pixels_percentage_increase:3.0f}% larger in pixels. {post}') + f'and {pixels_percentage_increase:3.0f}% larger in pixels. {post}') else: - logging.info(f'{pref}SUCCESS. New version is actually {-bytes_percentage_increase:3.0f}% smaller in bytes ' - f'but {pixels_percentage_increase:3.0f}% larger in pixels. {post}') + logging.info(f'{pref}SUCCESS. 
New version is actually {-bytes_percentage_increase:3.0f}% ' + f'smaller in bytes but {pixels_percentage_increase:3.0f}% ' + f'larger in pixels. {post}') return True, byte_size_after else: - logging.info(f'{pref}SKIPPED. Online version has {-pixels_percentage_increase:3.0f}% smaller pixel size. {post}') + logging.info(f'{pref}SKIPPED. Online version has {-pixels_percentage_increase:3.0f}% ' + f'smaller pixel size. {post}') return True, byte_size_after else: logging.info(f'{pref}SKIPPED. Online version is same byte size, assuming same content. Not downloaded.') @@ -397,11 +408,13 @@ def download_larger_media(media_sources, paths): media_sources = retries remaining_tries -= 1 sleep_time += 2 - logging.info(f'\n{success_count} of {number_of_files} tested media files are known to be the best-quality available.\n') + logging.info(f'\n{success_count} of {number_of_files} tested media files ' + f'are known to be the best-quality available.\n') if len(retries) == 0: break if remaining_tries > 0: - print(f'----------------------\n\nRetrying the ones that failed, with a longer sleep. {remaining_tries} tries remaining.\n') + print(f'----------------------\n\nRetrying the ones that failed, with a longer sleep. ' + f'{remaining_tries} tries remaining.\n') end_time = time.time() logging.info(f'Total downloaded: {total_bytes_downloaded/2**20:.1f}MB = {total_bytes_downloaded/2**30:.2f}GB') @@ -1031,7 +1044,8 @@ def __init__(self, dir_archive, dir_output): # check if user is in correct folder if not os.path.isfile(self.file_account_js): print( - f'Error: Failed to load {self.file_account_js}. Start this script in the root folder of your Twitter archive.') + f'Error: Failed to load {self.file_account_js}. ' + f'Start this script in the root folder of your Twitter archive.') exit() self.dir_input_media = find_dir_input_media(self.dir_input_data) @@ -1051,7 +1065,6 @@ def main(): # Extract the archive owner's username from data/account.js username = extract_username(paths) - # URL config URL_template_user_id = 'https://twitter.com/i/user/{}' html_template = """\ @@ -1087,23 +1100,39 @@ def main(): print(f'found {len(follower_ids)} user IDs in followers.') dms_user_ids = collect_user_ids_from_direct_messages(paths) print(f'found {len(dms_user_ids)} user IDs in direct messages.') + group_dms_user_ids = collect_user_ids_from_group_direct_messages(paths) + print(f'found {len(group_dms_user_ids)} user IDs in group direct messages.') + + # bulk lookup for user handles from followers, followings, direct messages and group direct messages + collected_user_ids_without_followers = list( + set(following_ids).union(set(dms_user_ids)).union(set(group_dms_user_ids)) + ) + collected_user_ids_only_in_followers: set = set(follower_ids).difference(set(collected_user_ids_without_followers)) + collected_user_ids: list = list(set(collected_user_ids_without_followers).union(collected_user_ids_only_in_followers)) + + print(f'\nfound {len(collected_user_ids)} user IDs overall.') + + # give the user a choice if followers should be included in the lookup + # (but only in case they make up a large amount): + unknown_collected_user_ids: set = set(collected_user_ids).difference(users.keys()) + if len(unknown_collected_user_ids) > 10000: + unknown_follower_user_ids: set = unknown_collected_user_ids.intersection(collected_user_ids_only_in_followers) + if len(unknown_follower_user_ids) > 5000: + # Account metadata observed at ~2.1KB on average. 
+ estimated_follower_lookup_size = int(2.1 * len(unknown_follower_user_ids)) + user_input = input(f'{len(unknown_follower_user_ids)} of the {len(unknown_collected_user_ids)} ' + f'user IDs with unknown handles are from your followers. Online lookup would be ' + f'about {estimated_follower_lookup_size:,} KB smaller without them.\n' + f'Do you want to include handles of your followers ' + f'in the online lookup of user handles? [Y/n]') + if user_input in ['n', 'N', 'no', 'No']: + collected_user_ids = collected_user_ids_without_followers - # bulk lookup for user handles from followers, followings and direct messages - collected_user_ids = list(set(following_ids).union(set(follower_ids)).union(set(dms_user_ids))) lookup_users(collected_user_ids, users) parse_followings(users, URL_template_user_id, paths) parse_followers(users, URL_template_user_id, paths) parse_direct_messages(username, users, URL_template_user_id, paths) - - # find user ids to look up from group dms - group_dms_user_ids = collect_user_ids_from_group_direct_messages(paths) - # TODO: separate the collecting of user ids out of the other parse* functions in the same way - # and pool the lookups together before all of the other parsing & output generation - # look them up - lookup_users(group_dms_user_ids, users) - - # parse the content of group dms and write to output files parse_group_direct_messages(username, users, URL_template_user_id, paths) # Download larger images, if the user agrees @@ -1115,7 +1144,8 @@ def main(): user_input = input('\nOK to start downloading? [y/n]') if user_input.lower() in ('y', 'yes'): download_larger_media(media_sources, paths) - print('In case you set your account to public before initiating the download, do not forget to protect it again.') + print('In case you set your account to public before initiating the download, ' + 'do not forget to protect it again.') if __name__ == "__main__": From 7bb5bfe295ea25916f995650b3c2048f7b0402a2 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Sat, 26 Nov 2022 18:07:43 +0100 Subject: [PATCH 21/46] some more improved code formatting --- parser.py | 65 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/parser.py b/parser.py index cdd9a40..16813ca 100755 --- a/parser.py +++ b/parser.py @@ -147,7 +147,8 @@ def convert_tweet(tweet, username, media_sources, users, paths): if 'tweet' in tweet.keys(): tweet = tweet['tweet'] timestamp_str = tweet['created_at'] - timestamp = int(round(datetime.datetime.strptime(timestamp_str, '%a %b %d %X %z %Y').timestamp())) # Example: Tue Mar 19 14:05:17 +0000 2019 + timestamp = int(round(datetime.datetime.strptime(timestamp_str, '%a %b %d %X %z %Y').timestamp())) + # Example: Tue Mar 19 14:05:17 +0000 2019 body_markdown = tweet['full_text'] body_html = tweet['full_text'] tweet_id_str = tweet['id_str'] @@ -159,7 +160,8 @@ def convert_tweet(tweet, username, media_sources, users, paths): if url.scheme != '' and url.netloc != '' and not word.endswith('\u2026'): # Shorten links similiar to twitter netloc_short = url.netloc[4:] if url.netloc.startswith("www.") else url.netloc - path_short = url.path if len(url.path + '?' + url.query) < 15 else (url.path + '?' + url.query)[:15] + '\u2026' + path_short = url.path if len(url.path + '?' + url.query) < 15 \ + else (url.path + '?' 
+ url.query)[:15] + '\u2026' tweet['entities']['urls'].append({ 'url': word, 'expanded_url': word, @@ -220,7 +222,9 @@ def convert_tweet(tweet, username, media_sources, users, paths): html += f'' # Save the online location of the best-quality version of this file, for later upgrading if wanted best_quality_url = f'https://pbs.twimg.com/media/{original_filename}:orig' - media_sources.append((os.path.join(paths.dir_output_media, archive_media_filename), best_quality_url)) + media_sources.append( + (os.path.join(paths.dir_output_media, archive_media_filename), best_quality_url) + ) else: # Is there any other file that includes the tweet_id in its filename? archive_media_paths = glob.glob(os.path.join(paths.dir_input_media, tweet_id_str + '*')) @@ -247,12 +251,17 @@ def convert_tweet(tweet, username, media_sources, users, paths): best_quality_url = variant['url'] best_bitrate = bitrate if best_bitrate == -1: - print(f"Warning No URL found for {original_url} {original_expanded_url} {archive_media_path} {media_url}") + print(f"Warning No URL found for {original_url} {original_expanded_url} " + f"{archive_media_path} {media_url}") print(f"JSON: {tweet}") else: - media_sources.append((os.path.join(paths.dir_output_media, archive_media_filename), best_quality_url)) + media_sources.append( + (os.path.join(paths.dir_output_media, archive_media_filename), + best_quality_url) + ) else: - print(f'Warning: missing local file: {archive_media_path}. Using original link instead: {original_url} (expands to {original_expanded_url})') + print(f'Warning: missing local file: {archive_media_path}. Using original link instead: ' + f'{original_url} (expands to {original_expanded_url})') markdown += f'![]({original_url})' html += f'{original_url}' body_markdown = body_markdown.replace(original_url, markdown) @@ -330,7 +339,8 @@ def download_file_if_larger(url, filename, index, count, sleep_time): # Try to get content of response as `res.text`. # For twitter.com, this will be empty in most (all?) cases. # It is successfully tested with error responses from other domains. - raise Exception(f'Download failed with status "{res.status_code} {res.reason}". Response content: "{res.text}"') + raise Exception(f'Download failed with status "{res.status_code} {res.reason}". 
' + f'Response content: "{res.text}"') byte_size_after = int(res.headers['content-length']) if byte_size_after != byte_size_before: # Proceed with the full download @@ -399,7 +409,9 @@ def download_larger_media(media_sources, paths): success_count = 0 retries = [] for index, (local_media_path, media_url) in enumerate(media_sources): - success, bytes_downloaded = download_file_if_larger(media_url, local_media_path, index + 1, number_of_files, sleep_time) + success, bytes_downloaded = download_file_if_larger( + media_url, local_media_path, index + 1, number_of_files, sleep_time + ) if success: success_count += 1 else: @@ -441,7 +453,8 @@ def parse_tweets(username, users, html_template, paths): for timestamp, md, html in tweets: # Use a (markdown) filename that can be imported into Jekyll: YYYY-MM-DD-your-title-here.md dt = datetime.datetime.fromtimestamp(timestamp) - filename = f'{dt.year}-{dt.month:02}-01-Tweet-Archive-{dt.year}-{dt.month:02}' # change to group by day or year or timestamp + filename = f'{dt.year}-{dt.month:02}-01-Tweet-Archive-{dt.year}-{dt.month:02}' + # change the line above to group by day or year or timestamp grouped_tweets[filename].append((md, html)) for filename, content in grouped_tweets.items(): @@ -455,7 +468,8 @@ def parse_tweets(username, users, html_template, paths): with open(f'{filename}.html', 'w', encoding='utf-8') as f: f.write(html_template.format(html_string)) - print(f'Wrote {len(tweets)} tweets to *.md and *.html, with images and video embedded from {paths.dir_output_media}') + print(f'Wrote {len(tweets)} tweets to *.md and *.html, ' + f'with images and video embedded from {paths.dir_output_media}') return media_sources @@ -475,7 +489,7 @@ def collect_user_ids_from_followings(paths) -> list: return following_ids -def parse_followings(users, URL_template_user_id, paths): +def parse_followings(users, user_id_url_template, paths): """Parse paths.dir_input_data/following.js, write to paths.file_output_following. """ following = [] @@ -486,7 +500,7 @@ def parse_followings(users, URL_template_user_id, paths): following_ids.append(follow['following']['accountId']) for following_id in following_ids: handle = users[following_id].handle if following_id in users else '~unknown~handle~' - following.append(handle + ' ' + URL_template_user_id.format(following_id)) + following.append(handle + ' ' + user_id_url_template.format(following_id)) following.sort() with open(paths.file_output_following, 'w', encoding='utf8') as f: f.write('\n'.join(following)) @@ -508,7 +522,7 @@ def collect_user_ids_from_followers(paths) -> list: return follower_ids -def parse_followers(users, URL_template_user_id, paths): +def parse_followers(users, user_id_url_template, paths): """Parse paths.dir_input_data/followers.js, write to paths.file_output_followers. 
""" followers = [] @@ -519,7 +533,7 @@ def parse_followers(users, URL_template_user_id, paths): follower_ids.append(follower['follower']['accountId']) for follower_id in follower_ids: handle = users[follower_id].handle if follower_id in users else '~unknown~handle~' - followers.append(handle + ' ' + URL_template_user_id.format(follower_id)) + followers.append(handle + ' ' + user_id_url_template.format(follower_id)) followers.sort() with open(paths.file_output_followers, 'w', encoding='utf8') as f: f.write('\n'.join(followers)) @@ -551,7 +565,7 @@ def collect_user_ids_from_direct_messages(paths) -> list: return list(dms_user_ids) -def parse_direct_messages(username, users, URL_template_user_id, paths): +def parse_direct_messages(username, users, user_id_url_template, paths): """Parse paths.dir_input_data/direct-messages.js, write to one markdown file per conversation. """ # read JSON file @@ -641,9 +655,9 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): int(round(datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp())) from_handle = users[from_id].handle.replace('_', '\\_') if from_id in users \ - else URL_template_user_id.format(from_id) + else user_id_url_template.format(from_id) to_handle = users[to_id].handle.replace('_', '\\_') if to_id in users \ - else URL_template_user_id.format(to_id) + else user_id_url_template.format(to_id) message_markdown = f'\n\n### {from_handle} -> {to_handle}: ' \ f'({created_at}) ###\n```\n{body}\n```' @@ -663,7 +677,7 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): messages.sort(key=lambda tup: tup[0]) other_user_name = users[other_user_id].handle.replace('_', '\\_') if other_user_id in users \ - else URL_template_user_id.format(other_user_id) + else user_id_url_template.format(other_user_id) other_user_short_name: str = users[other_user_id].handle if other_user_id in users else other_user_id @@ -1065,7 +1079,7 @@ def main(): # Extract the archive owner's username from data/account.js username = extract_username(paths) - URL_template_user_id = 'https://twitter.com/i/user/{}' + user_id_url_template = 'https://twitter.com/i/user/{}' html_template = """\ @@ -1108,7 +1122,8 @@ def main(): set(following_ids).union(set(dms_user_ids)).union(set(group_dms_user_ids)) ) collected_user_ids_only_in_followers: set = set(follower_ids).difference(set(collected_user_ids_without_followers)) - collected_user_ids: list = list(set(collected_user_ids_without_followers).union(collected_user_ids_only_in_followers)) + collected_user_ids: list = list(set(collected_user_ids_without_followers) + .union(collected_user_ids_only_in_followers)) print(f'\nfound {len(collected_user_ids)} user IDs overall.') @@ -1125,15 +1140,15 @@ def main(): f'about {estimated_follower_lookup_size:,} KB smaller without them.\n' f'Do you want to include handles of your followers ' f'in the online lookup of user handles? 
[Y/n]') - if user_input in ['n', 'N', 'no', 'No']: + if user_input.lower() in ['n', 'no']: collected_user_ids = collected_user_ids_without_followers lookup_users(collected_user_ids, users) - parse_followings(users, URL_template_user_id, paths) - parse_followers(users, URL_template_user_id, paths) - parse_direct_messages(username, users, URL_template_user_id, paths) - parse_group_direct_messages(username, users, URL_template_user_id, paths) + parse_followings(users, user_id_url_template, paths) + parse_followers(users, user_id_url_template, paths) + parse_direct_messages(username, users, user_id_url_template, paths) + parse_group_direct_messages(username, users, user_id_url_template, paths) # Download larger images, if the user agrees print(f"\nThe archive doesn't contain the original-size images. We can attempt to download them from twimg.com.") From dc1917a2d4d57b98f3771dfab8da72f415ded9ea Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Sat, 26 Nov 2022 21:40:02 +0100 Subject: [PATCH 22/46] escape md control chars in DMs and format them as quotes instead of code blocks (similar to tweets md output) --- parser.py | 95 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/parser.py b/parser.py index 4c5508a..7372161 100755 --- a/parser.py +++ b/parser.py @@ -453,7 +453,7 @@ def parse_tweets(username, users, html_template, paths): for filename, content in grouped_tweets.items(): # Write into *.md files - md_string = '\n\n----\n\n'.join(md for md, _ in content) + md_string = '\n\n----\n\n'.join(md for md, _ in content) with open(f'{filename}.md', 'w', encoding='utf-8') as f: f.write(md_string) @@ -551,6 +551,8 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): if 'url' in url and 'expanded' in url: expanded_url = url['expanded'] body = body.replace(url['url'], expanded_url) + # escape message body for markdown rendering: + body_markdown = escape_markdown(body) # replace image URLs with image links to local files if 'mediaUrls' in message_create \ and len(message_create['mediaUrls']) == 1 \ @@ -568,7 +570,9 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): if not os.path.isfile(new_url): shutil.copy(archive_media_path, new_url) image_markdown = f'\n![]({new_url})\n' - body = body.replace(original_expanded_url, image_markdown) + body_markdown = body_markdown.replace( + escape_markdown(original_expanded_url), image_markdown + ) # Save the online location of the best-quality version of this file, # for later upgrading if wanted @@ -599,7 +603,9 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): shutil.copy(archive_media_path, media_url) video_markdown = f'\n\n' - body = body.replace(original_expanded_url, video_markdown) + body_markdown = body_markdown.replace( + escape_markdown(original_expanded_url), video_markdown + ) # TODO: maybe also save the online location of the best-quality version for videos? 
# (see above) @@ -612,13 +618,15 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): timestamp = \ int(round(datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp())) - from_handle = users[from_id].handle.replace('_', '\\_') if from_id in users \ + from_handle = escape_markdown(users[from_id].handle) if from_id in users \ else URL_template_user_id.format(from_id) - to_handle = users[to_id].handle.replace('_', '\\_') if to_id in users \ + to_handle = escape_markdown(users[to_id].handle) if to_id in users \ else URL_template_user_id.format(to_id) - message_markdown = f'\n\n### {from_handle} -> {to_handle}: ' \ - f'({created_at}) ###\n```\n{body}\n```' + # make the body a quote + body_markdown = '> ' + '\n> '.join(body_markdown.splitlines()) + message_markdown = f'{from_handle} -> {to_handle}: ({created_at}) \n\n' \ + f'{body_markdown}' messages.append((timestamp, message_markdown)) # find identifier for the conversation @@ -634,12 +642,12 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): # sort messages by timestamp messages.sort(key=lambda tup: tup[0]) - other_user_name = users[other_user_id].handle.replace('_', '\\_') if other_user_id in users \ + other_user_name = escape_markdown(users[other_user_id].handle) if other_user_id in users \ else URL_template_user_id.format(other_user_id) other_user_short_name: str = users[other_user_id].handle if other_user_id in users else other_user_id - escaped_username = username.replace('_', '\\_') + escaped_username = escape_markdown(username) # if there are more than 1000 messages, the conversation was split up in the twitter archive. # following this standard, also split up longer conversations in the output files: @@ -647,9 +655,9 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): if len(messages) > 1000: for chunk_index, chunk in enumerate(chunks(messages, 1000)): markdown = '' - markdown += f'## Conversation between {escaped_username} and {other_user_name}, ' \ - f'part {chunk_index+1}: ##\n' - markdown += ''.join(md for _, md in chunk) + markdown += f'### Conversation between {escaped_username} and {other_user_name}, ' \ + f'part {chunk_index+1}: ###\n\n----\n\n' + markdown += '\n\n----\n\n'.join(md for _, md in chunk) conversation_output_filename = \ paths.file_template_dm_output.format(f'{other_user_short_name}_part{chunk_index+1:03}') @@ -661,8 +669,8 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): else: markdown = '' - markdown += f'## Conversation between {escaped_username} and {other_user_name}: ##\n' - markdown += ''.join(md for _, md in messages) + markdown += f'### Conversation between {escaped_username} and {other_user_name}: ###\n\n----\n\n' + markdown += '\n\n----\n\n'.join(md for _, md in messages) conversation_output_filename = paths.file_template_dm_output.format(other_user_short_name) with open(conversation_output_filename, 'w', encoding='utf8') as f: @@ -778,6 +786,8 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths): if 'url' in url and 'expanded' in url: expanded_url = url['expanded'] body = body.replace(url['url'], expanded_url) + # escape message body for markdown rendering: + body_markdown = escape_markdown(body) # replace image URLs with image links to local files if 'mediaUrls' in message_create \ and len(message_create['mediaUrls']) == 1 \ @@ -796,7 +806,9 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths): if not os.path.isfile(new_url): 
shutil.copy(archive_media_path, new_url) image_markdown = f'\n![]({new_url})\n' - body = body.replace(original_expanded_url, image_markdown) + body_markdown = body_markdown.replace( + escape_markdown(original_expanded_url), image_markdown + ) # Save the online location of the best-quality version of this file, # for later upgrading if wanted @@ -829,7 +841,9 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths): shutil.copy(archive_media_path, media_url) video_markdown = f'\n\n' - body = body.replace(original_expanded_url, video_markdown) + body_markdown = body_markdown.replace( + escape_markdown(original_expanded_url), video_markdown + ) # TODO: maybe also save the online location of the best-quality version for videos? # (see above) @@ -841,22 +855,25 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths): timestamp = int(round( datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() )) - from_handle = users[from_id].handle.replace('_', '\\_') if from_id in users \ + from_handle = escape_markdown(users[from_id].handle) if from_id in users \ else user_id_url_template.format(from_id) - message_markdown = f'\n\n### {from_handle}: ({created_at}) ###\n```\n{body}\n```' + # make the body a quote + body_markdown = '> ' + '\n> '.join(body_markdown.splitlines()) + message_markdown = f'{from_handle}: ({created_at})\n\n' \ + f'{body_markdown}' messages.append((timestamp, message_markdown)) elif "conversationNameUpdate" in message: conversation_name_update = message['conversationNameUpdate'] if all(tag in conversation_name_update for tag in ['initiatingUserId', 'name', 'createdAt']): from_id = conversation_name_update['initiatingUserId'] - body = f"_changed group name to: {conversation_name_update['name']}_" + body_markdown = f"_changed group name to: {escape_markdown(conversation_name_update['name'])}_" created_at = conversation_name_update['createdAt'] # example: 2022-01-27T15:58:52.744Z timestamp = int(round( datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() )) - from_handle = users[from_id].handle.replace('_', '\\_') if from_id in users \ + from_handle = escape_markdown(users[from_id].handle) if from_id in users \ else user_id_url_template.format(from_id) - message_markdown = f'\n\n### {from_handle}: ({created_at}) ###\n\n{body}\n' + message_markdown = f'{from_handle}: ({created_at})\n\n{body_markdown}' messages.append((timestamp, message_markdown)) # save metadata about name change: group_conversations_metadata[conversation_id]['conversation_names'].append( @@ -870,11 +887,11 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths): timestamp = int(round( datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() )) - from_handle = users[from_id].handle.replace('_', '\\_') if from_id in users \ + from_handle = escape_markdown(users[from_id].handle) if from_id in users \ else user_id_url_template.format(from_id) - escaped_username = username.replace('_', '\\_') - body = f'_{from_handle} added {escaped_username} to the group_' - message_markdown = f'\n\n### {from_handle}: ({created_at}) ###\n\n{body}\n' + escaped_username = escape_markdown(username) + body_markdown = f'_{from_handle} added {escaped_username} to the group_' + message_markdown = f'{from_handle}: ({created_at})\n\n{body_markdown}' messages.append((timestamp, message_markdown)) elif "participantsJoin" in message: participants_join = message['participantsJoin'] @@ -884,16 +901,16 @@ def 
parse_group_direct_messages(username, users, user_id_url_template, paths): timestamp = int(round( datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() )) - from_handle = users[from_id].handle.replace('_', '\\_') if from_id in users \ + from_handle = escape_markdown(users[from_id].handle) if from_id in users \ else user_id_url_template.format(from_id) joined_ids = participants_join['userIds'] - joined_handles = [users[joined_id].handle.replace('_', '\\_') if joined_id in users + joined_handles = [escape_markdown(users[joined_id].handle) if joined_id in users else user_id_url_template.format(joined_id) for joined_id in joined_ids] name_list = ', '.join(joined_handles[:-1]) + \ (f' and {joined_handles[-1]}' if len(joined_handles) > 1 else joined_handles[0]) - body = f'_{from_handle} added {name_list} to the group_' - message_markdown = f'\n\n### {from_handle}: ({created_at}) ###\n\n{body}\n' + body_markdown = f'_{from_handle} added {name_list} to the group_' + message_markdown = f'{from_handle}: ({created_at})\n\n{body_markdown}' messages.append((timestamp, message_markdown)) elif "participantsLeave" in message: participants_leave = message['participantsLeave'] @@ -903,13 +920,13 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths): datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() )) left_ids = participants_leave['userIds'] - left_handles = [users[left_id].handle.replace('_', '\\_') if left_id in users + left_handles = [escape_markdown(users[left_id].handle) if left_id in users else user_id_url_template.format(left_id) for left_id in left_ids] name_list = ', '.join(left_handles[:-1]) + \ (f' and {left_handles[-1]}' if len(left_handles) > 1 else left_handles[0]) - body = f'_{name_list} left the group_' - message_markdown = f'\n\n### {name_list}: ({created_at}) ###\n\n{body}\n' + body_markdown = f'_{name_list} left the group_' + message_markdown = f'{name_list}: ({created_at})\n\n{body_markdown}' messages.append((timestamp, message_markdown)) # collect messages per conversation in group_conversations_messages dict @@ -966,7 +983,7 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths): # create a list of names of the form '@name1, @name2 and @name3' # to use as a headline in the output file escaped_participant_names = [ - participant_name.replace('_', '\\_') + escape_markdown(participant_name) for participant_name in group_conversations_metadata[conversation_id]['participant_names'] ] name_list = ', '.join(escaped_participant_names[:-1]) + \ @@ -977,9 +994,9 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths): if len(messages) > 1000: for chunk_index, chunk in enumerate(chunks(messages, 1000)): markdown = '' - markdown += f'# {official_name}\n' - markdown += f'## Group conversation between {name_list}, part {chunk_index + 1}: ##\n' - markdown += ''.join(md for _, md in chunk) + markdown += f'## {official_name} ##\n\n' + markdown += f'### Group conversation between {name_list}, part {chunk_index + 1}: ###\n\n----\n\n' + markdown += '\n\n----\n\n'.join(md for _, md in chunk) conversation_output_filename = \ paths.file_template_group_dm_output.format(f'{group_name}_part{chunk_index + 1:03}') @@ -990,9 +1007,9 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths): num_written_files += 1 else: markdown = '' - markdown += f'# {official_name}\n' - markdown += f'## Group conversation between {name_list}: ##\n' - markdown += ''.join(md for _, md in messages) 
+        markdown += f'## {official_name} ##\n\n'
+        markdown += f'### Group conversation between {name_list}: ###\n\n----\n\n'
+        markdown += '\n\n----\n\n'.join(md for _, md in messages)
 
         conversation_output_filename = paths.file_template_group_dm_output.format(group_name)
 
         with open(conversation_output_filename, 'w', encoding='utf8') as f:

From d41186ad447183f4892218187a5de87383f2b372 Mon Sep 17 00:00:00 2001
From: Lena Schimmel
Date: Sat, 26 Nov 2022 21:43:10 +0100
Subject: [PATCH 23/46] Move and/or remove output in the archive root, which was left there by previous versions of this script.

---
 parser.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/parser.py b/parser.py
index 63a605b..14aa69b 100755
--- a/parser.py
+++ b/parser.py
@@ -1079,6 +1079,62 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths):
             f"({num_written_messages} total messages) to {num_written_files} markdown files")
 
 
+def migrate_old_output(paths: PathConfig):
+    # Create new media folder, so we can potentially use it to move files there
+    os.makedirs(paths.dir_output_media, exist_ok=True)
+
+    # Move files that we can re-use:
+    if os.path.exists(os.path.join(paths.dir_archive, "media")):
+        files_to_move = glob.glob(os.path.join(paths.dir_archive, "media", "*"))
+        if len(files_to_move) > 0:
+            print(f"Moving {len(files_to_move)} files from 'media' to '{paths.dir_output_media}'")
+            for file_path_to_move in files_to_move:
+                file_name_to_move = os.path.split(file_path_to_move)[1]
+                print(file_name_to_move)
+                os.rename(file_path_to_move, os.path.join(paths.dir_output_media, file_name_to_move))
+        os.rmdir(os.path.join(paths.dir_archive, "media"))
+
+    known_tweets_old_path = os.path.join(paths.dir_archive, "known_tweets.json")
+    known_tweets_new_path = os.path.join(paths.dir_output_cache, "known_tweets.json")
+    if os.path.exists(known_tweets_old_path):
+        os.rename(known_tweets_old_path, known_tweets_new_path)
+
+    # Delete files that would be overwritten anyway (if user consents):
+    output_globs = [
+        "*Tweet-Archive*.html",
+        "*Tweet-Archive*.md",
+        "DMs-Archive-*.html",
+        "DMs-Archive-*.md",
+        "DMs-Group-Archive-*.html",
+        "DMs-Group-Archive-*.md",
+        "followers.txt",
+        "following.txt",
+    ]
+    files_to_delete = []
+
+    for output_glob in output_globs:
+        files_to_delete += glob.glob(os.path.join(paths.dir_archive, output_glob))
+
+    # TODO maybe remove those files only after the new ones have been generated? This way, the user would never
+    # end up with less output than before. On the other hand, they might end up with old *and* new versions
+    # of the output, if the script crashes before it reaches the code to delete the old version.
+    if len(files_to_delete) > 0:
+        print(f"\nThere are {len(files_to_delete)} files in the root of the archive,")
+        print("which were probably generated from an older version of this script.")
+        print("Since then, the directory layout of twitter-archive-parser has changed")
+        print("and these files are generated into the sub-directory 'parser-output' or")
+        print("various sub-sub-directories therein. These are the affected files:")
+
+        for file_to_delete in files_to_delete:
+            print(file_to_delete)
+
+        user_input = input('\nOK to delete these files? (If the directory layout had not changed, they would be overwritten anyway) [y/N]')
+        if user_input.lower() in ('y', 'yes'):
+            for file_to_delete in files_to_delete:
+                os.remove(file_to_delete)
+            print(f"Files have been deleted. 
New versions of these files will be generated into 'parser-output' soon.") + + def main(): paths = PathConfig(dir_archive='.') @@ -1108,6 +1164,8 @@ def main(): users = {} + migrate_old_output(paths) + # Make a folder to copy the images and videos into. os.makedirs(paths.dir_output_media, exist_ok=True) if not os.path.isfile(paths.file_tweet_icon): From e3207b861fd24616d7c811f6183c6b58a3746e43 Mon Sep 17 00:00:00 2001 From: Lena Schimmel Date: Sat, 26 Nov 2022 22:12:58 +0100 Subject: [PATCH 24/46] Add method `get_consent` which ensures that yes/no questions are displayed consistently and unexpected inputs are handled more robustly. --- parser.py | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/parser.py b/parser.py index 4c5508a..42bf6a0 100755 --- a/parser.py +++ b/parser.py @@ -46,14 +46,38 @@ def __init__(self, id, handle = None): self.handle = handle +def get_consent(prompt: str, default_to_yes: bool = False): + """Asks the user for consent, using the given prompt. Accepts various versions of yes/no, or + an empty answer to accept the default. The default is 'no' unless default_to_yes is passed as + True. The default will be indicated automatically. For unacceptable answers, the user will + be asked again.""" + if default_to_yes: + suffix = " [Y/n]" + default_answer = "yes" + else: + suffix = " [y/N]" + default_answer = "no" + while True: + user_input = input(prompt + suffix) + if user_input == "": + print (f"Your empty response was assumed to mean '{default_answer}' (the default for this question).") + return default_to_yes + if user_input.lower() in ('y', 'yes'): + return True + if user_input.lower() in ('n', 'no'): + return False + print (f"Sorry, did not understand. Please answer with y, n, yes, no, or press enter to accept " + f"the default (which is '{default_answer}' in this case, as indicated by the uppercase " + f"'{default_answer.upper()[0]}'.)") + + def import_module(module): """Imports a module specified by a string. Example: requests = import_module('requests')""" try: return importlib.import_module(module) except ImportError: print(f'\nError: This script uses the "{module}" module which is not installed.\n') - user_input = input('OK to install using pip? [y/n]') - if not user_input.lower() in ('y', 'yes'): + if not get_consent('OK to install using pip?'): exit() subprocess.run([sys.executable, '-m', 'pip', 'install', module], check=True) return importlib.import_module(module) @@ -102,9 +126,9 @@ def lookup_users(user_ids, users): # Account metadata observed at ~2.1KB on average. estimated_size = int(2.1 * len(filtered_user_ids)) print(f'{len(filtered_user_ids)} users are unknown.') - user_input = input(f'Download user data from Twitter (approx {estimated_size:,}KB)? [y/n]') - if user_input.lower() not in ('y', 'yes'): + if not get_consent(f'Download user data from Twitter (approx {estimated_size:,}KB)?'): return + requests = import_module('requests') try: with requests.Session() as session: @@ -1084,9 +1108,9 @@ def main(): print(f'Please be aware that this script may download a lot of data, which will cost you money if you are') print(f'paying for bandwidth. Please be aware that the servers might block these requests if they are too') print(f'frequent. This script may not work if your account is protected. You may want to set it to public') - print(f'before starting the download.') - user_input = input('\nOK to start downloading? 
[y/n]')
-    if user_input.lower() in ('y', 'yes'):
+    print(f'before starting the download.\n')
+
+    if get_consent('OK to start downloading?'):
         download_larger_media(media_sources, paths)
         print('In case you set your account to public before initiating the download, do not forget to protect it again.')

From 94b1d506c5d4b0ff94f8f058ed66c714b2bd645b Mon Sep 17 00:00:00 2001
From: Kirstin Rohwer
Date: Sat, 26 Nov 2022 22:25:29 +0100
Subject: [PATCH 25/46] simplified check and clarified text for prompting the optional exclusion of a large number of follower handles from the lookup

---
 parser.py | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/parser.py b/parser.py
index 8d2f180..f9fcd67 100644
--- a/parser.py
+++ b/parser.py
@@ -1151,18 +1151,24 @@ def main():
     # give the user a choice if followers should be included in the lookup
     # (but only in case they make up a large amount):
     unknown_collected_user_ids: set = set(collected_user_ids).difference(users.keys())
-    if len(unknown_collected_user_ids) > 10000:
-        unknown_follower_user_ids: set = unknown_collected_user_ids.intersection(collected_user_ids_only_in_followers)
-        if len(unknown_follower_user_ids) > 5000:
-            # Account metadata observed at ~2.1KB on average.
-            estimated_follower_lookup_size = int(2.1 * len(unknown_follower_user_ids))
-            user_input = input(f'{len(unknown_follower_user_ids)} of the {len(unknown_collected_user_ids)} '
-                               f'user IDs with unknown handles are from your followers. Online lookup would be '
-                               f'about {estimated_follower_lookup_size:,} KB smaller without them.\n'
-                               f'Do you want to include handles of your followers '
-                               f'in the online lookup of user handles? [Y/n]')
-            if user_input.lower() in ['n', 'no']:
-                collected_user_ids = collected_user_ids_without_followers
+    unknown_follower_user_ids: set = unknown_collected_user_ids.intersection(collected_user_ids_only_in_followers)
+    if len(unknown_follower_user_ids) > 5000:
+        # Account metadata observed at ~2.1KB on average.
+        estimated_follower_lookup_size = int(2.1 * len(unknown_follower_user_ids))
+        # we can look up at least 3000 users per minute.
+        estimated_max_follower_lookup_time_in_minutes = len(unknown_follower_user_ids) / 3000
+        print(
+            f'For some user IDs, the @handle is not included in the archive data. '
+            f'Unknown user handles can be looked up online. '
+            f'{len(unknown_follower_user_ids)} of {len(unknown_collected_user_ids)} total '
+            f'user IDs with unknown handles are from your followers. Online lookup would be '
+            f'about {estimated_follower_lookup_size:,} KB smaller and up to '
+            f'{estimated_max_follower_lookup_time_in_minutes:.1f} minutes faster without them.\n'
+        )
+        user_input = input(f'Do you want to include handles of your followers '
+                           f'in the online lookup of user handles anyway? [Y/n]')
+        if user_input.lower() in ['n', 'no']:
+            collected_user_ids = collected_user_ids_without_followers
 
     lookup_users(collected_user_ids, users)
 
From 67baaddbf594bd57708d04beef1ce6c604b36126 Mon Sep 17 00:00:00 2001
From: Lena Schimmel
Date: Sat, 26 Nov 2022 22:39:40 +0100
Subject: [PATCH 26/46] Minor cleanup.

---
 parser.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/parser.py b/parser.py
index 14aa69b..fc027e1 100755
--- a/parser.py
+++ b/parser.py
@@ -45,6 +45,7 @@ def __init__(self, id, handle = None):
         self.id = id
         self.handle = handle
 
+
 class PathConfig:
     """
     Helper class containing constants for various directories and files. 
@@ -185,6 +186,7 @@ def lookup_users(user_ids, users):
     except Exception as err:
         print(f'Failed to download user data: {err}')
 
+
 def read_json_from_js_file(filename):
     """Reads the contents of a Twitter-produced .js file into a dictionary."""
     print(f'Parsing {filename}...')
@@ -1080,6 +1082,10 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths):
 
 
 def migrate_old_output(paths: PathConfig):
+    """If present, moves media and cache files from the archive root to the new locations in
+    `paths.dir_output_media` and `paths.dir_output_cache`. Then deletes old output files
+    (md, html, txt) from the archive root, if the user consents."""
+
     # Create new media folder, so we can potentially use it to move files there
     os.makedirs(paths.dir_output_media, exist_ok=True)
 
@@ -1171,8 +1177,6 @@ def main():
     if not os.path.isfile(paths.file_tweet_icon):
         shutil.copy('assets/images/favicon.ico', paths.file_tweet_icon)
 
-    # TODO move files from older top-level folders, if they have been written by an older version of this script
-
     media_sources = parse_tweets(username, users, html_template, paths)
     parse_followings(users, URL_template_user_id, paths)
     parse_followers(users, URL_template_user_id, paths)

From 7cc17a08bb8aa0d9477d0a7c38b0c1813b075937 Mon Sep 17 00:00:00 2001
From: Kirstin Rohwer
Date: Sat, 26 Nov 2022 22:46:01 +0100
Subject: [PATCH 27/46] catch ValueError from urlparse when looking for links in old tweets

---
 parser.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/parser.py b/parser.py
index 4c5508a..50c90e6 100755
--- a/parser.py
+++ b/parser.py
@@ -172,17 +172,21 @@ def convert_tweet(tweet, username, media_sources, users, paths):
     # added to the urls entities list so that we can build correct links later on.
     if 'entities' in tweet and 'media' not in tweet['entities'] and len(tweet['entities'].get("urls", [])) == 0:
         for word in tweet['full_text'].split():
-            url = urlparse(word)
-            if url.scheme != '' and url.netloc != '' and not word.endswith('\u2026'):
-                # Shorten links similiar to twitter
-                netloc_short = url.netloc[4:] if url.netloc.startswith("www.") else url.netloc
-                path_short = url.path if len(url.path + '?' + url.query) < 15 else (url.path + '?' + url.query)[:15] + '\u2026'
-                tweet['entities']['urls'].append({
-                    'url': word,
-                    'expanded_url': word,
-                    'display_url': netloc_short + path_short,
-                    'indices': [tweet['full_text'].index(word), tweet['full_text'].index(word) + len(word)],
-                })
+            try:
+                url = urlparse(word)
+            except ValueError:
+                pass # don't crash when trying to parse something that looks like a URL but actually isn't
+            else:
+                if url.scheme != '' and url.netloc != '' and not word.endswith('\u2026'):
+                    # Shorten links similar to twitter
+                    netloc_short = url.netloc[4:] if url.netloc.startswith("www.") else url.netloc
+                    path_short = url.path if len(url.path + '?' 
+ url.query)[:15] + '\u2026' + tweet['entities']['urls'].append({ + 'url': word, + 'expanded_url': word, + 'display_url': netloc_short + path_short, + 'indices': [tweet['full_text'].index(word), tweet['full_text'].index(word) + len(word)], + }) # replace t.co URLs with their original versions if 'entities' in tweet and 'urls' in tweet['entities']: for url in tweet['entities']['urls']: From aa52c936854381e1bfd9eb37f4bf715fd24f8bef Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Sun, 27 Nov 2022 16:45:20 +0100 Subject: [PATCH 28/46] output of % done and estimated remaining time while trying to download larger media --- parser.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/parser.py b/parser.py index 73b7378..a4a3972 100644 --- a/parser.py +++ b/parser.py @@ -442,6 +442,32 @@ def download_larger_media(media_sources, paths): else: retries.append((local_media_path, media_url)) total_bytes_downloaded += bytes_downloaded + + # show % done and estimated remaining time: + time_elapsed: float = time.time() - start_time + estimated_time_per_file: float = time_elapsed / (index + 1) + estimated_time_remaining: datetime.datetime = \ + datetime.datetime.fromtimestamp( + (number_of_files - (index + 1)) * estimated_time_per_file, + tz=datetime.timezone.utc + ) + if estimated_time_remaining.hour >= 1: + time_remaining_string: str = \ + f"{estimated_time_remaining.hour} hour{'' if estimated_time_remaining.hour == 1 else 's'} " \ + f"{estimated_time_remaining.minute} minute{'' if estimated_time_remaining.minute == 1 else 's'}" + elif estimated_time_remaining.minute >= 1: + time_remaining_string: str = \ + f"{estimated_time_remaining.minute} minute{'' if estimated_time_remaining.minute == 1 else 's'} " \ + f"{estimated_time_remaining.second} second{'' if estimated_time_remaining.second == 1 else 's'}" + else: + time_remaining_string: str = \ + f"{estimated_time_remaining.second} second{'' if estimated_time_remaining.second == 1 else 's'}" + + if index + 1 == number_of_files: + print(' 100 % done.') + else: + print(f' {(100*(index+1)/number_of_files):.1f} % done, about {time_remaining_string} remaining...') + media_sources = retries remaining_tries -= 1 sleep_time += 2 From 3daec9632a603f363042d3df9b0820d496cfb542 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Sun, 27 Nov 2022 17:30:23 +0100 Subject: [PATCH 29/46] Update README.md to represent the current state of DM parsing --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c0ffa6b..6dfe927 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Our script does the following: - Replaces t.co URLs with their original versions (the ones that can be found in the archive). - Copies used images to an output folder, to allow them to be moved to a new home. - Will query Twitter for the missing user handles (checks with you first). -- Converts DMs to markdown, including the handles that we retrieved. Basic functionality for now (no embedded images), pending improvements. +- Converts DMs (including group DMs) to markdown, including the handles that we retrieved. - Outputs lists of followers and following. - Downloads the original size images (checks with you first). From 638f33be4a9b22288fa6292dd1cf41b730ff86b3 Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Sun, 27 Nov 2022 17:31:02 +0100 Subject: [PATCH 30/46] Update README.md with more explanation of how to use a command prompt. 
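
For readers new to the command line, the two steps amount to something like
this (the path is an example, not a literal value):

    cd C:\path\to\your\unzipped\archive
    python parser.py
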
--- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6dfe927..17509b9 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,12 @@ 1. [Download your Twitter archive](https://twitter.com/settings/download_your_data) (Settings > Your account > Download an archive of your data). 2. Unzip to a folder. 3. Right-click this link --> [parser.py](https://raw.githubusercontent.com/timhutton/twitter-archive-parser/main/parser.py) <-- and select "Save Link as", and save into the folder where you extracted the archive. (Or use wget or curl on that link. Or clone the git repo.) -4. Run parser.py with [Python 3](https://realpython.com/installing-python/). e.g. `python parser.py` from a command prompt opened in that folder. +4. Open a command prompt and change directory into the unzipped folder where you just saved parser.py. + (**Here's how to do that on Windows:** Hold shift while right-clicking in the folder. Click on `Open PowerShell`.) +5. Run parser.py with [Python 3](https://realpython.com/installing-python/). e.g. `python parser.py`. + (**On Windows:** When the command window opens, paste or enter `python parser.py` at the command prompt.) + + If you are having problems please check the [issues list](https://github.com/timhutton/twitter-archive-parser/issues?q=is%3Aissue) to see if it has happened before, and open a new issue otherwise. From d616c1fead9ea8cfdc4a5949855f424bacb80b6a Mon Sep 17 00:00:00 2001 From: Kirstin Rohwer Date: Sun, 27 Nov 2022 17:37:33 +0100 Subject: [PATCH 31/46] Update README.md with more info about current DMs parsing functionality. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 17509b9..4a781bb 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Our script does the following: - Replaces t.co URLs with their original versions (the ones that can be found in the archive). - Copies used images to an output folder, to allow them to be moved to a new home. - Will query Twitter for the missing user handles (checks with you first). -- Converts DMs (including group DMs) to markdown, including the handles that we retrieved. +- Converts DMs (including group DMs) to markdown with embedded media and links, including the handles that we retrieved. - Outputs lists of followers and following. - Downloads the original size images (checks with you first). From 199ca9fbbc1e50e4ef4a75fafd3120a6158a90bc Mon Sep 17 00:00:00 2001 From: Lena Schimmel Date: Sun, 27 Nov 2022 20:15:33 +0100 Subject: [PATCH 32/46] Fix multiple bugs which prevented media downloading and/or resulted in multiple downloads of the same media. In particular, this is done through enhanced merging and comparing between tweets and their sub-objects, and by storing media_sources in a dict instead of a list. --- parser.py | 84 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 67 insertions(+), 17 deletions(-) diff --git a/parser.py b/parser.py index 87707c9..21871ac 100755 --- a/parser.py +++ b/parser.py @@ -30,6 +30,7 @@ import sys import time import traceback +from typing import List # hot-loaded if needed, see import_module(): # imagesize # requests @@ -94,7 +95,7 @@ def get_twitter_users(session, bearer_token, guest_token, user_ids): return users def get_tweets(session, bearer_token, guest_token, tweet_ids, include_user=True, include_alt_text=True): - """ Get the json metadata for a multiple tweets. + """Get the json metadata for multiple tweets. 
If include_user is False, you will only get a numerical id for the user.
    Returns `tweets, remaining_tweet_ids`. If all goes well, `tweets` will contain all
    tweets, and `remaining_tweet_ids` is empty. If something goes wrong, downloading is stopped
@@ -186,7 +187,7 @@ def collect_tweet_id(tweet):
         tweet = tweet['tweet']
     return tweet['id_str']
 
-# returns an it if you give it either an int or a str that can be parsed as
+# returns an int if you give it either an int or a str that can be parsed as
 # an int. Otherwise, returns None.
 def parse_as_number(str_or_number):
     if isinstance(str_or_number, str):
@@ -198,17 +199,59 @@ def parse_as_number(str_or_number):
         return str_or_number
     else:
         return None
-    
+
+def equal_ignore_types(a, b):
+    """Recognizes two values as equal even if one is a str and the other is a number
+    with identical content, or if both are lists or both are dicts whose nested
+    values are all equal_ignore_types."""
+    if a == b:
+        return True
+    if parse_as_number(a) is not None and parse_as_number(b) is not None:
+        return parse_as_number(a) == parse_as_number(b)
+    if isinstance(a, dict) and isinstance(b, dict):
+        if len(a) != len(b):
+            return False
+        for key in a.keys():
+            if not equal_ignore_types(a[key], b[key]):
+                return False
+        return True
+    if isinstance(a, list) and isinstance(b, list):
+        if len(a) != len(b):
+            return False
+        for i in range(len(a)):
+            if not equal_ignore_types(a[i], b[i]):
+                return False
+        return True
+    return False
+
+def merge_lists(a: list, b: list, ignore_types: bool = False):
+    """Adds all items from b to a which are not already in a. If you pass ignore_types=True,
+    it uses equal_ignore_types internally, and also recognizes two list items as equal if
+    they both are dicts with equal id_str values, merging those dicts instead of adding
+    both separately to the result. Modifies a and returns a."""
+    for item_b in b:
+        found_in_a = False
+        if ignore_types:
+            for item_a in a:
+                if equal_ignore_types(item_a, item_b):
+                    found_in_a = True
+                    break
+                if isinstance(item_a, dict) and isinstance(item_b, dict) and has_path(item_a, ['id_str']) and has_path(item_b, ['id_str']) and item_a['id_str'] == item_b['id_str']:
+                    # b's version is merged into a's, so treat item_b as found
+                    found_in_a = True
+                    merge_dicts(item_a, item_b)
+                    break
+        else:
+            found_in_a = item_b in a
+
+        if not found_in_a:
+            a.append(item_b)
+    return a
+
 # Taken from https://stackoverflow.com/a/7205107/39946, then adapted to
 # some commonly observed twitter specifics. 
-def merge(a, b, path=None): +def merge_dicts(a, b, path=None): "merges b into a" if path is None: path = [] for key in b: if key in a: if isinstance(a[key], dict) and isinstance(b[key], dict): - merge(a[key], b[key], path + [str(key)]) + merge_dicts(a[key], b[key], path + [str(key)]) + elif isinstance(a[key], list) and isinstance(b[key], list): + merge_lists(a[key], b[key], ignore_types=True) elif a[key] == b[key]: pass # same leaf value elif key == 'retweet_count' or key == 'favorite_count': @@ -245,8 +288,9 @@ def add_known_tweet(known_tweets, new_tweet): #print(f"Tweet {tweet_id} was already known with identical contents") else: try: - merge(known_tweets[tweet_id], new_tweet) + merge_dicts(known_tweets[tweet_id], new_tweet) except Exception as err: + print(traceback.format_exc()) print(f"Tweet {tweet_id} could not be merged: {err}") else: @@ -304,8 +348,8 @@ def collect_tweet_references(tweet, known_tweets, counts): return tweet_ids -# Walks a path through nested dicts or lists, and returns True if all the keys are present, and all of the values are not None -def has_path(dict, index_path): +def has_path(dict, index_path: List[str]): + """Walks a path through nested dicts or lists, and returns True if all the keys are present, and all of the values are not None.""" for index in index_path: if not index in dict: return False @@ -315,7 +359,7 @@ def has_path(dict, index_path): return True def convert_tweet(tweet, username, archive_media_folder, output_media_folder_name, - tweet_icon_path, media_sources, users, referenced_tweets): + tweet_icon_path, media_sources: dict, users, referenced_tweets): """Converts a JSON-format tweet. Returns tuple of timestamp, markdown and HTML.""" # TODO actually use `referenced_tweets` tweet = unwrap_tweet(tweet) @@ -354,7 +398,9 @@ def convert_tweet(tweet, username, archive_media_folder, output_media_folder_nam header_markdown += f'Replying to [{name_list}]({replying_to_url})\n\n' header_html += f'Replying to {name_list}
' # replace image URLs with image links to local files - if has_path(tweet, ['entities', 'media', 0, 'url']) and has_path(tweet, ['extended_entities', 'media']): + if has_path(tweet, ['entities', 'media']) and has_path(tweet, ['extended_entities', 'media']) \ + and len(tweet['entities']['media']) > 0 and 'url' in tweet['entities']['media'][0]: + original_url = tweet['entities']['media'][0]['url'] markdown = '' html = '' @@ -367,6 +413,7 @@ def convert_tweet(tweet, username, archive_media_folder, output_media_folder_nam new_url = output_media_folder_name + archive_media_filename markdown += '' if not markdown and body_markdown == original_url else '\n\n' html += '' if not html and body_html == original_url else '
'
+                    # if file exists, this means that file is probably an image (not a video)
                     if os.path.isfile(archive_media_path):
                         # Found a matching image, use this one
                         if not os.path.isfile(new_url):
                             shutil.copy(archive_media_path, new_url)
                         markdown += f''
                         html += f''
                         # Save the online location of the best-quality version of this file, for later upgrading if wanted
                         best_quality_url = f'https://pbs.twimg.com/media/{original_filename}:orig'
-                        media_sources.append((os.path.join(output_media_folder_name, archive_media_filename), best_quality_url))
+                        media_sources[os.path.join(output_media_folder_name, archive_media_filename)] = best_quality_url
                     else:
+                        # If the file does not exist, it might be a video. Then its filename might
+                        # be found like this:
                         # Is there any other file that includes the tweet_id in its filename?
                         archive_media_paths = glob.glob(os.path.join(archive_media_folder, tweet_id_str + '*'))
                         if len(archive_media_paths) > 0:
@@ -401,7 +450,7 @@ def convert_tweet(tweet, username, archive_media_folder, output_media_folder_nam
                             print(f"Warning No URL found for {original_url} {original_expanded_url} {archive_media_path} {media_url}")
                             print(f"JSON: {tweet}")
                         else:
-                            media_sources.append((os.path.join(output_media_folder_name, archive_media_filename), best_quality_url))
+                            media_sources[os.path.join(output_media_folder_name, archive_media_filename)] = best_quality_url
                 else:
                     print(f'Warning: missing local file: {archive_media_path}. Using original link instead: {original_url} (expands to {original_expanded_url})')
                     markdown += f'![]({original_url})'
                     html += f'{original_url}'
@@ -521,7 +570,7 @@ def download_file_if_larger(url, filename, index, count, sleep_time):
         return False, 0
 
 
-def download_larger_media(media_sources, log_path):
+def download_larger_media(media_sources: dict, log_path):
     """Uses (filename, URL) tuples in media_sources to download files from remote storage.
     Aborts downloads if the remote file is the same size or smaller than the existing local version.
     Retries the failed downloads several times, with increasing pauses between each to avoid being blocked.
@@ -540,7 +589,7 @@ def download_larger_media(media_sources, log_path):
     number_of_files = len(media_sources)
     success_count = 0
     retries = []
-    for index, (local_media_path, media_url) in enumerate(media_sources):
+    for index, (local_media_path, media_url) in enumerate(media_sources.items()):
         success, bytes_downloaded = download_file_if_larger(media_url, local_media_path, index + 1, number_of_files, sleep_time)
         if success:
             success_count += 1
@@ -563,14 +612,14 @@ def download_larger_media(media_sources, log_path):
 
 
 def parse_tweets(input_filenames, username, users, html_template, archive_media_folder,
-                 output_media_folder_name, tweet_icon_path, output_html_filename):
+                 output_media_folder_name, tweet_icon_path, output_html_filename) -> dict:
    """Read tweets from input_filenames, write to *.md and output_html_filename.
    Copy the media used to output_media_folder_name.
    Collect user_id:user_handle mappings for later use, in 'users'.
    Returns the mapping from media filename to best-quality URL. 
""" converted_tweets = [] - media_sources = [] + media_sources = {} counts = defaultdict(int) known_tweets = {} @@ -601,7 +650,8 @@ def parse_tweets(input_filenames, username, users, html_template, archive_media_ for tweet in known_tweets.values(): tweet_ids_to_download.update(collect_tweet_references(tweet, known_tweets, counts)) - # Download referenced tweets + # (Maybe) download referenced tweets + # TODO ask user for consent to download referenced_tweets = [] if (len(tweet_ids_to_download) > 0): print(f"Found references to {len(tweet_ids_to_download)} tweets which should be downloaded. Breakdown of download reasons:") From 9d75a992c250936645df4f8ae0e88e659635f8da Mon Sep 17 00:00:00 2001 From: Lena Schimmel Date: Sun, 27 Nov 2022 20:20:39 +0100 Subject: [PATCH 33/46] index on (no branch): 199ca9f Fix multiple bugs which prevented media downloading and/or resulted in multiple downloads of the same media. From 911c46c2ba370011d5c3e7c795f97776caea3a28 Mon Sep 17 00:00:00 2001 From: Lena Schimmel Date: Sun, 27 Nov 2022 20:36:49 +0100 Subject: [PATCH 34/46] Use get_consent for user handle download. --- parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parser.py b/parser.py index cfdbadb..4e7463e 100644 --- a/parser.py +++ b/parser.py @@ -1236,9 +1236,9 @@ def main(): f'about {estimated_follower_lookup_size:,} KB smaller and up to ' f'{estimated_max_follower_lookup_time_in_minutes:.1f} minutes faster without them.\n' ) - user_input = input(f'Do you want to include handles of your followers ' - f'in the online lookup of user handles anyway? [Y/n]') - if user_input.lower() in ['n', 'no']: + + if not get_consent(f'Do you want to include handles of your followers ' + f'in the online lookup of user handles anyway?', default_to_yes=True): collected_user_ids = collected_user_ids_without_followers lookup_users(collected_user_ids, users) From 4e907d5a9d793ffd46db77fa1fd920ca73cfba33 Mon Sep 17 00:00:00 2001 From: Lena Schimmel Date: Sun, 27 Nov 2022 21:05:23 +0100 Subject: [PATCH 35/46] Use create_path_for_file_output_dms for group DMs, remove unused file_template vars. 
--- parser.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/parser.py b/parser.py index c5a2288..ed38811 100644 --- a/parser.py +++ b/parser.py @@ -18,6 +18,7 @@ """ from collections import defaultdict +from typing import Optional from urllib.parse import urlparse import datetime import glob @@ -74,8 +75,6 @@ def __init__(self, dir_archive): self.dir_output_cache = os.path.join(self.dir_archive, 'parser-cache') self.file_output_following = os.path.join(self.dir_output, 'following.txt') self.file_output_followers = os.path.join(self.dir_output, 'followers.txt') - self.file_template_dm_output = os.path.join(self.dir_output, 'DMs-Archive-{}.md') - self.file_template_group_dm_output = os.path.join(self.dir_output, 'DMs-Group-Archive-{}.md') self.file_download_log = os.path.join(self.dir_output_media, 'download_log.txt') self.file_tweet_icon = os.path.join(self.dir_output_media, 'tweet.ico') self.files_input_tweets = find_files_input_tweets(self.dir_input_data) @@ -88,7 +87,7 @@ def create_path_for_file_output_tweets(self, year, month, format="html", kind="t # Previously the filename was f'{dt.year}-{dt.month:02}-01-Tweet-Archive-{dt.year}-{dt.month:02}' return os.path.join(self.dir_output, f"{kind}-{format}", f"{year:04}", f"{year:04}-{month:02}-01-{kind}.{format}") - def create_path_for_file_output_dms(self, name, index=None, format="html", kind="DMs") -> str: + def create_path_for_file_output_dms(self, name: str, index: Optional[int]=None, format: str="html", kind: str="DMs") -> str: """Builds the path for a dm-archive file based on some properties.""" index_suffix = "" if (index): @@ -1189,11 +1188,12 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths): markdown += f'## {official_name} ##\n\n' markdown += f'### Group conversation between {name_list}, part {chunk_index + 1}: ###\n\n----\n\n' markdown += '\n\n----\n\n'.join(md for _, md in chunk) - conversation_output_filename = \ - paths.file_template_group_dm_output.format(f'{group_name}_part{chunk_index + 1:03}') - + conversation_output_filename = paths.create_path_for_file_output_dms( + name=group_name, format="md", kind="DMs-Group", index=chunk_index + 1 + ) + # write part to a markdown file - with open(conversation_output_filename, 'w', encoding='utf8') as f: + with open_and_mkdirs(conversation_output_filename) as f: f.write(markdown) print(f'Wrote {len(chunk)} messages to {conversation_output_filename}') num_written_files += 1 @@ -1202,9 +1202,10 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths): markdown += f'## {official_name} ##\n\n' markdown += f'### Group conversation between {name_list}: ###\n\n----\n\n' markdown += '\n\n----\n\n'.join(md for _, md in messages) - conversation_output_filename = paths.file_template_group_dm_output.format(group_name) + conversation_output_filename = \ + paths.create_path_for_file_output_dms(name=group_name, format="md", kind="DMs-Group") - with open(conversation_output_filename, 'w', encoding='utf8') as f: + with open_and_mkdirs(conversation_output_filename) as f: f.write(markdown) print(f'Wrote {len(messages)} messages to {conversation_output_filename}') num_written_files += 1 From b65d66c3d4083c3337ef71d564b3e58dd31d6d21 Mon Sep 17 00:00:00 2001 From: Lena Schimmel Date: Sun, 27 Nov 2022 21:17:41 +0100 Subject: [PATCH 36/46] Use get_consent in migrate_old_output, skip question about downloading images if there are none, show number of images otherwise. 
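
The intended control flow, as a sketch (get_consent is the helper from PATCH
24; note that the prompt in the hunk below is still missing its f-string
prefix and still passes the stale log_path argument, both of which PATCH 37
corrects):

    if len(media_sources) > 0:
        # ...explain bandwidth cost and blocking caveats, then ask once...
        if get_consent(f'OK to start downloading {len(media_sources)} media files?'):
            download_larger_media(media_sources, paths)
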
---
 parser.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/parser.py b/parser.py
index f36d4d4..e1b3693 100644
--- a/parser.py
+++ b/parser.py
@@ -1550,13 +1550,13 @@ def migrate_old_output(paths: PathConfig):
         print("which were probably generated from an older version of this script.")
         print("Since then, the directory layout of twitter-archive-parser has changed")
         print("and these files are generated into the sub-directory 'parser-output' or")
-        print("various sub-sub-directories therein. These are the affected files:")
+        print("various sub-sub-directories therein. These are the affected files:\n")
 
         for file_to_delete in files_to_delete:
             print(file_to_delete)
 
-        user_input = input('\nOK to delete these files? (If the directory layout had not changed, they would be overwritten anyway) [y/N]')
-        if user_input.lower() in ('y', 'yes'):
+        print()
+        if get_consent('OK to delete these files? (If the directory layout had not changed, they would be overwritten anyway)'):
             for file_to_delete in files_to_delete:
                 os.remove(file_to_delete)
             print(f"Files have been deleted. New versions of these files will be generated into 'parser-output' soon.")
@@ -1648,16 +1648,17 @@ def main():
 
     # Download larger images, if the user agrees
-    print(f"\nThe archive doesn't contain the original-size images. We can attempt to download them from twimg.com.")
-    print(f'Please be aware that this script may download a lot of data, which will cost you money if you are')
-    print(f'paying for bandwidth. Please be aware that the servers might block these requests if they are too')
-    print(f'frequent. This script may not work if your account is protected. You may want to set it to public')
-    print(f'before starting the download.\n')
-
-    if get_consent('OK to start downloading?'):
-        download_larger_media(media_sources, paths)
-        print('In case you set your account to public before initiating the download, '
-              'do not forget to protect it again.')
+    if len(media_sources) > 0:
+        print(f"\nThe archive doesn't contain the original-size images. We can attempt to download them from twimg.com.")
+        print(f'Please be aware that this script may download a lot of data, which will cost you money if you are')
+        print(f'paying for bandwidth. Please be aware that the servers might block these requests if they are too')
+        print(f'frequent. This script may not work if your account is protected. You may want to set it to public')
+        print(f'before starting the download.\n')
+
+        if get_consent('OK to start downloading {len(media_sources)} media files?'):
+            download_larger_media(media_sources, log_path)
+            print('In case you set your account to public before initiating the download, '
+                  'do not forget to protect it again.')

From fdfe909f48a436fbc093c4c49f9e4ca166be4e93 Mon Sep 17 00:00:00 2001
From: Lena Schimmel
Date: Sun, 27 Nov 2022 21:41:05 +0100
Subject: [PATCH 37/46] Extract format_duration and use it for additional download time estimations for downloading tweets and larger media files. 
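
For illustration, the extracted helper behaves like this (derived from the
implementation below, which maps the seconds onto a UTC timestamp and is
therefore only meaningful for durations under 24 hours):

    format_duration(42)    # -> '42 seconds'
    format_duration(90)    # -> '1 minute 30 seconds'
    format_duration(7215)  # -> '2 hours 0 minutes'
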
--- parser.py | 91 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/parser.py b/parser.py index e1b3693..b98cbaf 100644 --- a/parser.py +++ b/parser.py @@ -18,6 +18,7 @@ """ from collections import defaultdict +import math from typing import Optional from urllib.parse import urlparse import datetime @@ -101,6 +102,22 @@ def create_path_for_file_output_single(self, format: str, kind: str)->str: return os.path.join(self.dir_output, f"{kind}.{format}") +def format_duration(seconds: float) -> str: + duration_datetime: datetime.datetime = \ + datetime.datetime.fromtimestamp( + seconds, + tz=datetime.timezone.utc + ) + if duration_datetime.hour >= 1: + return f"{duration_datetime.hour } hour{ '' if duration_datetime.hour == 1 else 's'} " \ + f"{duration_datetime.minute} minute{'' if duration_datetime.minute == 1 else 's'}" + elif duration_datetime.minute >= 1: + return f"{duration_datetime.minute} minute{'' if duration_datetime.minute == 1 else 's'} " \ + f"{duration_datetime.second} second{'' if duration_datetime.second == 1 else 's'}" + else: + return f"{duration_datetime.second} second{'' if duration_datetime.second == 1 else 's'}" + + def get_consent(prompt: str, default_to_yes: bool = False): """Asks the user for consent, using the given prompt. Accepts various versions of yes/no, or an empty answer to accept the default. The default is 'no' unless default_to_yes is passed as @@ -763,22 +780,8 @@ def download_larger_media(media_sources: dict, paths: PathConfig): # show % done and estimated remaining time: time_elapsed: float = time.time() - start_time estimated_time_per_file: float = time_elapsed / (index + 1) - estimated_time_remaining: datetime.datetime = \ - datetime.datetime.fromtimestamp( - (number_of_files - (index + 1)) * estimated_time_per_file, - tz=datetime.timezone.utc - ) - if estimated_time_remaining.hour >= 1: - time_remaining_string: str = \ - f"{estimated_time_remaining.hour} hour{'' if estimated_time_remaining.hour == 1 else 's'} " \ - f"{estimated_time_remaining.minute} minute{'' if estimated_time_remaining.minute == 1 else 's'}" - elif estimated_time_remaining.minute >= 1: - time_remaining_string: str = \ - f"{estimated_time_remaining.minute} minute{'' if estimated_time_remaining.minute == 1 else 's'} " \ - f"{estimated_time_remaining.second} second{'' if estimated_time_remaining.second == 1 else 's'}" - else: - time_remaining_string: str = \ - f"{estimated_time_remaining.second} second{'' if estimated_time_remaining.second == 1 else 's'}" + + time_remaining_string = format_duration(seconds = (number_of_files - (index + 1)) * estimated_time_per_file) if index + 1 == number_of_files: print(' 100 % done.') @@ -841,36 +844,38 @@ def parse_tweets(username, users, html_template, paths: PathConfig) -> dict: tweet_ids_to_download.update(collect_tweet_references(tweet, known_tweets, counts)) # (Maybe) download referenced tweets - # TODO ask user for consent to download referenced_tweets = [] if (len(tweet_ids_to_download) > 0): print(f"Found references to {len(tweet_ids_to_download)} tweets which should be downloaded. 
        print(f"Found references to {len(tweet_ids_to_download)} tweets which should be downloaded. Breakdown of download reasons:")
         for reason in ['quote', 'reply', 'retweet', 'media']:
             print(f" * {counts[reason]} because of {reason}")
         print(f"There were {counts['known_reply']} references to tweets which are already known so we don't need to download them (not included in the numbers above).")
-        # TODO maybe ask the user if we should start downloading
-        # TODO maybe give an estimate of download size and/or time
-        # TODO maybe let the user choose which of the tweets to download, by selecting a subset of those reasons
-        requests = import_module('requests')
-        try:
-            with requests.Session() as session:
-                bearer_token = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
-                guest_token = get_twitter_api_guest_token(session, bearer_token)
-                # TODO We could download user data together with the tweets, because we will need it anyway. But we might download the data for each user multiple times then.
-                downloaded_tweets, remaining_tweet_ids = get_tweets(session, bearer_token, guest_token, list(tweet_ids_to_download), False)
-                # TODO maybe react if remaining_tweet_ids contains tweets
-                for downloaded_tweet in downloaded_tweets.values():
-                    downloaded_tweet = unwrap_tweet(downloaded_tweet)
-                    downloaded_tweet['from_api'] = True
-                    downloaded_tweet['download_with_user'] = False
-                    downloaded_tweet['download_with_alt_text'] = True
-                    add_known_tweet(known_tweets, downloaded_tweet)
-                with open(tweet_dict_filename, "w") as outfile:
-                    json.dump(known_tweets, outfile, indent=2)
-                print(f"Saved {len(known_tweets)} tweets to '{tweet_dict_filename}'.")
-        except Exception as err:
-            print(f'Failed to download tweets: {err}')
+        estimated_download_time_seconds = math.ceil(len(tweet_ids_to_download) / 100) * 2
+        estimated_download_time_str = format_duration(estimated_download_time_seconds)
You may want to set it to public') print(f'before starting the download.\n') - if get_consent('OK to start downloading {len(media_sources)} media files?'): - download_larger_media(media_sources, log_path) + estimated_download_time_str = format_duration(len(media_sources) * 0.4) + + if get_consent(f'OK to start downloading {len(media_sources)} media files? ' + f'This will take at least {estimated_download_time_str}.'): + + download_larger_media(media_sources, paths) print('In case you set your account to public before initiating the download, ' 'do not forget to protect it again.') From 179d01172cc0ec574a4c98f0d58ce44e3b8ac317 Mon Sep 17 00:00:00 2001 From: Lena Schimmel Date: Sun, 27 Nov 2022 23:02:57 +0100 Subject: [PATCH 38/46] Fix bug which re-downloaded the same tweets over and over, instead offer to retry after failure. Fix path for known_tweets (which seemed to work despite wrong code?!) --- parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/parser.py b/parser.py index b98cbaf..4f08a66 100644 --- a/parser.py +++ b/parser.py @@ -456,7 +456,7 @@ def collect_tweet_references(tweet, known_tweets, counts): # Collect previous tweets in conversation # Only do this for tweets from our original archive if 'from_archive' in tweet and has_path(tweet, ['in_reply_to_status_id_str']): - prev_tweet_id = parse_as_number(tweet['in_reply_to_status_id_str']) + prev_tweet_id = tweet['in_reply_to_status_id_str'] if (prev_tweet_id in known_tweets): counts['known_reply'] += 1 else: @@ -824,7 +824,7 @@ def parse_tweets(username, users, html_template, paths: PathConfig) -> dict: # 3. use the data that is already present in a tweet to distinguish own tweets from others # Load tweets that we saved in an earlier run between pass 2 and 3 - tweet_dict_filename = 'known_tweets.json' + tweet_dict_filename = os.path.join(paths.dir_output_cache, 'known_tweets.json') if os.path.exists(tweet_dict_filename): with open(tweet_dict_filename, 'r', encoding='utf8') as f: known_tweets = json.load(f) @@ -845,7 +845,7 @@ def parse_tweets(username, users, html_template, paths: PathConfig) -> dict: # (Maybe) download referenced tweets referenced_tweets = [] - if (len(tweet_ids_to_download) > 0): + while (len(tweet_ids_to_download) > 0): print(f"Found references to {len(tweet_ids_to_download)} tweets which should be downloaded. Breakdown of download reasons:") for reason in ['quote', 'reply', 'retweet', 'media']: print(f" * {counts[reason]} because of {reason}") @@ -862,8 +862,8 @@ def parse_tweets(username, users, html_template, paths: PathConfig) -> dict: bearer_token = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' guest_token = get_twitter_api_guest_token(session, bearer_token) # TODO We could download user data together with the tweets, because we will need it anyway. But we might download the data for each user multiple times then. 
-                    downloaded_tweets, remaining_tweet_ids = get_tweets(session, bearer_token, guest_token, list(tweet_ids_to_download), False)
-                    # TODO maybe react if remaining_tweet_ids contains tweets
+                    downloaded_tweets, tweet_ids_to_download = get_tweets(session, bearer_token, guest_token, list(tweet_ids_to_download), False)
+
                     for downloaded_tweet in downloaded_tweets.values():
                         downloaded_tweet = unwrap_tweet(downloaded_tweet)
                         downloaded_tweet['from_api'] = True

From 11846a5a78b282389a874111c53da104b9566afb Mon Sep 17 00:00:00 2001
From: Lena Schimmel
Date: Sun, 27 Nov 2022 23:58:04 +0100
Subject: [PATCH 39/46] Remove verbose tweet download logging and instead tell
 the user something about the overall purpose. Also improve error handling.

---
 parser.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/parser.py b/parser.py
index 4f08a66..b709ccb 100644
--- a/parser.py
+++ b/parser.py
@@ -450,7 +450,6 @@ def collect_tweet_references(tweet, known_tweets, counts):
             counts['known_quote'] += 1
         else:
             tweet_ids.add(quoted_id)
-            print(f"Need to download tweet {tweet['id_str']} because of being quoted")
             counts['quote'] += 1

     # Collect previous tweets in conversation
     # Only do this for tweets from our original archive
     if 'from_archive' in tweet and has_path(tweet, ['in_reply_to_status_id_str']):
         prev_tweet_id = tweet['in_reply_to_status_id_str']
         if (prev_tweet_id in known_tweets):
             counts['known_reply'] += 1
         else:
             tweet_ids.add(prev_tweet_id)
-            print(f"Need to download tweet {prev_tweet_id} because of reply to it")
             counts['reply'] += 1

     # Collect retweets
     # Don't do this if we already re-downloaded this tweet
     if not 'from_api' in tweet and 'full_text' in tweet and tweet['full_text'].startswith('RT @'):
         tweet_ids.add(tweet['id_str'])
-        print(f"Need to download tweet {tweet['id_str']} because of retweet")
         counts['retweet'] += 1

     # Collect tweets with media, which might lack alt text
     # Don't do this if we already re-downloaded this tweet with alt texts enabled
     if not 'download_with_alt_text' in tweet and has_path(tweet, ['entities', 'media']):
         tweet_ids.add(tweet['id_str'])
-        print(f"Need to download tweet {tweet['id_str']} because of contained media")
         counts['media'] += 1

     if None in tweet_ids:
@@ -841,12 +841,19 @@ def parse_tweets(username, users, html_template, paths: PathConfig) -> dict:
     # (Maybe) download referenced tweets
     referenced_tweets = []
-    while (len(tweet_ids_to_download) > 0):
+    if (len(tweet_ids_to_download) > 0):
         print(f"Found references to {len(tweet_ids_to_download)} tweets which should be downloaded. Breakdown of download reasons:")
         for reason in ['quote', 'reply', 'retweet', 'media']:
             print(f" * {counts[reason]} because of {reason}")
         print(f"There were {counts['known_reply']} references to tweets which are already known so we don't need to download them (not included in the numbers above).")
+        print()
+        print("Please note that the downloaded tweets will not be included in the generated output yet.")
+        print("Even so, we recommend downloading the tweets now, since Twitter (or the API we use)")
+        print("may not be available forever. A future version of this script will be able to include")
+        print("the downloaded tweets in the output, even if Twitter is no longer available by then.")
+        print()
+        while (len(tweet_ids_to_download) > 0):
         estimated_download_time_seconds = math.ceil(len(tweet_ids_to_download) / 100) * 2
         estimated_download_time_str = format_duration(estimated_download_time_seconds)
        if get_consent(f"OK to download {len(tweet_ids_to_download)} tweets from Twitter? This would take about {estimated_download_time_str}."):
@@ -875,8 +878,15 @@ def parse_tweets(username, users, html_template, paths: PathConfig) -> dict:
                     print(f"Saved {len(known_tweets)} tweets to '{tweet_dict_filename}'.")

             except Exception as err:
+                # this code is rather unlikely to be reached, since get_tweets has internal error handling.
                 print(f'Failed to download tweets: {err}')

+            if len(tweet_ids_to_download) > 0:
+                print("Not all tweets could be downloaded, but you can retry if you want.")
+        else:
+            # Don't ask again and again if the user said 'no'
+            break
+
     # Third pass: convert tweets, using the downloaded references from pass 2
     for tweet in known_tweets.values():
         try:

From 3c59d12bb98841b66dcd0363d7aa056eedaaa05f Mon Sep 17 00:00:00 2001
From: Lena Schimmel
Date: Mon, 28 Nov 2022 00:02:27 +0100
Subject: [PATCH 40/46] Remove listing of moved media files; it's much too
 verbose.

---
 parser.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/parser.py b/parser.py
index ed38811..c0e7ba0 100644
--- a/parser.py
+++ b/parser.py
@@ -1232,7 +1232,6 @@ def migrate_old_output(paths: PathConfig):
     print(f"Moving {len(files_to_move)} files from 'media' to '{paths.dir_output_media}'")
     for file_path_to_move in files_to_move:
         file_name_to_move = os.path.split(file_path_to_move)[1]
-        print(file_name_to_move)
         os.rename(file_path_to_move, os.path.join(paths.dir_output_media, file_name_to_move))
     os.rmdir(os.path.join(paths.dir_archive, "media"))

From d39adb195677a46a7d10012661d435f528ef459a Mon Sep 17 00:00:00 2001
From: Kirstin Rohwer
Date: Mon, 28 Nov 2022 00:02:58 +0100
Subject: [PATCH 41/46] also include users with 0 messages in filename
 generation for group DMs (for even more human-readable filenames)

---
 parser.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/parser.py b/parser.py
index ed38811..d07d69b 100644
--- a/parser.py
+++ b/parser.py
@@ -961,6 +961,9 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths):
         group_conversations_metadata[conversation_id]['participant_names'] = participant_names
         group_conversations_metadata[conversation_id]['conversation_names'] = [(0, conversation_id)]
         group_conversations_metadata[conversation_id]['participant_message_count'] = defaultdict(int)
+        for participant_id in participants:
+            # init every participant's message count with 0, so that users with no activity are not ignored
+            group_conversations_metadata[conversation_id]['participant_message_count'][participant_id] = 0
         messages = []
         if 'messages' in dm_conversation:
             for message in dm_conversation['messages']:
@@ -1156,6 +1159,8 @@ def parse_group_direct_messages(username, users, user_id_url_template, paths):
             participant_handle = users[participant_id].handle
             if participant_handle != username:
                 handles.append((participant_handle, message_count))
+        # sort alphabetically by handle first, for a more deterministic order
+        handles.sort(key=lambda tup: tup[0])
         # sort so that the most active users are at the start of the list
         handles.sort(key=lambda tup: tup[1], reverse=True)
         if len(handles) == 1:

From c584bbd52726876f42b4d02df92a7424870c30dd Mon Sep 17 00:00:00 2001
From: Kirstin Rohwer
Date: Mon, 28 Nov 2022 00:31:36 +0100
Subject: [PATCH 42/46] Bugfix: make sure that a UserData object can't have an
 empty handle

---
 parser.py | 29 ++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/parser.py b/parser.py
index d07d69b..696706d 100644
--- a/parser.py
+++ b/parser.py
@@ -42,8 +42,12 @@ class UserData:
-    def __init__(self, id, handle = None):
-        self.id = id
+    def __init__(self, user_id: str, handle: str):
+        if user_id is None:
+            raise ValueError('ID "None" is not allowed in UserData.')
+        self.user_id = user_id
+        if handle is None:
+            raise ValueError('handle "None" is not allowed in UserData.')
         self.handle = handle
@@ -207,7 +211,8 @@ def lookup_users(user_ids, users):
             guest_token = get_twitter_api_guest_token(session, bearer_token)
             retrieved_users = get_twitter_users(session, bearer_token, guest_token, filtered_user_ids)
             for user_id, user in retrieved_users.items():
-                users[user_id] = UserData(user_id, user["screen_name"])
+                if user["screen_name"] is not None:
+                    users[user_id] = UserData(user_id=user_id, handle=user["screen_name"])
             print()  # empty line for better readability of output
     except Exception as err:
         print(f'Failed to download user data: {err}')
@@ -256,7 +261,7 @@ def escape_markdown(input_text: str) -> str:
     return output_text


-def convert_tweet(tweet, username, media_sources, users, paths: PathConfig):
+def convert_tweet(tweet, username, media_sources, users: dict, paths: PathConfig):
     """Converts a JSON-format tweet. Returns tuple of timestamp, markdown and HTML."""
     if 'tweet' in tweet.keys():
         tweet = tweet['tweet']
@@ -397,17 +402,19 @@ def convert_tweet(tweet, username, media_sources, users: dict, paths: PathConfig):
     body_html = header_html + body_html + f' {timestamp_str}'

     # extract user_id:handle connections
-    if 'in_reply_to_user_id' in tweet and 'in_reply_to_screen_name' in tweet:
-        id = tweet['in_reply_to_user_id']
-        if int(id) >= 0: # some ids are -1, not sure why
+    if 'in_reply_to_user_id' in tweet and 'in_reply_to_screen_name' in tweet and \
+            tweet['in_reply_to_screen_name'] is not None:
+        reply_to_id = tweet['in_reply_to_user_id']
+        if int(reply_to_id) >= 0: # some ids are -1, not sure why
             handle = tweet['in_reply_to_screen_name']
-            users[id] = UserData(id=id, handle=handle)
+            users[reply_to_id] = UserData(user_id=reply_to_id, handle=handle)
     if 'entities' in tweet and 'user_mentions' in tweet['entities']:
         for mention in tweet['entities']['user_mentions']:
-            id = mention['id']
-            if int(id) >= 0: # some ids are -1, not sure why
-                handle = mention['screen_name']
-                users[id] = UserData(id=id, handle=handle)
+            mentioned_id = mention['id']
+            if int(mentioned_id) >= 0: # some ids are -1, not sure why
+                handle = mention['screen_name']
+                if handle is not None:
+                    users[mentioned_id] = UserData(user_id=mentioned_id, handle=handle)

     return timestamp, body_markdown, body_html

From cb5897da6f71a98314194e8bf124da87add889a0 Mon Sep 17 00:00:00 2001
From: Kirstin Rohwer
Date: Mon, 28 Nov 2022 00:43:49 +0100
Subject: [PATCH 43/46] added a few more 'not None' checks

---
 parser.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/parser.py b/parser.py
index 696706d..b407b09 100644
--- a/parser.py
+++ b/parser.py
@@ -408,13 +408,14 @@ def convert_tweet(tweet, username, media_sources, users: dict, paths: PathConfig):
         if int(reply_to_id) >= 0: # some ids are -1, not sure why
             handle = tweet['in_reply_to_screen_name']
             users[reply_to_id] = UserData(user_id=reply_to_id, handle=handle)
-    if 'entities' in tweet and 'user_mentions' in tweet['entities']:
+    if 'entities' in tweet and 'user_mentions' in tweet['entities'] and tweet['entities']['user_mentions'] is not None:
         for mention in tweet['entities']['user_mentions']:
-            mentioned_id = mention['id']
-            if int(mentioned_id) >= 0: # some ids are -1, not sure why
-                handle = mention['screen_name']
-                if handle is not None:
-                    users[mentioned_id] = UserData(user_id=mentioned_id, handle=handle)
+            if mention is not None and 'id' in mention and 'screen_name' in mention:
+                mentioned_id = mention['id']
+                if int(mentioned_id) >= 0: # some ids are -1, not sure why
+                    handle = mention['screen_name']
+                    if handle is not None:
+                        users[mentioned_id] = UserData(user_id=mentioned_id, handle=handle)

     return timestamp, body_markdown, body_html

From 5f499d7643849991542371b481ad7db037405ca1 Mon Sep 17 00:00:00 2001
From: Tim Hutton
Date: Mon, 28 Nov 2022 01:44:02 +0000
Subject: [PATCH 44/46] Added TechCrunch article

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 4a781bb..eb7fcad 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,7 @@ Our script does the following:

 Some of the functionality requires the `requests` and `imagesize` modules. `parser.py` will offer to install these for you using pip. To avoid that you can install them before running the script.
 ## Articles about handling your Twitter archive:
+- https://techcrunch.com/2022/11/21/quit-twitter-better-with-these-free-tools-that-make-archiving-a-breeze/
 - https://www.bitsgalore.org/2022/11/20/how-to-preserve-your-personal-twitter-archive
 - https://matthiasott.com/notes/converting-your-twitter-archive-to-markdown

From 2e0a12539cc9ae85d59515a9fd452d2462d0fb66 Mon Sep 17 00:00:00 2001
From: Lena Schimmel
Date: Mon, 28 Nov 2022 23:18:19 +0100
Subject: [PATCH 45/46] Reduce redundant tweet downloads.

---
 parser.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/parser.py b/parser.py
index dfb9eed..776dd9b 100644
--- a/parser.py
+++ b/parser.py
@@ -442,6 +442,10 @@ def collect_tweet_references(tweet, known_tweets, counts):
     tweet = unwrap_tweet(tweet)
     tweet_ids = set()

+    # Don't search for tweet references if this tweet was not part of the original archive
+    if 'from_archive' not in tweet:
+        return tweet_ids
+
     # Collect quoted tweets
     if has_path(tweet, ['entities', 'urls']):
         for url in tweet['entities']['urls']:

From b68eefdb31775069d6c800a7a3badfa2b91e0808 Mon Sep 17 00:00:00 2001
From: Kirstin Rohwer
Date: Tue, 29 Nov 2022 00:22:10 +0100
Subject: [PATCH 46/46] fix bug in download_larger_media

---
 parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/parser.py b/parser.py
index 776dd9b..c0ce179 100644
--- a/parser.py
+++ b/parser.py
@@ -776,13 +776,13 @@ def download_larger_media(media_sources: dict, paths: PathConfig):
     while remaining_tries > 0:
         number_of_files = len(media_sources)
         success_count = 0
-        retries = []
+        retries = {}
         for index, (local_media_path, media_url) in enumerate(media_sources.items()):
             success, bytes_downloaded = download_file_if_larger(media_url, local_media_path, index + 1, number_of_files, sleep_time)
             if success:
                 success_count += 1
             else:
-                retries.append((local_media_path, media_url))
+                retries[local_media_path] = media_url
             total_bytes_downloaded += bytes_downloaded

         # show % done and estimated remaining time: