From 06f87f2346c75a709feab21b5d7ba309e6dac756 Mon Sep 17 00:00:00 2001
From: Shaw-Sean Yang
Date: Tue, 9 Jul 2024 17:19:17 +0200
Subject: [PATCH] truncate emails

---
Review notes (text here is ignored by `git am`):
- truncate_long_emails now declares `global TRUNCATED_EMAILS_COUNTER`;
  incrementing the module-level counter without it raises
  UnboundLocalError on the first over-long email.
- This patch was recovered from a copy whose line breaks and indentation
  had been collapsed. Context-line whitespace below is a reconstruction:
  check it against the target file, or apply with
  `git apply --ignore-whitespace`. The post-image blob id on the `index`
  line predates the changes described above.

 src/panza/data_preparation/extract_emails.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/panza/data_preparation/extract_emails.py b/src/panza/data_preparation/extract_emails.py
index 613c297..d79945e 100644
--- a/src/panza/data_preparation/extract_emails.py
+++ b/src/panza/data_preparation/extract_emails.py
@@ -8,6 +8,7 @@ import langdetect
 
 
 CLEAN_EMAILS = []
+TRUNCATED_EMAILS_COUNTER = 0
 DISCARDED_EMAILS = {
     "non_english": [],
     "forwarded": [],
@@ -17,7 +18,7 @@
 }
 
 SHORT_EMAIL_THRESHOLD = 10 # words
-
+LONG_EMAIL_THRESHOLD = 500 # words
 
 def extract_only_plain_text(msg_part):
     if msg_part.get_content_type() == "text/plain":
@@ -47,6 +48,20 @@ def remove_quoted_content(email_body):
     else:
         return email_body
 
+def truncate_long_emails(email_body):
+    """Cap an email body at LONG_EMAIL_THRESHOLD words.
+
+    When count_words reports more than LONG_EMAIL_THRESHOLD words, return the
+    first LONG_EMAIL_THRESHOLD whitespace-separated words joined by single
+    spaces and bump TRUNCATED_EMAILS_COUNTER; otherwise return the body as is.
+    """
+    # Rebinding a module-level name needs `global`; without it the `+=`
+    # below raises UnboundLocalError on the first over-long email.
+    global TRUNCATED_EMAILS_COUNTER
+    if count_words(email_body) > LONG_EMAIL_THRESHOLD:
+        TRUNCATED_EMAILS_COUNTER += 1
+        return " ".join(email_body.split()[:LONG_EMAIL_THRESHOLD])
+    return email_body
 
 def remove_lines_starting_with_gt(text):
     lines = text.split("\n")
@@ -74,6 +89,7 @@ def filter_message(msg):
     plain_text = remove_quoted_content(plain_text)
     # sometimes remove_quoted_content misses, so making sure we remove lines with ">" at the start
     plain_text = remove_lines_starting_with_gt(plain_text)
+    plain_text = truncate_long_emails(plain_text)
 
     # check length before detecting language
     if count_words(plain_text) < SHORT_EMAIL_THRESHOLD:
@@ -138,6 +154,7 @@ def main():
         f"\n\t forwarded = {len(DISCARDED_EMAILS['forwarded'])}"
         f"\n\t cant_decode_utf8 = {len(DISCARDED_EMAILS['cant_decode_utf8'])}"
     )
+    print(f"# truncated emails = {TRUNCATED_EMAILS_COUNTER}")
 
     first_email = EMAIL[0]
     username = first_email[: first_email.find("@")]