import re
# Regular expressions for matching emails and websites.
# The TLD class is [A-Za-z]: inside a character class '|' is a literal pipe,
# not alternation, so [A-Z|a-z] would let an address swallow "|text".
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
website_regex = re.compile(r'\b(?:http://|https://|www\.)\S+\b')
# DOTALL makes '.' match newlines so a comment spanning several lines is found.
comment_regex = re.compile(r'<Comment>(.*?)</Comment>', re.DOTALL)

# XML file the comments are read from
xml_file = 'sdn_advanced.xml'
# Files for saving the extracted data
email_file = "comment_emails.txt"
website_file = "comment_websites.txt"


def extract_comments(data):
    """Return the text of every <Comment>...</Comment> element in *data*.

    The match is non-greedy, so adjacent comments are returned separately,
    and it may span line breaks.
    """
    return comment_regex.findall(data)


def extract_emails(comment):
    """Return all email addresses found in *comment*, in order of appearance."""
    return email_regex.findall(comment)


def extract_websites(comment):
    """Return website URLs found in *comment*, skipping treasury.gov links.

    A URL is anything starting with http://, https:// or www. up to the last
    word character before whitespace; URLs containing "treasury.gov" are
    dropped.
    """
    return [
        website
        for website in website_regex.findall(comment)
        if "treasury.gov" not in website
    ]


def main():
    """Read the XML file, write extracted emails/websites, print a summary."""
    # Read the XML file
    with open(xml_file, 'r', encoding='utf-8') as file:
        data = file.read()

    emails_count = 0
    websites_count = 0

    # Open the output files
    with open(email_file, 'w', encoding='utf-8') as emails_out, \
            open(website_file, 'w', encoding='utf-8') as websites_out:
        for comment in extract_comments(data):
            # Write the emails to the email file, one per line
            emails = extract_emails(comment)
            emails_out.writelines(email + '\n' for email in emails)
            emails_count += len(emails)

            # Write the (non-treasury.gov) websites to the website file
            websites = extract_websites(comment)
            websites_out.writelines(website + '\n' for website in websites)
            websites_count += len(websites)

    print(f"Extraction complete. {emails_count} Emails saved in {email_file}, {websites_count} websites saved in {website_file}.")


if __name__ == "__main__":
    main()
@n-patel OFAC SDN XML has some entity recording errors.
Here's a quick extractor to grab missing entries: