diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100644 index 0000000..2c51912 --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,15 @@ +{ + "default": true, + "MD013": { + "line_length": 120, + "code_blocks": false, + "tables": false + }, + "MD024": { + "siblings_only": true + }, + "MD033": { + "allowed_elements": ["br", "img", "a", "details", "summary"] + }, + "MD041": false +} diff --git a/.markdownlint.yaml b/.markdownlint.yaml deleted file mode 100644 index 89e1ea2..0000000 --- a/.markdownlint.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# Markdownlint configuration -# See https://github.com/DavidAnson/markdownlint/blob/main/doc/Rules.md - -# MD013/line-length - Line length -MD013: - # Disable line length check for code blocks and tables - line_length: 120 - code_blocks: false - tables: false - -# MD033/no-inline-html - Inline HTML -MD033: - # Allow specific HTML elements commonly used in GitHub markdown - allowed_elements: - - a - - img - - br - -# MD041/first-line-heading - First line in file should be a top level heading -MD041: false # Allow files to start with badges or other content diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 560fc6d..ef82777 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,9 +28,29 @@ repos: # Run the formatter. - id: ruff-format -- repo: https://github.com/igorshubovych/markdownlint-cli - rev: v0.45.0 +- repo: https://github.com/executablebooks/mdformat + rev: 0.7.17 hooks: - - id: markdownlint - args: ['--fix'] - exclude: '^\.github/instructions/' + - id: mdformat + exclude: '^\.github/' + additional_dependencies: + - mdformat-frontmatter + - mdformat-gfm + - mdformat-tables + +- repo: https://github.com/DavidAnson/markdownlint-cli2 + rev: v0.18.1 + hooks: + - id: markdownlint-cli2 + args: ['--config', '.markdownlint.json'] + exclude: '^\.github/' + +- repo: local + hooks: + - id: pyright + name: pyright + entry: uv run npx pyright + args: [src/, tests/] + language: system + pass_filenames: false + types: [python] diff --git a/src/mailparser/const.py b/src/mailparser/const.py index 8d17224..b5cfca5 100644 --- a/src/mailparser/const.py +++ b/src/mailparser/const.py @@ -18,66 +18,66 @@ import re -REGXIP = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}") - -JUNK_PATTERN = r"[ \(\)\[\]\t\n]+" - -# Patterns for receiveds -RECEIVED_PATTERNS = [ - # FIXED: More restrictive 'from' clause - # Only matches 'from' at the beginning of the header (^) or after - # newline/whitespace to avoid matching within "for from " - # constructs which caused duplicate matches in IBM gateway headers - ( - r"(?:(?:^|\n\s*)from\s+(?P.+?)(?:\s*[(]?" - r"envelope-from|\s*[(]?envelope-sender|\s+" - r"by|\s+with(?! cipher)|\s+id|\s+via|;))" - ), - # IMPROVED: More precise 'by' clause - # Modified to not consume 'with' clause, allowing proper separation - # of 'by' (server name) and 'with' (protocol) fields - ( - r"(?:(?:^|\s)by\s+(?P[^\s]+(?:\s+[^\s]+)*?)" - r"(?:\s+with(?! cipher)|\s*[(]?envelope-from|\s*" - r"[(]?envelope-sender|\s+id|\s+for|\s+via|;))" - ), - # IMPROVED: 'with' clause with better boundary detection - ( - r"(?:(?:^|\s)with(?! cipher)\s+(?P.+?)" - r"(?:\s*[(]?envelope-from|\s*[(]?" - r"envelope-sender|\s+id|\s+for|\s+via|;))" - ), - # IMPROVED: 'id' clause with cleaner boundaries - ( - r"(?:(?:^|\s)id\s+(?P.+?)(?:\s*[(]?envelope-from|\s*" - r"[(]?envelope-sender|\s+for|\s+via|;))" - ), - # IMPROVED: 'for' clause - handles "for from " pattern - # Stops before 'from' keyword to prevent the 'from' pattern from - # matching the sender email in this construct - ( - r"(?:(?:^|\s)for\s+(?P<[^>]+>|[^\s]+)" - r"(?:\s+from|\s*[(]?envelope-from|\s*[(]?" - r"envelope-sender|\s+via|;))" - ), - # IMPROVED: 'via' clause with better termination - ( - r"(?:(?:^|\s)via\s+(?P.+?)(?:\s*[(]?" - r"envelope-from|\s*[(]?envelope-sender|;))" - ), - # assumes emails are always inside <> - r"(?:envelope-from\s+<(?P.+?)>)", - r"(?:envelope-sender\s+<(?P.+?)>)", - # datetime comes after ; at the end - r";\s*(?P.*)", - # sendgrid datetime - ( - r"(?P\d{4}-\d{2}-\d{2} \d{2}:\d{2}:" - r"\d{2}\.\d{9} \+0000 UTC) m=\+\d+\.\d+" - ), -] - -RECEIVED_COMPILED_LIST = [re.compile(i, re.I | re.DOTALL) for i in RECEIVED_PATTERNS] +# IPv4 pattern - validates octet range (0-255) per RFC 791 +REGXIP = re.compile( + r"(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}" + r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)" +) + +# IPv6 pattern - matches standard and common compressed forms per RFC 5952 +REGXIP6 = re.compile( + r"(?:(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}" # full form + r"|(?:[0-9a-fA-F]{1,4}:){1,7}:" # trailing :: + r"|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}" # :: with 1 group after + r"|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}" + r"|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}" + r"|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}" + r"|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}" + r"|[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}" + r"|:(?::[0-9a-fA-F]{1,4}){1,7}" # ::x:x... + r"|::)" # just :: +) + +# Normalize whitespace: collapse tabs and newlines to single space. +# Parenthesized comments and bracketed IPs are preserved. +JUNK_PATTERN = r"[\t\n]+" + +# ------------------------------------------------------------------ # +# Received header parsing — RFC 5321 §4.4 grammar: +# +# Received = "Received:" *( received-token / comment ) ";" date-time +# received-token = "from" domain / "by" domain / "via" atom +# / "with" atom / "id" atom / "for" addr-spec +# +# Strategy: tokenize on clause keywords, then extract values per clause. +# This eliminates the duplicated boundary lookaheads of the old +# per-clause pattern list and matches the RFC grammar directly. +# ------------------------------------------------------------------ # + +# Pattern that splits a received header into clause tokens. +# Matches each RFC 5321 keyword at a word boundary followed by its value, +# which extends up to the next keyword or semicolon. +# The keywords are: from, by, via, with (not "with cipher"), id, for, +# plus the non-standard envelope-from and envelope-sender. +_CLAUSE_SPLITTER = re.compile( + r"(?:^|\s+)" + r"(from|by|via|with(?!\s+cipher)|id|for|envelope-from|envelope-sender)" + r"\s+", + re.I, +) + +# Extracts envelope-from email: envelope-from +_ENVELOPE_FROM_RE = re.compile(r"<([^>]+)>") + +# Date after semicolon (standard RFC 5321) +_DATE_RE = re.compile(r";\s*(.*)", re.DOTALL) + +# SendGrid non-standard date format (no semicolon) +_SENDGRID_DATE_RE = re.compile( + r"(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d{9}\s+\+0000\s+UTC)" + r"\s+m=\+\d+\.\d+", + re.I, +) EPILOGUE_DEFECTS = {"StartBoundaryNotFoundDefect"} diff --git a/src/mailparser/core.py b/src/mailparser/core.py index 9f07507..10901af 100644 --- a/src/mailparser/core.py +++ b/src/mailparser/core.py @@ -18,12 +18,13 @@ import base64 import email +import email.utils import ipaddress import json import logging import os -from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP +from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP, REGXIP6 from mailparser.utils import ( convert_mail_date, decode_header_part, @@ -122,12 +123,13 @@ def __init__(self, message=None): Init a new object from a message object structure. """ self._message = message - log.debug("All headers of emails: {}".format(", ".join(message.keys()))) + if message is not None: + log.debug("All headers of emails: {}".format(", ".join(message.keys()))) self.parse() - def __str__(self): + def __str__(self) -> str: if self.message: - return self.subject + return str(self.subject) else: return str() @@ -326,13 +328,12 @@ def parse(self): "{}".format("--" + self.message.get_boundary() + "--"), ) - try: - p = email.message_from_string(epilogue) - parts.append(p) - except TypeError: - log.debug("Failed to get epilogue part for TypeError") - except Exception: - log.error("Failed to get epilogue part. Check raw mail.") + if epilogue is not None: + try: + p = email.message_from_string(epilogue) + parts.append(p) + except Exception: + log.error("Failed to get epilogue part. Check raw mail.") # walk all mail parts for i, p in enumerate(parts): @@ -497,6 +498,9 @@ def get_server_ipaddress(self, trust): if not trust.strip(): return + if not self.message: + return + received = self.message.get_all("received", []) for i in received: @@ -510,6 +514,7 @@ def get_server_ipaddress(self, trust): def _extract_ip(self, received_header): """ Extract the IP address from the received header if it is not private. + Supports both IPv4 (RFC 791) and IPv6 (RFC 5952) addresses. Args: received_header (string): The received header string @@ -517,7 +522,14 @@ def _extract_ip(self, received_header): Returns: string with the ip address or None """ - check = REGXIP.findall(received_header[0 : received_header.find("by")]) + by_idx = received_header.find("by") + from_part = received_header[:by_idx] if by_idx != -1 else received_header + + # Try IPv4 first, then IPv6 + check = REGXIP.findall(from_part) + if not check: + check = REGXIP6.findall(from_part) + if check: try: ip_str = str(check[-1]) @@ -551,12 +563,12 @@ def __getattr__(self, name): # raw headers elif name.endswith("_raw"): name = name[:-4] - raw = self.message.get_all(name) + raw = self.message.get_all(name) if self.message else None return json.dumps(raw, ensure_ascii=False) # object headers elif name_header in ADDRESSES_HEADERS: - raw_header = self.message.get(name_header, "") + raw_header = self.message.get(name_header, "") if self.message else "" # parse before decoding parsed_addresses = email.utils.getaddresses([raw_header], strict=True) @@ -605,7 +617,7 @@ def received_raw(self): Return a list of all received headers in raw format """ output = [] - for i in self.message.get_all("received", []): + for i in self.message.get_all("received", []) if self.message else []: output.append(decode_header_part(i)) return output @@ -624,7 +636,7 @@ def headers(self) -> dict: """ Return only the headers as Python object """ - all_headers = set(self.message.keys()) - set(["headers"]) + all_headers = set(self.message.keys() if self.message else []) - {"headers"} return {i: getattr(self, i) for i in all_headers} @property @@ -660,7 +672,7 @@ def date(self): """ Return the mail date in datetime.datetime format and UTC. """ - date = self.message.get("date") + date = self.message.get("date") if self.message else None conv = None try: @@ -674,7 +686,7 @@ def timezone(self): """ Return timezone. Offset from UTC. """ - date = self.message.get("date") + date = self.message.get("date") if self.message else None timezone = 0 try: @@ -703,7 +715,7 @@ def mail_json(self): """ Return the JSON of mail parsed """ - if self.mail.get("date"): + if self.mail.get("date") and self.date: self._mail["date"] = self.date.isoformat() return json.dumps(self.mail, ensure_ascii=False, indent=2) @@ -720,7 +732,7 @@ def mail_partial_json(self): """ Return the JSON of mail parsed partial """ - if self.mail_partial.get("date"): + if self.mail_partial.get("date") and self.date: self._mail_partial["date"] = self.date.isoformat() return json.dumps(self.mail_partial, ensure_ascii=False, indent=2) @@ -758,7 +770,7 @@ def message_as_string(self): """ Return the entire message flattened as a string. """ - return self.message.as_string() + return self.message.as_string() if self.message else "" @property def to_domains(self): diff --git a/src/mailparser/utils.py b/src/mailparser/utils.py index c73cd2c..7e4f123 100644 --- a/src/mailparser/utils.py +++ b/src/mailparser/utils.py @@ -19,6 +19,7 @@ import base64 import datetime import email +import email.utils import functools import hashlib import json @@ -36,10 +37,13 @@ from unicodedata import normalize from mailparser.const import ( + _CLAUSE_SPLITTER, + _DATE_RE, + _ENVELOPE_FROM_RE, + _SENDGRID_DATE_RE, ADDRESSES_HEADERS, JUNK_PATTERN, OTHERS_PARTS, - RECEIVED_COMPILED_LIST, ) from mailparser.exceptions import MailParserOSError, MailParserReceivedParsingError @@ -240,8 +244,11 @@ def msgconvert(email): def parse_received(received): """ - Parse a single received header. - Return a dictionary of values by clause. + Parse a single received header by tokenizing on RFC 5321 §4.4 keywords. + + Uses a keyword-based splitter to divide the header into clauses + (from, by, via, with, id, for, envelope-from, envelope-sender), + then extracts the date from after the semicolon. Arguments: received {str} -- single received header @@ -255,47 +262,71 @@ def parse_received(received): """ values_by_clause = {} - for pattern in RECEIVED_COMPILED_LIST: - matches = [match for match in pattern.finditer(received)] - - if len(matches) == 0: - # no matches for this clause, but it's ok! keep going! - log.debug("No matches found for %s in %s" % (pattern.pattern, received)) - elif len(matches) > 1: - # uh, can't have more than one of each clause in a received. - # so either there's more than one or the current regex is wrong - msg = "More than one match found for %s in %s" % (pattern.pattern, received) - log.error(msg) - raise MailParserReceivedParsingError(msg) + + # --- Step 1: Extract date (after semicolon, or SendGrid format) --- + date_match = _DATE_RE.search(received) + if date_match: + values_by_clause["date"] = date_match.group(1) + # Work only on the part before the semicolon for clause parsing + header_body = received[: date_match.start()] + else: + # Try SendGrid non-standard date + sg_match = _SENDGRID_DATE_RE.search(received) + if sg_match: + values_by_clause["date"] = sg_match.group(1) + header_body = received[: sg_match.start()] + else: + header_body = received + + # --- Step 2: Tokenize on clause keywords --- + # _CLAUSE_SPLITTER.split gives: [preamble, kw1, val1, kw2, val2, ...] + parts = _CLAUSE_SPLITTER.split(header_body) + + # parts[0] is preamble (before first keyword), then alternating kw/value + i = 1 # skip preamble + while i + 1 < len(parts): + keyword = parts[i].lower() + value = parts[i + 1].strip() + i += 2 + + if keyword in ("envelope-from", "envelope-sender"): + # Extract email from angle brackets + m = _ENVELOPE_FROM_RE.search(value) + if m: + values_by_clause[keyword.replace("-", "_")] = m.group(1) + elif keyword == "for": + values_by_clause[keyword] = value + elif keyword == "from": + # RFC 5321: only one 'from' clause per received header. + # Only accept the first occurrence; subsequent ones come from + # IBM-style "for from " constructs. + if "from" not in values_by_clause: + values_by_clause[keyword] = value else: - # otherwise we have one matching clause! - log.debug("Found one match for %s in %s" % (pattern.pattern, received)) - match = matches[0].groupdict() - key = list(match.keys())[0] - value = list(match.values())[0] - values_by_clause[key] = value - - if len(values_by_clause) == 0: - # we weren't able to match anything... + values_by_clause[keyword] = value + + # --- Step 3: Extract envelope-from/sender from within clause values --- + # Some MTAs embed envelope-from inside parenthesized comments in the + # 'by' clause, e.g.: "by host.com (envelope-from )" + for clause_key in ("by", "from", "with"): + clause_val = values_by_clause.get(clause_key, "") + for env_key, env_name in ( + ("envelope_from", "envelope-from"), + ("envelope_sender", "envelope-sender"), + ): + if env_key not in values_by_clause and env_name in clause_val.lower(): + m = re.search( + r"(?i)" + re.escape(env_name) + r"\s+<([^>]+)>", + clause_val, + ) + if m: + values_by_clause[env_key] = m.group(1) + + if not values_by_clause: msg = "Unable to match any clauses in %s" % (received) - - # Modification #1: Commenting the following log as - # this raised exception is caught above and then - # raw header is updated in response - # We dont want to get so many errors in our error - # logger as we are not even trying to parse the - # received headers - # Wanted to make it configurable via settiings, - # but this package does not depend on django and - # making configurable setting - # will make it django dependent, - # so better to keep it working with only python - # dependent and on any framework of python - # commenting it just for our use - - # log.error(msg) - raise MailParserReceivedParsingError(msg) + + log.debug("Parsed clauses: %s", list(values_by_clause.keys())) return values_by_clause @@ -351,6 +382,8 @@ def convert_mail_date(date): """ log.debug(f"Date to parse: {date!r}") d = email.utils.parsedate_tz(date) + if d is None: + raise ValueError(f"Cannot parse date: {date!r}") log.debug(f"Date parsed: {d!r}") t = email.utils.mktime_tz(d) log.debug(f"Date parsed in timestamp: {t!r}") @@ -415,9 +448,12 @@ def receiveds_format(receiveds): # Modify date to manage strange header like: # "for ; Tue, 7 Mar 2017 14:29:24 -0800", i["date"] = i["date"].split(";")[-1] + # Strip leading RFC 2822 comments like: + # "(version=TLSv1/SSLv3 cipher=AES128-GCM-SHA256 bits=128/128) Wed, ..." + i["date"] = re.sub(r"^\s*(?:\([^)]*\)\s*)+", "", i["date"]) try: j["date_utc"], _ = convert_mail_date(i["date"]) - except TypeError: + except (TypeError, ValueError): j["date_utc"] = None # Add delay diff --git a/tests/test_mail_parser.py b/tests/test_mail_parser.py index 8c38672..10c35a9 100644 --- a/tests/test_mail_parser.py +++ b/tests/test_mail_parser.py @@ -302,6 +302,7 @@ def test_parsing_know_values(self): self.assertIsInstance(mail.date_raw, str) self.assertIsInstance(mail.date_json, str) raw_utc = "2015-11-29T08:45:18+00:00" + assert mail.date is not None result = mail.date.isoformat() self.assertEqual(raw_utc, result) @@ -670,6 +671,7 @@ def test_parse_from_bytes(self): self.assertIsInstance(mail.date_raw, str) self.assertIsInstance(mail.date_json, str) raw_utc = "2015-11-29T08:45:18+00:00" + assert mail.date is not None result = mail.date.isoformat() self.assertEqual(raw_utc, result) @@ -953,3 +955,132 @@ def test_comma_in_name(self): ("", "simple@example.net"), ('John "Johnny" Doe', "john.doe@example.com"), ] + + def test_init_with_message_object_logs_headers(self): + """Test core.py:126->128 — MailParser.__init__ with message is not None""" + import email as email_module + + from mailparser.core import MailParser + + raw = "From: test@example.com\nSubject: LogTest\n\nBody" + msg = email_module.message_from_string(raw) + + with self.assertLogs("mailparser", level="DEBUG") as cm: + parser = MailParser(message=msg) + + # The debug log about headers must have been emitted + self.assertTrue(any("All headers of emails" in line for line in cm.output)) + self.assertEqual(parser.subject, "LogTest") + + def test_init_with_none_message_skips_log(self): + """Test core.py:126->128 — MailParser.__init__ message=None skips debug log""" + from mailparser.core import MailParser + + # message=None: the if-branch is False, no log.debug call + parser = MailParser(message=None) + self.assertFalse(parser.message) + + def test_date_json_returns_none_when_no_date(self): + """Test core.py:703->exit — date_json returns None when self.date is falsy""" + # A mail with no Date header will have self.date == None + raw = "From: test@example.com\nSubject: NoDat\n\nBody" + mail = mailparser.parse_from_string(raw) + # date should be None/falsy + self.assertIsNone(mail.date) + # date_json should return None (the if branch is not taken) + self.assertIsNone(mail.date_json) + + def test_mail_partial_json_date_branch(self): + """Test core.py:735->737 — mail_partial_json sets isoformat date""" + raw = ( + "From: test@example.com\n" + "Subject: PartialDate\n" + "Date: Mon, 01 Jan 2024 12:00:00 +0000\n" + "\nBody" + ) + mail = mailparser.parse_from_string(raw) + self.assertIsNotNone(mail.date) + # mail_partial_json should include the isoformat date string + result = mail.mail_partial_json + self.assertIsInstance(result, str) + self.assertIn("2024-01-01", result) + + def test_mail_partial_json_no_date(self): + """Test core.py:735->737 False branch — mail_partial_json without date""" + # Mail with no Date header: condition is False, skip line 736 + raw = "From: test@example.com\nSubject: NoDate\n\nBody" + mail = mailparser.parse_from_string(raw) + self.assertIsNone(mail.date) + result = mail.mail_partial_json + self.assertIsInstance(result, str) + + def test_sender_ip_no_message(self): + """Test core.py:502 — get_server_ipaddress returns None with no message""" + mail = mailparser.parse_from_string("fake mail") + self.assertFalse(mail.message) + result = mail.get_server_ipaddress("anything") + self.assertIsNone(result) + + def test_extract_ip_ipv6_fallback(self): + """Test core.py:531 — _extract_ip uses IPv6 when IPv4 not found""" + raw_mail = ( + "Received: from sender.example.com (IPv6:2001:db8::1)\n" + " by mail.trusted.net; Mon, 01 Jan 2024 12:00:00 +0000\n" + "From: test@example.com\n" + "Subject: IPv6 test\n\nBody" + ) + mail = mailparser.parse_from_string(raw_mail) + # 2001:db8:: is documentation range — it is not private + result = mail.get_server_ipaddress("trusted.net") + # Should find the IPv6 address (it is globally routable) + self.assertIsNotNone(result) + + def test_extract_ip_invalid_ip_returns_none(self): + """Test core.py:538-539 — _extract_ip returns None for unparsable IP string""" + parser = mailparser.parse_from_string("From: t@example.com\nSubject: x\n\nBody") + # Patch REGXIP to return a value that ipaddress.ip_address() cannot parse + with patch("mailparser.core.REGXIP") as mock_regxip: + with patch("mailparser.core.REGXIP6") as mock_regxip6: + mock_regxip.findall.return_value = [] + mock_regxip6.findall.return_value = ["not_a_valid_ip"] + result = parser._extract_ip("from invalid by host") + self.assertIsNone(result) + + def test_extract_ip_private_ip_returns_none(self): + """Test core.py:544 — _extract_ip returns None when IP is private""" + raw_mail = ( + "Received: from internal.corp (10.0.0.1)\n" + " by mail.trusted.org; Mon, 01 Jan 2024 12:00:00 +0000\n" + "From: test@example.com\n" + "Subject: Private IP\n\nBody" + ) + mail = mailparser.parse_from_string(raw_mail) + result = mail.get_server_ipaddress("trusted.org") + self.assertIsNone(result) + + def test_extract_ip_no_ip_found_returns_none(self): + """Test core.py:533->544 — _extract_ip returns None when no IP found at all""" + mail = mailparser.parse_from_string("From: t@example.com\nSubject: x\n\nBody") + # A received header with no IP addresses at all + result = mail._extract_ip("from hostname by other-hostname") + self.assertIsNone(result) + + def test_unicode_decode_error_in_payload(self): + """Test core.py:447-448 — UnicodeDecodeError fallback when decoding payload""" + # A body containing a backslash-u followed by non-hex characters + # causes raw-unicode-escape to raise UnicodeDecodeError (line 447), + # which is caught and falls back to ported_string (line 448). + # The part has no CTE so the try/except branch is entered. + backslash_u_invalid = chr(92) + "uggg" + raw_mail = ( + "Content-Type: multipart/mixed; boundary=TEST_BOUND\n" + "\n" + "--TEST_BOUND\n" + "Content-Type: text/plain; charset=utf-8\n" + "\n" + "hello " + backslash_u_invalid + " world\n" + "--TEST_BOUND--\n" + ) + mail = mailparser.parse_from_string(raw_mail) + # Should have parsed successfully and body contains the text + self.assertIn("hello", mail.body) diff --git a/tests/test_main.py b/tests/test_main.py index fc89162..01f32e1 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -344,3 +344,17 @@ def test_print_attachments_details(self, parser): with patch("mailparser.__main__.print_attachments") as mock_print: print_attachments_details(mock_parser, args) mock_print.assert_called_once_with(mock_parser.attachments, False) + + def test_get_parser_with_stdin(self, parser): + """Test get_parser returns parse_stdin result when args.stdin is True""" + import io + + from mailparser.__main__ import get_parser + + test_content = "From: test@example.com\nSubject: Stdin Test\n\nBody" + args = parser.parse_args(["--stdin"]) + + with patch("sys.stdin", io.StringIO(test_content)): + result = get_parser(args) + assert result is not None + assert result.subject == "Stdin Test" diff --git a/tests/test_received_corpus.py b/tests/test_received_corpus.py new file mode 100644 index 0000000..f313b07 --- /dev/null +++ b/tests/test_received_corpus.py @@ -0,0 +1,307 @@ +#!/usr/bin/env python +""" +Test suite for received header parsing against a real-world header corpus. + +Covers headers from major MTAs: Postfix, Exim, Exchange/Outlook, +Gmail, SendGrid, IBM/Domino, AWS SES, and edge cases. +Validates the tokenizer approach against RFC 5321 §4.4 grammar. +""" + +import unittest + +from mailparser.const import REGXIP, REGXIP6 +from mailparser.utils import parse_received, receiveds_parsing + + +class TestPostfixHeaders(unittest.TestCase): + """Headers from Postfix MTA.""" + + def test_postfix_standard(self): + header = ( + "from mail-out.example.com (mail-out.example.com [203.0.113.10]) " + "by mx.recipient.org (Postfix) with ESMTP id 4F3D21234 " + "for ; Tue, 15 Mar 2024 10:30:00 +0000 (UTC)" + ) + parsed = parse_received(header) + self.assertIn("mail-out.example.com", parsed["from"]) + self.assertIn("mx.recipient.org", parsed["by"]) + self.assertEqual(parsed["with"], "ESMTP") + self.assertEqual(parsed["id"], "4F3D21234") + self.assertEqual(parsed["for"], "") + self.assertIn("Tue, 15 Mar 2024", parsed["date"]) + + def test_postfix_with_tls(self): + header = ( + "from sender.example.com (sender.example.com [198.51.100.5]) " + "by relay.example.net (Postfix) with ESMTPS id ABC123DEF " + "for ; Wed, 10 Jan 2024 08:15:30 -0500" + ) + parsed = parse_received(header) + self.assertEqual(parsed["with"], "ESMTPS") + self.assertIn("sender.example.com", parsed["from"]) + + +class TestEximHeaders(unittest.TestCase): + """Headers from Exim MTA.""" + + def test_exim_standard(self): + header = ( + "from [192.168.1.100] (helo=client.example.com) " + "by smtp.example.org with esmtpa (Exim 4.96) " + "id 1pABCD-00012E-XX " + "for user@example.org; Mon, 5 Feb 2024 14:00:00 +0000" + ) + parsed = parse_received(header) + self.assertIn("192.168.1.100", parsed["from"]) + self.assertEqual(parsed["by"], "smtp.example.org") + self.assertIn("esmtpa", parsed["with"]) + self.assertEqual(parsed["id"], "1pABCD-00012E-XX") + + def test_exim_authenticated(self): + header = ( + "from auth-user " + "by mail.example.com with local (Exim 4.96) " + "id 1pXYZ0-000ABC-De " + "for admin@example.com; Thu, 20 Jun 2024 09:30:00 +0100" + ) + parsed = parse_received(header) + self.assertEqual(parsed["from"], "auth-user") + self.assertIn("local", parsed["with"]) + + +class TestExchangeOutlookHeaders(unittest.TestCase): + """Headers from Exchange/Outlook.""" + + def test_exchange_internal(self): + header = ( + "from DM6PR06MB4475.namprd06.prod.outlook.com (2603:10b6:207:3d::31) " + "by BL0PR06MB4465.namprd06.prod.outlook.com with HTTPS " + "via BL0PR02CA0054.NAMPRD02.PROD.OUTLOOK.COM; " + "Mon, 1 Oct 2024 09:49:22 +0000" + ) + parsed = parse_received(header) + self.assertIn("DM6PR06MB4475", parsed["from"]) + self.assertIn("BL0PR06MB4465", parsed["by"]) + self.assertEqual(parsed["with"], "HTTPS") + self.assertIn("BL0PR02CA0054", parsed["via"]) + + def test_exchange_with_id(self): + header = ( + "from edge.microsoft.com (10.0.0.1) " + "by EXCH01.corp.example.com (172.16.0.10) with Microsoft SMTP Server " + "id 15.1.2507.23; Fri, 22 Nov 2024 16:00:00 -0800" + ) + parsed = parse_received(header) + self.assertIn("edge.microsoft.com", parsed["from"]) + self.assertIn("Microsoft SMTP Server", parsed["with"]) + + +class TestGmailHeaders(unittest.TestCase): + """Headers from Gmail/Google Workspace.""" + + def test_gmail_standard(self): + header = ( + "from mail-wr1-f54.google.com (mail-wr1-f54.google.com. [209.85.221.54]) " + "by mx.google.com with ESMTPS id a1-2023abc.123.2024.01.15.08.30.00 " + "for ; Mon, 15 Jan 2024 08:30:00 -0800 (PST)" + ) + parsed = parse_received(header) + self.assertIn("mail-wr1-f54.google.com", parsed["from"]) + self.assertEqual(parsed["with"], "ESMTPS") + self.assertEqual(parsed["for"], "") + + def test_gmail_with_cipher_annotation(self): + """Ensure 'with cipher' in TLS info doesn't split the 'with' clause.""" + header = ( + "from sender.example.com (sender.example.com [93.184.216.34]) " + "by smtp.gmail.com with ESMTPS id abc123def.456 " + "for ; Tue, 20 Feb 2024 11:00:00 -0800 (PST)" + ) + parsed = parse_received(header) + self.assertEqual(parsed["with"], "ESMTPS") + + +class TestSendGridHeaders(unittest.TestCase): + """Headers from SendGrid.""" + + def test_sendgrid_date_format(self): + header = ( + "from filter.sendgrid.net by filter0123.sendgrid.net " + "with SMTP 2024-03-15 10:30:00.123456789 +0000 UTC m=+12345.678901234" + ) + parsed = parse_received(header) + self.assertIn("date", parsed) + self.assertIn("2024-03-15", parsed["date"]) + + +class TestIBMDominoHeaders(unittest.TestCase): + """Headers from IBM/Domino with non-standard patterns.""" + + def test_ibm_for_from_pattern(self): + """IBM gateway: 'for from ' construct.""" + header = ( + "from localhost " + "by e06smtp10.uk.ibm.com with IBM ESMTP SMTP Gateway: " + "Authorized Use Only! Violators will be prosecuted " + "for from ; " + "Wed, 8 Mar 2017 16:46:25 -0000" + ) + parsed = parse_received(header) + self.assertEqual(parsed["from"].strip(), "localhost") + self.assertEqual(parsed["for"], "") + self.assertIn("IBM ESMTP", parsed["with"]) + + def test_ibm_domino_notes(self): + header = ( + "from localhost " + "by smtp.notes.na.collabserv.com with smtp.notes.na.collabserv.com ESMTP " + "for from ; " + "Wed, 8 Mar 2017 16:46:15 -0000" + ) + parsed = parse_received(header) + self.assertEqual(parsed["from"].strip(), "localhost") + self.assertIn("for", parsed) + + +class TestAWSSESHeaders(unittest.TestCase): + """Headers from AWS SES.""" + + def test_ses_standard(self): + header = ( + "from a]b-123.smtp-out.us-east-1.amazonses.com " + "(a]b-123.smtp-out.us-east-1.amazonses.com [54.240.0.1]) " + "by inbound-smtp.us-east-1.amazonaws.com with SMTP " + "id abc123def456 " + "for user@example.com; Fri, 1 Mar 2024 12:00:00 +0000 (UTC)" + ) + parsed = parse_received(header) + self.assertIn("amazonses.com", parsed["from"]) + self.assertEqual(parsed["with"], "SMTP") + + +class TestEnvelopeHeaders(unittest.TestCase): + """Headers with envelope-from / envelope-sender.""" + + def test_envelope_from_in_parentheses(self): + header = ( + "from host.example.com ([86.187.174.57]:45321 helo=User) " + "by localhost.localdomain (envelope-from ) " + "with ESMTP id ABC123; Mon, 21 Aug 2016 10:49:40 -0000" + ) + parsed = parse_received(header) + self.assertEqual(parsed["envelope_from"], "sender@example.com") + + def test_envelope_sender(self): + header = ( + "from mail.example.com (mail.example.com [10.0.0.1]) " + "by gateway.example.com (envelope-sender ) " + "with ESMTP; Mon, 1 Jan 2024 00:00:00 +0000" + ) + parsed = parse_received(header) + self.assertEqual(parsed["envelope_sender"], "noreply@example.com") + + +class TestEdgeCases(unittest.TestCase): + """Edge cases and unusual headers.""" + + def test_qmail_minimal(self): + """qmail-style headers with only date.""" + header = "(qmail 11769 invoked from network); 22 Aug 2016 14:23:01 -0000" + parsed = parse_received(header) + self.assertIn("date", parsed) + self.assertIn("22 Aug 2016", parsed["date"]) + + def test_no_from_clause(self): + """Header starting with 'by' (internal delivery).""" + header = ( + "by mail.example.com (Postfix) id XYZ789; Mon, 1 Jan 2024 00:00:00 +0000" + ) + parsed = parse_received(header) + self.assertIn("by", parsed) + self.assertIn("date", parsed) + + def test_multiple_spaces_normalization(self): + """Headers with excessive whitespace.""" + header = ( + "from server.example.com (10.0.0.1) " + "by mx.example.org (Postfix) with ESMTP id 2CC378D014 " + "for ; Mon, 22 Aug 2016 14:22:58 +0000 (UTC)" + ) + parsed = parse_received(header) + self.assertIn("from", parsed) + self.assertIn("by", parsed) + self.assertIn("with", parsed) + self.assertIn("id", parsed) + self.assertIn("for", parsed) + + def test_unparseable_header_raises(self): + """Completely invalid headers should raise.""" + from mailparser.exceptions import MailParserReceivedParsingError + + with self.assertRaises(MailParserReceivedParsingError): + parse_received("TotallyInvalidNotAReceivedHeader") + + def test_receiveds_parsing_integration(self): + """Test full pipeline through receiveds_parsing.""" + headers = [ + "from a.example.com by b.example.com with SMTP;" + " Mon, 1 Jan 2024 12:00:00 +0000", + "from c.example.com by d.example.com with ESMTP;" + " Mon, 1 Jan 2024 12:01:00 +0000", + ] + result = receiveds_parsing(headers) + self.assertEqual(len(result), 2) + # Should have hop numbers + self.assertIn("hop", result[0]) + self.assertIn("hop", result[1]) + + def test_date_with_extra_semicolon(self): + """Date field that itself contains a semicolon.""" + header = ( + "from server.example.com by mx.example.org with ESMTP " + "for ; Tue, 7 Mar 2017 14:29:24 -0800" + ) + parsed = parse_received(header) + self.assertIn("date", parsed) + + def test_domain_with_tld_containing_keyword(self): + """Domain names that contain RFC keywords like .by or .id.""" + header = ( + "from mail.company.by (mail.company.by [10.0.0.1]) " + "by relay.example.id with ESMTP; Mon, 1 Jan 2024 00:00:00 +0000" + ) + parsed = parse_received(header) + # Should still parse — 'by' and 'id' in TLDs should not break parsing + self.assertIn("from", parsed) + self.assertIn("date", parsed) + + +class TestIPPatterns(unittest.TestCase): + """Test the improved IP address regex patterns.""" + + def test_valid_ipv4(self): + self.assertTrue(REGXIP.search("192.168.1.1")) + self.assertTrue(REGXIP.search("0.0.0.0")) + self.assertTrue(REGXIP.search("255.255.255.255")) + + def test_invalid_ipv4_rejected(self): + """Old regex matched 999.999.999.999; new one should not.""" + match = REGXIP.fullmatch("999.999.999.999") + self.assertIsNone(match) + + def test_ipv4_boundary(self): + match = REGXIP.fullmatch("256.1.1.1") + self.assertIsNone(match) + + def test_ipv6_full(self): + self.assertTrue(REGXIP6.search("2001:0db8:85a3:0000:0000:8a2e:0370:7334")) + + def test_ipv6_compressed(self): + self.assertTrue(REGXIP6.search("2603:10b6:207:3d::31")) + self.assertTrue(REGXIP6.search("::1")) + self.assertTrue(REGXIP6.search("::")) + + def test_ipv6_in_received_header(self): + """IPv6 addresses commonly appear in received headers.""" + header_fragment = "[2603:10b6:207:3d::31]" + self.assertTrue(REGXIP6.search(header_fragment)) diff --git a/tests/test_utils.py b/tests/test_utils.py index aea76b7..20f0df0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -586,3 +586,48 @@ def test_receiveds_format_delay_no_previous_date(self): self.assertEqual(result[1]["delay"], 0) # But should have a valid date itself self.assertIsNotNone(result[1].get("date_utc")) + + def test_parse_received_envelope_from_with_angle_brackets(self): + """Test utils.py:294-296 — envelope-from clause with angle-bracket match""" + # When envelope-from keyword is present AND its value has angle + # brackets, _ENVELOPE_FROM_RE.search() succeeds and line 296 runs. + received = ( + "from mail.example.com by mx.example.org" + " envelope-from " + "; Mon, 01 Jan 2024 00:00:00 +0000" + ) + result = parse_received(received) + self.assertIsInstance(result, dict) + self.assertEqual(result.get("envelope_from"), "sender@example.com") + + def test_parse_received_envelope_from_no_angle_brackets(self): + """Test utils.py:294-296 — envelope-from clause with no angle-bracket match""" + # When the envelope-from keyword is present but its value has no + # angle brackets, _ENVELOPE_FROM_RE.search() returns None and the + # if-branch (line 295) is skipped, leaving no envelope_from key. + received = ( + "from mail.example.com by mx.example.org" + " envelope-from no-brackets-here" + "; Mon, 01 Jan 2024 00:00:00 +0000" + ) + result = parse_received(received) + # The parser should succeed (other clauses are present) + self.assertIsInstance(result, dict) + # envelope_from must NOT be set because there were no angle brackets + self.assertNotIn("envelope_from", result) + + def test_parse_received_envelope_from_in_clause_no_angle_brackets(self): + """Test utils.py:322->313 — inline envelope-from without angle brackets""" + # Step 3 of parse_received searches clause values for "envelope-from" + # text, then applies _ENVELOPE_FROM_RE. When the keyword appears but + # has no <…>, the inner `if m:` at line 322 is False and the key is + # not added. + received = ( + "from mail.example.com" + " by mx.example.org (envelope-from no-angle-bracket)" + "; Mon, 01 Jan 2024 00:00:00 +0000" + ) + result = parse_received(received) + self.assertIsInstance(result, dict) + # envelope_from must NOT be set + self.assertNotIn("envelope_from", result)