diff --git a/src/mailparser/utils.py b/src/mailparser/utils.py
index ff4e725..bd4dfc1 100644
--- a/src/mailparser/utils.py
+++ b/src/mailparser/utils.py
@@ -100,13 +100,36 @@ def get_addresses(raw_header):
     that was actually present in the header.
 
     Args:
-        raw_header (str): raw value of an address header
-            (e.g. ``From``, ``To``, ``CC`` …)
+        raw_header (str | email.header.Header | None): raw value of an
+            address header (e.g. ``From``, ``To``, ``CC`` …). Accepts a
+            plain ``str``, an ``email.header.Header`` instance (returned
+            by ``email.message.Message.get`` for headers containing
+            RFC 2047 encoded-words such as non-ASCII display names), or
+            ``None``.
 
     Returns:
         list[tuple[str, str]]: list of ``(display_name, email_addr)``
             tuples. ``display_name`` is an empty string when absent.
     """
+    # ``Message.get(name)`` returns an ``email.header.Header`` for any header
+    # whose value contains RFC 2047 encoded-words (typical for non-ASCII
+    # display names like ``=?utf-8?q?=C3=81rp=C3=A1d?=``). ``Header`` does
+    # not implement string methods such as ``.strip()`` and is not a valid
+    # input to ``email.utils.getaddresses``.
+    #
+    # Important: decode ``Header`` values into a plain parseable string first.
+    # In practice, strict address parsing can treat raw encoded-word tokens like
+    # ``=?unknown-8bit?...?=`` as the *address* itself, producing output such as
+    # ``To: =?unknown-8bit?...?=``. Decoding first gives
+    # ``Álpám Longsom <recipient@example.com>`` so getaddresses() can split
+    # name/address correctly.
+    if raw_header is None:
+        return []
+    if isinstance(raw_header, email.header.Header):
+        raw_header = decode_header_part(raw_header.encode())
+    elif not isinstance(raw_header, str):
+        raw_header = str(raw_header)
+
     parsed = email.utils.getaddresses([raw_header], strict=True)
 
     # If every result from the strict parser has an empty address — while the
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 7665431..5fc46c3 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -26,6 +26,7 @@
 from mailparser.utils import (
     decode_header_part,
     find_between,
+    get_addresses,
     msgconvert,
     parse_received,
     ported_open,
@@ -600,6 +601,105 @@ def test_ported_string_handles_header_object(self):
         self.assertIsInstance(result, str)
         self.assertEqual(result, raw_val)
 
+    def test_get_addresses_handles_header_object(self):
+        """
+        Test that get_addresses accepts an email.header.Header instance
+        without raising AttributeError on `.strip()`.
+
+        Regression for the case where Message.get(name) returns a Header
+        for address headers containing RFC 2047 encoded-words (e.g.
+        non-ASCII display names like ``=?utf-8?q?=C3=81lp=C3=A1m_Longsom?=``).
+        Before the fix, this raised:
+
+            AttributeError: 'Header' object has no attribute 'strip'
+        """
+        from email.header import Header
+
+        header_obj = Header("Álpám Longsom", charset="utf-8")
+        header_obj.append(" <recipient@example.com>", charset="us-ascii")
+        result = get_addresses(header_obj)
+        self.assertIsInstance(result, list)
+        self.assertEqual(len(result), 1)
+        display_name, addr = result[0]
+        self.assertEqual(addr, "recipient@example.com")
+        # get_addresses decodes a Header before parsing, so the display
+        # name is returned as Unicode, not in RFC 2047 encoded-word form.
+        self.assertEqual(display_name, "Álpám Longsom")
+
+    def test_get_addresses_handles_unknown_8bit_header_object(self):
+        """
+        Regression for real-world unknown-8bit encoded-word headers.
+        Address parsing must not treat the encoded-word token itself as
+        the address.
+        """
+        from email.header import Header
+
+        encoded_name = "=?unknown-8bit?b?w4FscMOhbSBMb25nc29t?="
+        header_obj = Header()
+        header_obj.append(encoded_name, charset="us-ascii")
+        header_obj.append(" <recipient@example.com>", charset="us-ascii")
+
+        result = get_addresses(header_obj)
+        self.assertEqual(result, [("Álpám Longsom", "recipient@example.com")])
+
+    def test_get_addresses_handles_none(self):
+        """
+        Test that get_addresses returns an empty list when given None,
+        rather than crashing on attribute access.
+        """
+        self.assertEqual(get_addresses(None), [])
+
+    def test_get_addresses_plain_string_unchanged(self):
+        """
+        Test that the existing plain-string path still works. This guards
+        against accidentally regressing the common case while adding
+        Header / None handling.
+        """
+        result = get_addresses("Plain Name <plain@example.com>")
+        self.assertEqual(result, [("Plain Name", "plain@example.com")])
+
+    def test_mailparser_from_bytes_preserves_unicode_display_name(self):
+        """
+        Regression: Header objects from Message.get(name) must round-trip
+        through get_addresses() without introducing replacement characters.
+
+        The parser should expose the decoded Unicode display name on
+        MailParser.to.
+        """
+        from mailparser.core import MailParser
+
+        raw_email = (
+            b"From: Sender <sender@example.com>\r\n"
+            b"To: =?utf-8?b?w4FscMOhbSBMb25nc29t?= <recipient@example.com>\r\n"
+            b"Subject: Test\r\n"
+            b"Date: Tue, 12 May 2026 18:00:00 +0000\r\n"
+            b"Content-Type: text/plain; charset=utf-8\r\n"
+            b"\r\n"
+            b"hello\r\n"
+        )
+
+        mail = MailParser.from_bytes(raw_email)
+        self.assertEqual(mail.to, [("Álpám Longsom", "recipient@example.com")])
+
+    def test_mailparser_from_bytes_unknown_8bit_display_name(self):
+        """
+        End-to-end regression for unknown-8bit encoded-word in To header.
+        """
+        from mailparser.core import MailParser
+
+        raw_email = (
+            b"From: Sender <sender@example.com>\r\n"
+            b"To: =?unknown-8bit?b?w4FscMOhbSBMb25nc29t?= <recipient@example.com>\r\n"
+            b"Subject: Test\r\n"
+            b"Date: Tue, 12 May 2026 18:00:00 +0000\r\n"
+            b"Content-Type: text/plain; charset=utf-8\r\n"
+            b"\r\n"
+            b"hello\r\n"
+        )
+
+        mail = MailParser.from_bytes(raw_email)
+        self.assertEqual(mail.to, [("Álpám Longsom", "recipient@example.com")])
+
     def test_parse_received_envelope_from_with_angle_brackets(self):
         """Test utils.py:294-296 — envelope-from clause with angle-bracket match"""
         # When envelope-from keyword is present AND its value has angle