From 88e366ec506748f726c92f69b55b9532a2c161a6 Mon Sep 17 00:00:00 2001 From: Martin Kurtz <70766440+rakurtz@users.noreply.github.com> Date: Tue, 12 May 2026 18:58:54 +0200 Subject: [PATCH 1/3] fix(get_addresses): coerce Header inputs to str before .strip() Message.get() returns email.header.Header for RFC 2047 encoded-word headers (non-ASCII display names). The Header type has no .strip() method and is not a valid input to email.utils.getaddresses, which caused MailParser.from_bytes() to raise AttributeError: 'Header' object has no attribute 'strip' for any mail with a non-ASCII display name in an address header. Normalize input at the top of get_addresses() so callers can pass either a str, a Header, or None. Adds three regression tests in tests/test_utils.py: - get_addresses with an email.header.Header instance - get_addresses with None - get_addresses with a plain str (guards the existing happy path) Co-authored-by: Cursor --- src/mailparser/utils.py | 19 +++++++++++++++++-- tests/test_utils.py | 42 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/src/mailparser/utils.py b/src/mailparser/utils.py index ff4e725..addfca8 100644 --- a/src/mailparser/utils.py +++ b/src/mailparser/utils.py @@ -100,13 +100,28 @@ def get_addresses(raw_header): that was actually present in the header. Args: - raw_header (str): raw value of an address header - (e.g. ``From``, ``To``, ``CC`` …) + raw_header (str | email.header.Header | None): raw value of an + address header (e.g. ``From``, ``To``, ``CC`` …). Accepts a + plain ``str``, an ``email.header.Header`` instance (returned + by ``email.message.Message.get`` for headers containing + RFC 2047 encoded-words such as non-ASCII display names), or + ``None``. Returns: list[tuple[str, str]]: list of ``(display_name, email_addr)`` tuples. ``display_name`` is an empty string when absent. 
""" + # ``Message.get(name)`` returns an ``email.header.Header`` for any header + # whose value contains RFC 2047 encoded-words (typical for non-ASCII + # display names like ``=?utf-8?q?=C3=81rp=C3=A1d?=``). ``Header`` does + # not implement string methods such as ``.strip()`` and is not a valid + # input to ``email.utils.getaddresses``. Normalise once here so callers + # can pass whatever ``Message.get`` returned without having to coerce. + if raw_header is None: + return [] + if not isinstance(raw_header, str): + raw_header = str(raw_header) + parsed = email.utils.getaddresses([raw_header], strict=True) # If every result from the strict parser has an empty address — while the diff --git a/tests/test_utils.py b/tests/test_utils.py index 7665431..ab46a70 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -26,6 +26,7 @@ from mailparser.utils import ( decode_header_part, find_between, + get_addresses, msgconvert, parse_received, ported_open, @@ -600,6 +601,47 @@ def test_ported_string_handles_header_object(self): self.assertIsInstance(result, str) self.assertEqual(result, raw_val) + def test_get_addresses_handles_header_object(self): + """ + Test that get_addresses accepts an email.header.Header instance + without raising AttributeError on `.strip()`. + + Regression for the case where Message.get(name) returns a Header + for address headers containing RFC 2047 encoded-words (e.g. + non-ASCII display names like ``=?utf-8?q?=C3=81lp=C3=A1m_Longsom?=``). 
+ Before the fix, this raised: + + AttributeError: 'Header' object has no attribute 'strip' + """ + from email.header import Header + + raw_val = "=?utf-8?q?=C3=81lp=C3=A1m_Longsom?= " + header_obj = Header(raw_val, charset="utf-8") + result = get_addresses(header_obj) + self.assertIsInstance(result, list) + self.assertEqual(len(result), 1) + display_name, addr = result[0] + self.assertEqual(addr, "recipient@example.com") + # Header.__str__ yields the encoded-word form; the display name + # is preserved (decoding happens in core.py via decode_header_part). + self.assertIn("=?utf-8?q?=C3=81lp=C3=A1m_Longsom?=", display_name) + + def test_get_addresses_handles_none(self): + """ + Test that get_addresses returns an empty list when given None, + rather than crashing on attribute access. + """ + self.assertEqual(get_addresses(None), []) + + def test_get_addresses_plain_string_unchanged(self): + """ + Test that the existing plain-string path still works. This guards + against accidentally regressing the common case while adding + Header / None handling. + """ + result = get_addresses("Plain Name ") + self.assertEqual(result, [("Plain Name", "plain@example.com")]) + def test_parse_received_envelope_from_with_angle_brackets(self): """Test utils.py:294-296 — envelope-from clause with angle-bracket match""" # When envelope-from keyword is present AND its value has angle From b4dd6d1a7fb351d8667b275f7b3e0f80070c9b95 Mon Sep 17 00:00:00 2001 From: Martin Kurtz <70766440+rakurtz@users.noreply.github.com> Date: Tue, 12 May 2026 19:25:34 +0200 Subject: [PATCH 2/3] fix(get_addresses): preserve RFC2047 header encoding for Unicode names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Coercing email.header.Header via str() avoids the AttributeError crash, but can be lossy for some real-world headers and surface replacement characters (�) in rendered output. Use Header.encode() in get_addresses() to preserve RFC 2047 encoded-words. 
This keeps the header parseable by email.utils.getaddresses while allowing core.py to decode display names later via decode_header_part() without introducing replacement chars. Adds a regression test that parses a full message via MailParser.from_bytes and asserts Unicode display names are preserved on mail.to. Co-authored-by: Cursor --- src/mailparser/utils.py | 12 +++++++++--- tests/test_utils.py | 33 ++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/src/mailparser/utils.py b/src/mailparser/utils.py index addfca8..b48b4d4 100644 --- a/src/mailparser/utils.py +++ b/src/mailparser/utils.py @@ -115,11 +115,17 @@ def get_addresses(raw_header): # whose value contains RFC 2047 encoded-words (typical for non-ASCII # display names like ``=?utf-8?q?=C3=81rp=C3=A1d?=``). ``Header`` does # not implement string methods such as ``.strip()`` and is not a valid - # input to ``email.utils.getaddresses``. Normalise once here so callers - # can pass whatever ``Message.get`` returned without having to coerce. + # input to ``email.utils.getaddresses``. + # + # Important: use ``Header.encode()`` (not ``str(Header)``) to preserve the + # original RFC 2047 encoded-word form. ``core.py`` decodes display names + # later via ``decode_header_part()``; keeping encoded-words intact here + # avoids lossy intermediate conversion that can introduce replacement chars. 
if raw_header is None: return [] - if not isinstance(raw_header, str): + if isinstance(raw_header, email.header.Header): + raw_header = raw_header.encode() + elif not isinstance(raw_header, str): raw_header = str(raw_header) parsed = email.utils.getaddresses([raw_header], strict=True) diff --git a/tests/test_utils.py b/tests/test_utils.py index ab46a70..8bb768c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -615,16 +615,16 @@ def test_get_addresses_handles_header_object(self): """ from email.header import Header - raw_val = "=?utf-8?q?=C3=81lp=C3=A1m_Longsom?= " - header_obj = Header(raw_val, charset="utf-8") + header_obj = Header("Álpám Longsom", charset="utf-8") + header_obj.append(" ", charset="us-ascii") result = get_addresses(header_obj) self.assertIsInstance(result, list) self.assertEqual(len(result), 1) display_name, addr = result[0] self.assertEqual(addr, "recipient@example.com") - # Header.__str__ yields the encoded-word form; the display name - # is preserved (decoding happens in core.py via decode_header_part). - self.assertIn("=?utf-8?q?=C3=81lp=C3=A1m_Longsom?=", display_name) + # get_addresses returns encoded-word form for the display name. + # Decoding to Unicode happens in core.py via decode_header_part. + self.assertIn("=?utf-8?b?", display_name) def test_get_addresses_handles_none(self): """ @@ -642,6 +642,29 @@ def test_get_addresses_plain_string_unchanged(self): result = get_addresses("Plain Name ") self.assertEqual(result, [("Plain Name", "plain@example.com")]) + def test_mailparser_from_bytes_preserves_unicode_display_name(self): + """ + Regression: Header objects from Message.get(name) must round-trip + through get_addresses() without introducing replacement characters. + + The parser should expose the decoded Unicode display name on + MailParser.to. 
+ """ + from mailparser.core import MailParser + + raw_email = ( + b"From: Sender \r\n" + b"To: =?utf-8?b?w4FscMOhbSBMb25nc29t?= <recipient@example.com>\r\n" + b"Subject: Test\r\n" + b"Date: Tue, 12 May 2026 18:00:00 +0000\r\n" + b"Content-Type: text/plain; charset=utf-8\r\n" + b"\r\n" + b"hello\r\n" + ) + + mail = MailParser.from_bytes(raw_email) + self.assertEqual(mail.to, [("Álpám Longsom", "recipient@example.com")]) + def test_parse_received_envelope_from_with_angle_brackets(self): """Test utils.py:294-296 — envelope-from clause with angle-bracket match""" # When envelope-from keyword is present AND its value has angle From b81a2cd2dbe49265d79ca7b054a78d4f52bd1c5e Mon Sep 17 00:00:00 2001 From: Martin Kurtz <70766440+rakurtz@users.noreply.github.com> Date: Tue, 12 May 2026 19:30:18 +0200 Subject: [PATCH 3/3] fix(get_addresses): decode Header values before strict address parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some real-world mails produce To/From Header objects that render as encoded-word tokens like =?unknown-8bit?...?=. Passing those directly to email.utils.getaddresses(strict=True) can misclassify the token itself as the address, leaking encoded text into downstream output (e.g. PDF To: line). Decode Header values first via decode_header_part(raw_header.encode()) so getaddresses operates on a parseable Unicode string (e.g. "Álpám Longsom <recipient@example.com>"). 
Adds regression tests for: - get_addresses with unknown-8bit Header objects - MailParser.from_bytes end-to-end unknown-8bit display-name decoding Co-authored-by: Cursor --- src/mailparser/utils.py | 12 +++++++----- tests/test_utils.py | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/src/mailparser/utils.py b/src/mailparser/utils.py index b48b4d4..bd4dfc1 100644 --- a/src/mailparser/utils.py +++ b/src/mailparser/utils.py @@ -117,14 +117,16 @@ def get_addresses(raw_header): # not implement string methods such as ``.strip()`` and is not a valid # input to ``email.utils.getaddresses``. # - # Important: use ``Header.encode()`` (not ``str(Header)``) to preserve the - # original RFC 2047 encoded-word form. ``core.py`` decodes display names - # later via ``decode_header_part()``; keeping encoded-words intact here - # avoids lossy intermediate conversion that can introduce replacement chars. + # Important: decode ``Header`` values into a plain parseable string first. + # In practice, strict address parsing can treat raw encoded-word tokens like + # ``=?unknown-8bit?...?=`` as the *address* itself, producing output such as + # ``To: =?unknown-8bit?...?=``. Decoding first gives + # ``Álpám Longsom <recipient@example.com>`` so getaddresses() can split + # name/address correctly. if raw_header is None: return [] if isinstance(raw_header, email.header.Header): - raw_header = raw_header.encode() + raw_header = decode_header_part(raw_header.encode()) elif not isinstance(raw_header, str): raw_header = str(raw_header) diff --git a/tests/test_utils.py b/tests/test_utils.py index 8bb768c..5fc46c3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -624,7 +624,23 @@ def test_get_addresses_handles_header_object(self): self.assertEqual(addr, "recipient@example.com") # get_addresses returns encoded-word form for the display name. # Decoding to Unicode happens in core.py via decode_header_part. 
- self.assertIn("=?utf-8?b?", display_name) + self.assertEqual(display_name, "Álpám Longsom") + + def test_get_addresses_handles_unknown_8bit_header_object(self): + """ + Regression for real-world unknown-8bit encoded-word headers. + Address parsing must not treat the encoded-word token itself as + the address. + """ + from email.header import Header + + encoded_name = "=?unknown-8bit?b?w4FscMOhbSBMb25nc29t?=" + header_obj = Header() + header_obj.append(encoded_name, charset="us-ascii") + header_obj.append(" <recipient@example.com>", charset="us-ascii") + + result = get_addresses(header_obj) + self.assertEqual(result, [("Álpám Longsom", "recipient@example.com")]) def test_get_addresses_handles_none(self): """ @@ -665,6 +681,25 @@ def test_mailparser_from_bytes_preserves_unicode_display_name(self): mail = MailParser.from_bytes(raw_email) self.assertEqual(mail.to, [("Álpám Longsom", "recipient@example.com")]) + def test_mailparser_from_bytes_unknown_8bit_display_name(self): + """ + End-to-end regression for unknown-8bit encoded-word in To header. + """ + from mailparser.core import MailParser + + raw_email = ( + b"From: Sender \r\n" + b"To: =?unknown-8bit?b?w4FscMOhbSBMb25nc29t?= <recipient@example.com>\r\n" + b"Subject: Test\r\n" + b"Date: Tue, 12 May 2026 18:00:00 +0000\r\n" + b"Content-Type: text/plain; charset=utf-8\r\n" + b"\r\n" + b"hello\r\n" + ) + + mail = MailParser.from_bytes(raw_email) + self.assertEqual(mail.to, [("Álpám Longsom", "recipient@example.com")]) + def test_parse_received_envelope_from_with_angle_brackets(self): """Test utils.py:294-296 — envelope-from clause with angle-bracket match""" # When envelope-from keyword is present AND its value has angle