Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions src/mailparser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,36 @@ def get_addresses(raw_header):
that was actually present in the header.

Args:
raw_header (str): raw value of an address header
(e.g. ``From``, ``To``, ``CC`` …)
raw_header (str | email.header.Header | None): raw value of an
address header (e.g. ``From``, ``To``, ``CC`` …). Accepts a
plain ``str``, an ``email.header.Header`` instance (returned
by ``email.message.Message.get`` for headers containing
RFC 2047 encoded-words such as non-ASCII display names), or
``None``.

Returns:
list[tuple[str, str]]: list of ``(display_name, email_addr)`` tuples.
``display_name`` is an empty string when absent.
"""
# ``Message.get(name)`` returns an ``email.header.Header`` for any header
# whose value contains RFC 2047 encoded-words (typical for non-ASCII
# display names like ``=?utf-8?q?=C3=81rp=C3=A1d?=``). ``Header`` does
# not implement string methods such as ``.strip()`` and is not a valid
# input to ``email.utils.getaddresses``.
#
# Important: decode ``Header`` values into a plain parseable string first.
# In practice, strict address parsing can treat raw encoded-word tokens like
# ``=?unknown-8bit?...?=`` as the *address* itself, producing output such as
# ``To: =?unknown-8bit?...?=``. Decoding first gives
# ``Álpám Longsom <recipient@example.com>`` so getaddresses() can split
# name/address correctly.
if raw_header is None:
return []
if isinstance(raw_header, email.header.Header):
raw_header = decode_header_part(raw_header.encode())
elif not isinstance(raw_header, str):
raw_header = str(raw_header)

parsed = email.utils.getaddresses([raw_header], strict=True)

# If every result from the strict parser has an empty address — while the
Expand Down
100 changes: 100 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from mailparser.utils import (
decode_header_part,
find_between,
get_addresses,
msgconvert,
parse_received,
ported_open,
Expand Down Expand Up @@ -600,6 +601,105 @@ def test_ported_string_handles_header_object(self):
self.assertIsInstance(result, str)
self.assertEqual(result, raw_val)

def test_get_addresses_handles_header_object(self):
    """
    Test that get_addresses accepts an email.header.Header instance
    without raising AttributeError on `.strip()`.

    Regression for the case where Message.get(name) returns a Header
    for address headers containing RFC 2047 encoded-words (e.g.
    non-ASCII display names like ``=?utf-8?q?=C3=81lp=C3=A1m_Longsom?=``).
    Before the fix, this raised:

        AttributeError: 'Header' object has no attribute 'strip'
    """
    from email.header import Header

    # Build a two-chunk Header: a UTF-8 display name followed by the
    # ASCII angle-bracket address.
    header_obj = Header("Álpám Longsom", charset="utf-8")
    header_obj.append(" <recipient@example.com>", charset="us-ascii")

    result = get_addresses(header_obj)

    self.assertIsInstance(result, list)
    # The display name is expected already decoded to Unicode here, not
    # in RFC 2047 encoded-word form. Comparing the whole list pins the
    # entry count, the name and the address in one assertion.
    self.assertEqual(result, [("Álpám Longsom", "recipient@example.com")])

def test_get_addresses_handles_unknown_8bit_header_object(self):
    """
    Regression for real-world ``unknown-8bit`` encoded-word headers.

    The encoded-word token must end up as the display name; it must not
    be mistaken for the address itself.
    """
    from email.header import Header

    # Both chunks are appended as us-ascii, in order: the raw
    # encoded-word token first, then the angle-bracket address.
    chunks = (
        "=?unknown-8bit?b?w4FscMOhbSBMb25nc29t?=",
        " <recipient@example.com>",
    )
    header = Header()
    for chunk in chunks:
        header.append(chunk, charset="us-ascii")

    expected = [("Álpám Longsom", "recipient@example.com")]
    self.assertEqual(get_addresses(header), expected)

def test_get_addresses_handles_none(self):
    """
    A ``None`` header value must yield an empty list instead of
    crashing on attribute access.
    """
    addresses = get_addresses(None)
    self.assertEqual(addresses, [])

def test_get_addresses_plain_string_unchanged(self):
    """
    Guard the common case: a plain ``str`` header must still parse into
    a single ``(display_name, address)`` pair now that Header / None
    inputs are also handled.
    """
    expected = [("Plain Name", "plain@example.com")]
    addresses = get_addresses("Plain Name <plain@example.com>")
    self.assertEqual(addresses, expected)

def test_mailparser_from_bytes_preserves_unicode_display_name(self):
    """
    End-to-end regression: a UTF-8 base64 encoded-word display name in
    the ``To`` header must come out of ``MailParser.to`` as decoded
    Unicode, without replacement characters.
    """
    from mailparser.core import MailParser

    # Minimal CRLF-terminated message; only the ``To`` header carries an
    # RFC 2047 encoded-word.
    message_lines = [
        b"From: Sender <sender@example.com>\r\n",
        b"To: =?utf-8?b?w4FscMOhbSBMb25nc29t?= <recipient@example.com>\r\n",
        b"Subject: Test\r\n",
        b"Date: Tue, 12 May 2026 18:00:00 +0000\r\n",
        b"Content-Type: text/plain; charset=utf-8\r\n",
        b"\r\n",
        b"hello\r\n",
    ]
    parsed = MailParser.from_bytes(b"".join(message_lines))

    expected_to = [("Álpám Longsom", "recipient@example.com")]
    self.assertEqual(parsed.to, expected_to)

def test_mailparser_from_bytes_unknown_8bit_display_name(self):
    """
    End-to-end regression: an ``unknown-8bit`` encoded-word in the
    ``To`` header must be exposed on ``MailParser.to`` as a decoded
    display name paired with the real address.
    """
    from mailparser.core import MailParser

    # Minimal CRLF-terminated message; the ``To`` display name is tagged
    # with the ``unknown-8bit`` charset.
    message_lines = [
        b"From: Sender <sender@example.com>\r\n",
        b"To: =?unknown-8bit?b?w4FscMOhbSBMb25nc29t?= <recipient@example.com>\r\n",
        b"Subject: Test\r\n",
        b"Date: Tue, 12 May 2026 18:00:00 +0000\r\n",
        b"Content-Type: text/plain; charset=utf-8\r\n",
        b"\r\n",
        b"hello\r\n",
    ]
    parsed = MailParser.from_bytes(b"".join(message_lines))

    expected_to = [("Álpám Longsom", "recipient@example.com")]
    self.assertEqual(parsed.to, expected_to)

def test_parse_received_envelope_from_with_angle_brackets(self):
"""Test utils.py:294-296 — envelope-from clause with angle-bracket match"""
# When envelope-from keyword is present AND its value has angle
Expand Down