Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
ImageConverter,
AudioConverter,
OutlookMsgConverter,
EmlConverter,
ZipConverter,
EpubConverter,
DocumentIntelligenceConverter,
Expand Down Expand Up @@ -200,6 +201,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
self.register_converter(EmlConverter())
self.register_converter(EpubConverter())
self.register_converter(CsvConverter())

Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from ._image_converter import ImageConverter
from ._audio_converter import AudioConverter
from ._outlook_msg_converter import OutlookMsgConverter
from ._eml_converter import EmlConverter
from ._zip_converter import ZipConverter
from ._doc_intel_converter import (
DocumentIntelligenceConverter,
Expand All @@ -40,6 +41,7 @@
"ImageConverter",
"AudioConverter",
"OutlookMsgConverter",
"EmlConverter",
"ZipConverter",
"DocumentIntelligenceConverter",
"DocumentIntelligenceFileType",
Expand Down
135 changes: 135 additions & 0 deletions packages/markitdown/src/markitdown/converters/_eml_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import email
import email.policy
import re
from typing import Any, BinaryIO
from .._stream_info import StreamInfo
from .._base_converter import DocumentConverter, DocumentConverterResult

# MIME type prefixes handled by EmlConverter. EmlConverter.accepts lower-cases
# the stream's MIME type and checks it with str.startswith against each entry.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "message/rfc822",
]

# File extensions (with leading dot) handled by EmlConverter.
# EmlConverter.accepts lower-cases the stream's extension before the lookup.
ACCEPTED_FILE_EXTENSIONS = [".eml"]


class EmlConverter(DocumentConverter):
    """Converts EML (RFC 822) email files to markdown.

    Uses Python's built-in ``email`` module to parse the message and extract:
    - Email headers (From, To, Cc, Subject, Date)
    - Email body content (prefers text/plain, falls back to text/html with
      tag stripping); parts marked as attachments are never used as the body.
    """

    # Headers rendered at the top of the markdown output, in display order.
    _HEADER_NAMES = ("From", "To", "Cc", "Subject", "Date")

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True when the stream looks like an EML message.

        A stream is accepted when its extension is listed in
        ``ACCEPTED_FILE_EXTENSIONS`` or its MIME type starts with one of
        ``ACCEPTED_MIME_TYPE_PREFIXES``. Both comparisons are
        case-insensitive; ``file_stream`` itself is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        # str.startswith accepts a tuple, so one call covers every prefix.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Parse the whole stream as an email and render it as markdown.

        The output is a ``# Email Message`` heading, one ``**Name:** value``
        line per non-empty header, a ``## Content`` heading, and the body
        text. The result's title is the Subject header, or None when the
        message has no (or an empty) subject.
        """
        raw_bytes = file_stream.read()
        msg = email.message_from_bytes(raw_bytes, policy=email.policy.default)

        # str() turns the policy's header objects into plain strings so the
        # result does not carry email-package types around.
        headers = {name: str(msg.get(name, "")) for name in self._HEADER_NAMES}

        chunks = ["# Email Message\n\n"]
        chunks.extend(
            f"**{name}:** {value}\n" for name, value in headers.items() if value
        )
        chunks.append("\n## Content\n\n")
        chunks.append(self._get_body(msg))

        return DocumentConverterResult(
            markdown="".join(chunks).strip(),
            title=headers["Subject"] or None,
        )

    def _get_body(self, msg: email.message.Message) -> str:
        """Extract the body text from the email message.

        Prefers text/plain. Falls back to text/html (with tags stripped) when
        there is no text/plain part or it decodes to an empty string.
        Returns "" when no usable text part exists.
        """
        if not msg.is_multipart():
            # A lone non-text payload (e.g. a bare PDF) is not a readable
            # body; decoding it as text would only produce garbage.
            if msg.get_content_maintype() != "text":
                return ""
            text = self._decode_part(msg)
            if msg.get_content_type() == "text/html":
                return self._strip_html(text)
            return text

        plain_part = None
        html_part = None
        for part in self._iter_inline_leaves(msg):
            content_type = part.get_content_type()
            if content_type == "text/plain" and plain_part is None:
                plain_part = part
            elif content_type == "text/html" and html_part is None:
                html_part = part

        if plain_part is not None:
            text = self._decode_part(plain_part)
            if text:
                return text
        if html_part is not None:
            return self._strip_html(self._decode_part(html_part))
        return ""

    def _iter_inline_leaves(self, container: email.message.Message):
        """Yield the non-multipart parts below ``container``, depth-first.

        Sub-parts whose Content-Disposition is ``attachment`` are skipped
        together with everything nested inside them, so an attached text
        file or forwarded message is never mistaken for the body.
        """
        for child in container.get_payload():
            if child.get_content_disposition() == "attachment":
                continue
            if child.is_multipart():
                yield from self._iter_inline_leaves(child)
            else:
                yield child

    def _decode_part(self, part: email.message.Message) -> str:
        """Decode a MIME part's payload to a stripped string.

        The transfer encoding is undone by ``get_payload(decode=True)``; the
        bytes are then decoded with the part's declared charset (UTF-8 when
        none is declared). If the charset is unknown or the bytes do not
        match it, the payload is decoded as UTF-8 with undecodable bytes
        dropped. Returns "" when the part has no payload.
        """
        payload = part.get_payload(decode=True)
        if payload is None:
            return ""
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset).strip()
        except (UnicodeDecodeError, LookupError):
            # Best effort: keep whatever is readable rather than failing.
            return payload.decode("utf-8", errors="ignore").strip()

    def _decode_payload(self, msg: email.message.Message) -> str:
        """Decode a non-multipart message payload to a string.

        Kept for backward compatibility; identical to ``_decode_part``.
        """
        return self._decode_part(msg)

    def _strip_html(self, html: str) -> str:
        """Strip HTML tags to extract plain text.

        ``<script>``/``<style>`` blocks and comments are removed with their
        contents, ``<br>`` and ``<p>`` tags become newlines, every other tag
        is dropped, character references are decoded exactly once, and runs
        of three or more newlines collapse to a blank line.
        """
        # Imported by name: the ``html`` parameter shadows the module here.
        from html import unescape

        text = re.sub(
            r"<(script|style)\b[^>]*>.*?</\1\s*>",
            "",
            html,
            flags=re.IGNORECASE | re.DOTALL,
        )
        text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
        text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
        # \b keeps <pre>/<param> out; [^>]* lets <p class="..."> match.
        text = re.sub(r"</?p\b[^>]*>", "\n", text, flags=re.IGNORECASE)
        text = re.sub(r"<[^>]+>", "", text)
        # Decode entities only after tags are gone so an escaped "&lt;b&gt;"
        # survives as literal text, and in a single pass so "&amp;lt;"
        # becomes "&lt;" rather than "<".
        text = unescape(text).replace("\xa0", " ")
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
33 changes: 33 additions & 0 deletions packages/markitdown/tests/_test_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,39 @@ class FileTestVector(object):
],
must_not_include=[],
),
FileTestVector(
filename="test_email.eml",
mimetype="message/rfc822",
charset="utf-8",
url=None,
must_include=[
"# Email Message",
"**From:** test.sender@example.com",
"**To:** test.recipient@example.com",
"**Cc:** test.cc@example.com",
"**Subject:** Test Email Message",
"## Content",
"This is the body of the test email message",
],
must_not_include=[],
),
FileTestVector(
filename="test_email_html.eml",
mimetype="message/rfc822",
charset="utf-8",
url=None,
must_include=[
"# Email Message",
"**From:** html.sender@example.com",
"**Subject:** HTML Test Email",
"## Content",
"This is the plain text version of the email",
],
must_not_include=[
"<html>",
"<body>",
],
),
FileTestVector(
filename="test.pdf",
mimetype="application/pdf",
Expand Down
15 changes: 15 additions & 0 deletions packages/markitdown/tests/test_files/test_email.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
From: test.sender@example.com
To: test.recipient@example.com
Cc: test.cc@example.com
Subject: Test Email Message
Date: Fri, 21 Mar 2026 10:00:00 -0700
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 7bit

This is the body of the test email message.

It has multiple paragraphs with some content for testing.

Best regards,
Test Sender
18 changes: 18 additions & 0 deletions packages/markitdown/tests/test_files/test_email_html.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
From: html.sender@example.com
To: html.recipient@example.com
Subject: HTML Test Email
Date: Fri, 21 Mar 2026 11:00:00 -0700
MIME-Version: 1.0
Content-Type: multipart/alternative; boundary="boundary123"

--boundary123
Content-Type: text/plain; charset=utf-8

This is the plain text version of the email.

--boundary123
Content-Type: text/html; charset=utf-8

<html><body><p>This is the <b>HTML</b> version of the email.</p></body></html>

--boundary123--