diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f342a614b..f1c1256b3 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -35,6 +35,7 @@
ImageConverter,
AudioConverter,
OutlookMsgConverter,
+ EmlConverter,
ZipConverter,
EpubConverter,
DocumentIntelligenceConverter,
@@ -200,6 +201,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
+ self.register_converter(EmlConverter())
self.register_converter(EpubConverter())
self.register_converter(CsvConverter())
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index e4437a582..4d95daf4f 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -16,6 +16,7 @@
from ._image_converter import ImageConverter
from ._audio_converter import AudioConverter
from ._outlook_msg_converter import OutlookMsgConverter
+from ._eml_converter import EmlConverter
from ._zip_converter import ZipConverter
from ._doc_intel_converter import (
DocumentIntelligenceConverter,
@@ -40,6 +41,7 @@
"ImageConverter",
"AudioConverter",
"OutlookMsgConverter",
+ "EmlConverter",
"ZipConverter",
"DocumentIntelligenceConverter",
"DocumentIntelligenceFileType",
diff --git a/packages/markitdown/src/markitdown/converters/_eml_converter.py b/packages/markitdown/src/markitdown/converters/_eml_converter.py
new file mode 100644
index 000000000..57c15e3bf
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converters/_eml_converter.py
@@ -0,0 +1,135 @@
+import email
+import email.policy
+import re
+from typing import Any, BinaryIO
+from .._stream_info import StreamInfo
+from .._base_converter import DocumentConverter, DocumentConverterResult
+
+# MIME type prefixes that identify an RFC 822 email message. accepts() matches
+# them with str.startswith() against the lower-cased stream MIME type.
+ACCEPTED_MIME_TYPE_PREFIXES = [
+ "message/rfc822",
+]
+
+# File extensions (lower-case, including the leading dot) that accepts()
+# treats as EML files regardless of the reported MIME type.
+ACCEPTED_FILE_EXTENSIONS = [".eml"]
+
+
+class EmlConverter(DocumentConverter):
+    """Converts EML (RFC 822) email files to markdown by extracting headers and body content.
+
+    Uses Python's built-in email module to parse the message and extract:
+    - Email headers (From, To, Cc, Subject, Date)
+    - Email body content (prefers text/plain, falls back to text/html with tag stripping)
+
+    MIME parts marked ``Content-Disposition: attachment`` are never used as the body.
+    """
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        """Return True when the stream looks like an EML message.
+
+        The stream is accepted when its extension is listed in
+        ACCEPTED_FILE_EXTENSIONS, or when its MIME type starts with one of
+        ACCEPTED_MIME_TYPE_PREFIXES. Both values are compared lower-cased and a
+        missing value is treated as the empty string. ``file_stream`` is not read.
+        """
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        return any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        """Parse the EML stream and render its headers and body as markdown.
+
+        The whole stream is read into memory and parsed with
+        ``email.policy.default``. The output is a ``# Email Message`` heading,
+        one ``**Header:** value`` line per non-empty header, a ``## Content``
+        heading and the extracted body. The result title is the Subject header,
+        or None when the message has no subject.
+        """
+        msg = email.message_from_bytes(file_stream.read(), policy=email.policy.default)
+
+        # Coerce header objects to plain str so the result carries no
+        # email-package types; the dict order defines the output order.
+        headers = {
+            name: str(msg.get(name, ""))
+            for name in ("From", "To", "Cc", "Subject", "Date")
+        }
+
+        pieces = ["# Email Message\n\n"]
+        pieces.extend(f"**{key}:** {value}\n" for key, value in headers.items() if value)
+        pieces.append("\n## Content\n\n")
+        pieces.append(self._get_body(msg))
+
+        return DocumentConverterResult(
+            markdown="".join(pieces).strip(),
+            title=headers["Subject"] or None,
+        )
+
+    def _get_body(self, msg: email.message.Message) -> str:
+        """Extract the body text from the email message.
+
+        For a multipart message the first text/plain part wins; if there is
+        none, the first text/html part is used with its markup stripped. Parts
+        whose Content-Disposition is ``attachment`` are skipped so that an
+        attached text or HTML file is not mistaken for the body. A
+        non-multipart message is decoded directly and stripped of markup when
+        its content type is text/html.
+
+        Returns an empty string when no usable text part exists.
+        """
+        if not msg.is_multipart():
+            body = self._decode_part(msg)
+            if msg.get_content_type() == "text/html":
+                return self._strip_html(body)
+            return body
+
+        html_part = None
+        for part in msg.walk():
+            if part.get_content_disposition() == "attachment":
+                continue
+            content_type = part.get_content_type()
+            if content_type == "text/plain":
+                # Plain text is always preferred, so the search can stop here.
+                return self._decode_part(part)
+            if content_type == "text/html" and html_part is None:
+                html_part = part
+
+        if html_part is not None:
+            return self._strip_html(self._decode_part(html_part))
+        return ""
+
+    def _decode_part(self, part: email.message.Message) -> str:
+        """Decode a MIME part's payload to a stripped string.
+
+        The transfer encoding is undone by ``get_payload(decode=True)``. The
+        bytes are then decoded with the part's declared charset (UTF-8 when
+        none is declared). An unknown charset or undecodable bytes fall back to
+        UTF-8 with invalid sequences dropped. Returns an empty string when the
+        part has no payload.
+        """
+        payload = part.get_payload(decode=True)
+        if payload is None:
+            return ""
+        charset = part.get_content_charset() or "utf-8"
+        try:
+            return payload.decode(charset).strip()
+        except (UnicodeDecodeError, LookupError):
+            return payload.decode("utf-8", errors="ignore").strip()
+
+    def _decode_payload(self, msg: email.message.Message) -> str:
+        """Decode a non-multipart message payload to a string.
+
+        Kept as a separate name for compatibility; the decoding rules are
+        exactly those of ``_decode_part``.
+        """
+        return self._decode_part(msg)
+
+    def _strip_html(self, html: str) -> str:
+        """Strip HTML tags to extract plain text.
+
+        ``<script>`` and ``<style>`` elements are removed together with their
+        contents, ``<br>`` and ``<p>`` tags become newlines, and every other
+        tag is dropped. Character references are decoded only after the tags
+        are gone, so escaped markup such as ``&lt;b&gt;`` survives as literal
+        text. Non-breaking spaces become ordinary spaces and runs of three or
+        more newlines collapse to a single blank line.
+        """
+        # Imported locally by name because the parameter shadows the module.
+        from html import unescape
+
+        text = re.sub(
+            r"<(script|style)\b[^>]*>.*?</\1\s*>",
+            "",
+            html,
+            flags=re.IGNORECASE | re.DOTALL,
+        )
+        text = re.sub(r"<br\b[^>]*>", "\n", text, flags=re.IGNORECASE)
+        text = re.sub(r"</?p\b[^>]*>", "\n", text, flags=re.IGNORECASE)
+        text = re.sub(r"<[^>]+>", "", text)
+        # A single unescape pass avoids double-decoding input like "&amp;lt;".
+        text = unescape(text).replace("\xa0", " ")
+        text = re.sub(r"\n{3,}", "\n\n", text)
+        return text.strip()
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 74fa9bd0a..247d7f90c 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -87,6 +87,39 @@ class FileTestVector(object):
],
must_not_include=[],
),
+ FileTestVector(
+ filename="test_email.eml",
+ mimetype="message/rfc822",
+ charset="utf-8",
+ url=None,
+ must_include=[
+ "# Email Message",
+ "**From:** test.sender@example.com",
+ "**To:** test.recipient@example.com",
+ "**Cc:** test.cc@example.com",
+ "**Subject:** Test Email Message",
+ "## Content",
+ "This is the body of the test email message",
+ ],
+ must_not_include=[],
+ ),
+ FileTestVector(
+ filename="test_email_html.eml",
+ mimetype="message/rfc822",
+ charset="utf-8",
+ url=None,
+ must_include=[
+ "# Email Message",
+ "**From:** html.sender@example.com",
+ "**Subject:** HTML Test Email",
+ "## Content",
+ "This is the plain text version of the email",
+ ],
+ must_not_include=[
+ "",
+ "
This is the HTML version of the email.
+ +--boundary123--