microsoft · mvanhorn · Mar 22, 2026
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -35,6 +35,7 @@
     ImageConverter,
     AudioConverter,
     OutlookMsgConverter,
+    EmlConverter,
     ZipConverter,
     EpubConverter,
     DocumentIntelligenceConverter,
@@ -200,6 +201,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(IpynbConverter())
             self.register_converter(PdfConverter())
             self.register_converter(OutlookMsgConverter())
+            self.register_converter(EmlConverter())
             self.register_converter(EpubConverter())
             self.register_converter(CsvConverter())
 

diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -16,6 +16,7 @@
 from ._image_converter import ImageConverter
 from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
+from ._eml_converter import EmlConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import (
     DocumentIntelligenceConverter,
@@ -40,6 +41,7 @@
     "ImageConverter",
     "AudioConverter",
     "OutlookMsgConverter",
+    "EmlConverter",
     "ZipConverter",
     "DocumentIntelligenceConverter",
     "DocumentIntelligenceFileType",

diff --git a/packages/markitdown/src/markitdown/converters/_eml_converter.py b/packages/markitdown/src/markitdown/converters/_eml_converter.py
@@ -0,0 +1,180 @@
+import email
+import email.message
+import email.policy
+import re
+from html import unescape as _unescape_entities
+from typing import Any, BinaryIO, Iterable, Iterator, Optional, Tuple
+from .._stream_info import StreamInfo
+from .._base_converter import DocumentConverter, DocumentConverterResult
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "message/rfc822",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".eml"]
+
+# Headers rendered at the top of the markdown output, in display order.
+_DISPLAY_HEADERS = ("From", "To", "Cc", "Subject", "Date")
+
+# Elements whose content, not just their tags, must stay out of the output.
+_HIDDEN_HTML_RE = re.compile(
+    r"<(script|style)\b[^>]*>.*?</\1\s*>|<!--.*?-->",
+    re.IGNORECASE | re.DOTALL,
+)
+_LINE_BREAK_RE = re.compile(r"<br\b[^>]*>", re.IGNORECASE)
+_PARAGRAPH_RE = re.compile(r"</?p\b[^>]*>", re.IGNORECASE)
+_ANY_TAG_RE = re.compile(r"<[^>]+>")
+_BLANK_RUN_RE = re.compile(r"\n{3,}")
+
+
+class EmlConverter(DocumentConverter):
+    """Converts EML (RFC 822) email files to markdown by extracting headers and body content.
+
+    Uses Python's built-in email module to parse the message and extract:
+    - Email headers (From, To, Cc, Subject, Date)
+    - Email body content (prefers text/plain, falls back to text/html with tag stripping)
+
+    Attachments and embedded messages are used as a body source only when the
+    message has no inline text part of its own.
+    """
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        """Return True for streams identified as EML by extension or mimetype.
+
+        Only ``stream_info`` is inspected; ``file_stream`` is not read.
+        """
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        """Parse the EML stream and render its headers and body as markdown.
+
+        The markdown opens with an ``# Email Message`` heading, lists the
+        non-empty From/To/Cc/Subject/Date headers, and places the body under
+        ``## Content``. The result title is the Subject, or None if absent.
+        """
+        raw_bytes = file_stream.read()
+        msg = email.message_from_bytes(raw_bytes, policy=email.policy.default)
+
+        # str() turns the policy's header objects into plain strings.
+        headers = {name: str(msg.get(name, "")) for name in _DISPLAY_HEADERS}
+
+        chunks = ["# Email Message\n\n"]
+        chunks.extend(
+            f"**{name}:** {value}\n" for name, value in headers.items() if value
+        )
+        chunks.append("\n## Content\n\n")
+        chunks.append(self._get_body(msg))
+
+        return DocumentConverterResult(
+            markdown="".join(chunks).strip(),
+            title=headers["Subject"] or None,
+        )
+
+    def _get_body(self, msg: email.message.Message) -> str:
+        """Extract the body from the email message.
+
+        Prefers text/plain. Falls back to text/html with HTML tag stripping.
+        Inline parts take priority; attachments and embedded messages are
+        consulted only when the message has no inline text part.
+        """
+        if not msg.is_multipart():
+            body = self._decode_part(msg)
+            if msg.get_content_type() == "text/html":
+                return self._strip_html(body)
+            return body
+
+        # First pass: the message's own inline parts. Second pass: every part,
+        # so a mail whose only text sits in an attachment still yields content.
+        for candidates in (self._inline_parts(msg), msg.walk()):
+            plain_part, html_part = self._pick_text_parts(candidates)
+            if plain_part is not None:
+                return self._decode_part(plain_part)
+            if html_part is not None:
+                return self._strip_html(self._decode_part(html_part))
+
+        return ""
+
+    def _inline_parts(
+        self, part: email.message.Message
+    ) -> Iterator[email.message.Message]:
+        """Yield the leaf parts of ``part`` that belong to the body itself.
+
+        Skips anything marked ``Content-Disposition: attachment`` and does not
+        descend into embedded ``message/*`` parts.
+        """
+        if part.get_content_disposition() == "attachment":
+            return
+        if not part.is_multipart():
+            yield part
+        elif part.get_content_maintype() != "message":
+            for child in part.get_payload():
+                yield from self._inline_parts(child)
+
+    @staticmethod
+    def _pick_text_parts(
+        parts: Iterable[email.message.Message],
+    ) -> Tuple[Optional[email.message.Message], Optional[email.message.Message]]:
+        """Return the first text/plain and first text/html part in ``parts``."""
+        plain_part = None
+        html_part = None
+        for part in parts:
+            content_type = part.get_content_type()
+            if content_type == "text/plain" and plain_part is None:
+                plain_part = part
+            elif content_type == "text/html" and html_part is None:
+                html_part = part
+        return plain_part, html_part
+
+    def _decode_part(self, part: email.message.Message) -> str:
+        """Decode a MIME part's payload to a string.
+
+        Uses the part's declared charset (UTF-8 when none is declared). If that
+        charset is unknown or does not fit the bytes, decodes as UTF-8 and
+        drops undecodable bytes. Returns "" when the part has no payload.
+        """
+        payload = part.get_payload(decode=True)
+        if payload is None:
+            return ""
+        charset = part.get_content_charset() or "utf-8"
+        try:
+            return payload.decode(charset).strip()
+        except (UnicodeDecodeError, LookupError):
+            return payload.decode("utf-8", errors="ignore").strip()
+
+    def _decode_payload(self, msg: email.message.Message) -> str:
+        """Decode a non-multipart message payload to a string."""
+        # Same decoding rules as any other part; kept as a named entry point.
+        return self._decode_part(msg)
+
+    def _strip_html(self, html: str) -> str:
+        """Strip HTML tags to extract plain text.
+
+        Drops script/style blocks and comments entirely, turns ``<br>`` and
+        ``<p>`` into newlines, removes remaining tags, decodes entities, and
+        collapses runs of blank lines.
+        """
+        text = _HIDDEN_HTML_RE.sub("", html)
+        text = _LINE_BREAK_RE.sub("\n", text)
+        text = _PARAGRAPH_RE.sub("\n", text)
+        text = _ANY_TAG_RE.sub("", text)
+        # Decode entities once, after the tags are gone, so escaped markup
+        # such as "&lt;b&gt;" or "&amp;lt;" survives as literal text.
+        text = _unescape_entities(text).replace("\xa0", " ")
+        text = _BLANK_RUN_RE.sub("\n\n", text)
+        return text.strip()
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
@@ -87,6 +87,39 @@ class FileTestVector(object):
         ],
         must_not_include=[],
     ),
+    FileTestVector(
+        filename="test_email.eml",
+        mimetype="message/rfc822",
+        charset="utf-8",
+        url=None,
+        must_include=[
+            "# Email Message",
+            "**From:** test.sender@example.com",
+            "**To:** test.recipient@example.com",
+            "**Cc:** test.cc@example.com",
+            "**Subject:** Test Email Message",
+            "## Content",
+            "This is the body of the test email message",
+        ],
+        must_not_include=[],
+    ),
+    FileTestVector(
+        filename="test_email_html.eml",
+        mimetype="message/rfc822",
+        charset="utf-8",
+        url=None,
+        must_include=[
+            "# Email Message",
+            "**From:** html.sender@example.com",
+            "**Subject:** HTML Test Email",
+            "## Content",
+            "This is the plain text version of the email",
+        ],
+        must_not_include=[
+            "<html>",
+            "<body>",
+        ],
+    ),
     FileTestVector(
         filename="test.pdf",
         mimetype="application/pdf",

diff --git a/packages/markitdown/tests/test_files/test_email.eml b/packages/markitdown/tests/test_files/test_email.eml
@@ -0,0 +1,15 @@
+From: test.sender@example.com
+To: test.recipient@example.com
+Cc: test.cc@example.com
+Subject: Test Email Message
+Date: Sat, 21 Mar 2026 10:00:00 -0700
+MIME-Version: 1.0
+Content-Type: text/plain; charset=utf-8
+Content-Transfer-Encoding: 7bit
+
+This is the body of the test email message.
+
+It has multiple paragraphs with some content for testing.
+
+Best regards,
+Test Sender
diff --git a/packages/markitdown/tests/test_files/test_email_html.eml b/packages/markitdown/tests/test_files/test_email_html.eml
@@ -0,0 +1,18 @@
+From: html.sender@example.com
+To: html.recipient@example.com
+Subject: HTML Test Email
+Date: Sat, 21 Mar 2026 11:00:00 -0700
+MIME-Version: 1.0
+Content-Type: multipart/alternative; boundary="boundary123"
+
+--boundary123
+Content-Type: text/plain; charset=utf-8
+
+This is the plain text version of the email.
+
+--boundary123
+Content-Type: text/html; charset=utf-8
+
+<html><body><p>This is the <b>HTML</b> version of the email.</p></body></html>
+
+--boundary123--