diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f342a614b..f1c1256b3 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -35,6 +35,7 @@ ImageConverter, AudioConverter, OutlookMsgConverter, + EmlConverter, ZipConverter, EpubConverter, DocumentIntelligenceConverter, @@ -200,6 +201,7 @@ def enable_builtins(self, **kwargs) -> None: self.register_converter(IpynbConverter()) self.register_converter(PdfConverter()) self.register_converter(OutlookMsgConverter()) + self.register_converter(EmlConverter()) self.register_converter(EpubConverter()) self.register_converter(CsvConverter()) diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index e4437a582..4d95daf4f 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -16,6 +16,7 @@ from ._image_converter import ImageConverter from ._audio_converter import AudioConverter from ._outlook_msg_converter import OutlookMsgConverter +from ._eml_converter import EmlConverter from ._zip_converter import ZipConverter from ._doc_intel_converter import ( DocumentIntelligenceConverter, @@ -40,6 +41,7 @@ "ImageConverter", "AudioConverter", "OutlookMsgConverter", + "EmlConverter", "ZipConverter", "DocumentIntelligenceConverter", "DocumentIntelligenceFileType", diff --git a/packages/markitdown/src/markitdown/converters/_eml_converter.py b/packages/markitdown/src/markitdown/converters/_eml_converter.py new file mode 100644 index 000000000..57c15e3bf --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_eml_converter.py @@ -0,0 +1,135 @@ +import email +import email.policy +import re +from typing import Any, BinaryIO +from .._stream_info import StreamInfo +from .._base_converter import DocumentConverter, DocumentConverterResult + 
ACCEPTED_MIME_TYPE_PREFIXES = [
    "message/rfc822",
]

ACCEPTED_FILE_EXTENSIONS = [".eml"]

# Headers rendered at the top of the markdown output, in display order.
_DISPLAYED_HEADERS = ("From", "To", "Cc", "Subject", "Date")

# Patterns used by EmlConverter._strip_html, compiled once at import time.
# <script>/<style> elements are removed together with their content, because
# that content is code/CSS rather than readable text.
_NON_TEXT_ELEMENT_RE = re.compile(
    r"<(script|style)\b[^>]*>.*?</\1\s*>", re.IGNORECASE | re.DOTALL
)
_LINE_BREAK_RE = re.compile(r"<br\s*/?>", re.IGNORECASE)
_PARAGRAPH_END_RE = re.compile(r"</p\s*>", re.IGNORECASE)
_ANY_TAG_RE = re.compile(r"<[^>]+>")
_EXTRA_NEWLINES_RE = re.compile(r"\n{3,}")


class EmlConverter(DocumentConverter):
    """Convert EML (RFC 822) email files to markdown.

    The message is parsed with Python's built-in ``email`` package and
    rendered as:

    - a ``# Email Message`` heading,
    - one ``**Name:** value`` line per non-empty header among
      From, To, Cc, Subject and Date,
    - a ``## Content`` section holding the body (``text/plain`` preferred,
      otherwise ``text/html`` reduced to plain text).
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True when the stream looks like an EML message.

        A stream is accepted if its extension is listed in
        ``ACCEPTED_FILE_EXTENSIONS`` or its MIME type starts with one of
        ``ACCEPTED_MIME_TYPE_PREFIXES``. Both checks are case-insensitive and
        tolerate missing (``None``) values.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple of candidates (False for an empty one).
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Parse the whole stream as an email and render it as markdown.

        Returns a ``DocumentConverterResult`` whose ``markdown`` is the
        rendered message and whose ``title`` is the Subject header, or
        ``None`` when the message has no (or an empty) subject.
        """
        msg = email.message_from_bytes(
            file_stream.read(), policy=email.policy.default
        )

        headers = {name: msg.get(name, "") for name in _DISPLAYED_HEADERS}

        pieces = ["# Email Message\n\n"]
        pieces.extend(
            f"**{name}:** {value}\n" for name, value in headers.items() if value
        )
        pieces.append("\n## Content\n\n")

        body = self._get_body(msg)
        if body:
            pieces.append(body)

        subject = headers["Subject"]
        return DocumentConverterResult(
            markdown="".join(pieces).strip(),
            # Header objects are str subclasses; hand callers a plain str.
            title=str(subject) if subject else None,
        )

    def _get_body(self, msg: email.message.Message) -> str:
        """Extract the body text from the email message.

        For a non-multipart message the single payload is decoded, and
        reduced to plain text when its content type is ``text/html``.

        For a multipart message the first ``text/plain`` part that is not an
        attachment is returned; if there is none, the first non-attachment
        ``text/html`` part is returned with its markup stripped. An empty
        string is returned when no textual part is found.
        """
        if not msg.is_multipart():
            body = self._decode_part(msg)
            if msg.get_content_type() == "text/html":
                return self._strip_html(body)
            return body

        plain_part = None
        html_part = None
        for part in self._iter_inline_leaves(msg):
            content_type = part.get_content_type()
            if content_type == "text/plain" and plain_part is None:
                plain_part = part
            elif content_type == "text/html" and html_part is None:
                html_part = part

        if plain_part is not None:
            return self._decode_part(plain_part)
        if html_part is not None:
            return self._strip_html(self._decode_part(html_part))
        return ""

    def _iter_inline_leaves(self, msg: email.message.Message):
        """Yield the non-multipart sub-parts of *msg*, depth-first.

        Sub-parts marked ``Content-Disposition: attachment`` are skipped
        together with everything nested inside them, so an attached text or
        HTML file is never mistaken for the message body.
        """
        for sub_part in msg.get_payload():
            if sub_part.get_content_disposition() == "attachment":
                continue
            if sub_part.is_multipart():
                yield from self._iter_inline_leaves(sub_part)
            else:
                yield sub_part

    def _decode_part(self, part: email.message.Message) -> str:
        """Decode a MIME part's payload to a stripped string.

        The transfer encoding is undone by ``get_payload(decode=True)``; the
        resulting bytes are decoded with the part's declared charset
        (UTF-8 when none is declared). If the declared charset is unknown or
        does not fit the data, decoding falls back to UTF-8 and drops the
        undecodable bytes. Returns ``""`` when the part has no payload.
        """
        payload = part.get_payload(decode=True)
        if payload is None:
            return ""

        charset = part.get_content_charset() or "utf-8"
        try:
            text = payload.decode(charset)
        except (UnicodeDecodeError, LookupError):
            # Best effort: a wrong or unknown charset label must not abort
            # the whole conversion.
            text = payload.decode("utf-8", errors="ignore")
        return text.strip()

    def _decode_payload(self, msg: email.message.Message) -> str:
        """Decode a non-multipart message payload to a string.

        Kept as a separate name for existing callers; the decoding rules are
        exactly those of ``_decode_part``.
        """
        return self._decode_part(msg)

    def _strip_html(self, html: str) -> str:
        """Reduce an HTML fragment to plain text.

        ``<script>``/``<style>`` elements are removed with their content,
        ``<br>`` and ``</p>`` become newlines, every remaining tag is
        dropped, character references (``&amp;``, ``&lt;``, ``&#39;``, ...)
        are decoded, non-breaking spaces become ordinary spaces, and runs of
        three or more newlines collapse to a single blank line.
        """
        # Imported locally: the parameter is itself named ``html``, and the
        # module-level import block lies outside this class.
        from html import unescape

        text = _NON_TEXT_ELEMENT_RE.sub("", html)
        text = _LINE_BREAK_RE.sub("\n", text)
        text = _PARAGRAPH_END_RE.sub("\n", text)
        text = _ANY_TAG_RE.sub("", text)
        # Decode entities only after the tags are gone, so an escaped
        # "&lt;b&gt;" survives as the literal text "<b>".
        text = unescape(text).replace("\xa0", " ")
        text = _EXTRA_NEWLINES_RE.sub("\n\n", text)
        return text.strip()
a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -87,6 +87,39 @@ class FileTestVector(object): ], must_not_include=[], ), + FileTestVector( + filename="test_email.eml", + mimetype="message/rfc822", + charset="utf-8", + url=None, + must_include=[ + "# Email Message", + "**From:** test.sender@example.com", + "**To:** test.recipient@example.com", + "**Cc:** test.cc@example.com", + "**Subject:** Test Email Message", + "## Content", + "This is the body of the test email message", + ], + must_not_include=[], + ), + FileTestVector( + filename="test_email_html.eml", + mimetype="message/rfc822", + charset="utf-8", + url=None, + must_include=[ + "# Email Message", + "**From:** html.sender@example.com", + "**Subject:** HTML Test Email", + "## Content", + "This is the plain text version of the email", + ], + must_not_include=[ + "", + "", + ], + ), FileTestVector( filename="test.pdf", mimetype="application/pdf", diff --git a/packages/markitdown/tests/test_files/test_email.eml b/packages/markitdown/tests/test_files/test_email.eml new file mode 100644 index 000000000..6d1d77b96 --- /dev/null +++ b/packages/markitdown/tests/test_files/test_email.eml @@ -0,0 +1,15 @@ +From: test.sender@example.com +To: test.recipient@example.com +Cc: test.cc@example.com +Subject: Test Email Message +Date: Fri, 21 Mar 2026 10:00:00 -0700 +MIME-Version: 1.0 +Content-Type: text/plain; charset=utf-8 +Content-Transfer-Encoding: 7bit + +This is the body of the test email message. + +It has multiple paragraphs with some content for testing. 
+ +Best regards, +Test Sender diff --git a/packages/markitdown/tests/test_files/test_email_html.eml b/packages/markitdown/tests/test_files/test_email_html.eml new file mode 100644 index 000000000..9786ed73e --- /dev/null +++ b/packages/markitdown/tests/test_files/test_email_html.eml @@ -0,0 +1,18 @@ +From: html.sender@example.com +To: html.recipient@example.com +Subject: HTML Test Email +Date: Fri, 21 Mar 2026 11:00:00 -0700 +MIME-Version: 1.0 +Content-Type: multipart/alternative; boundary="boundary123" + +--boundary123 +Content-Type: text/plain; charset=utf-8 + +This is the plain text version of the email. + +--boundary123 +Content-Type: text/html; charset=utf-8 + +

This is the HTML version of the email.

+ +--boundary123--