From 47d58f2eef4e86740098e9266fdebc0bd32d9560 Mon Sep 17 00:00:00 2001 From: carljxlin <769338334@qq.com> Date: Sun, 29 Mar 2026 19:11:18 +0800 Subject: [PATCH] Add --save-images flag for extracting images to disk across converters - Fix image loss in EPUB conversion: resolve relative paths inside the ZIP and embed them as base64 or save to files - Add --save-images [DIR] CLI flag (and save_images kwarg for the API): - No DIR: auto-creates images_{output_stem}/ next to the output file - With DIR: saves images to the specified path - Support image extraction for EPUB, DOCX, PPTX, and PDF converters - PDF: interleave extracted images at their correct vertical position in the text rather than appending them at the end; preserve table structure in cropped page regions when images are present - Extract shared dir-resolution logic into converter_utils/images.py - Add debug/ and generated image dirs to .gitignore Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 4 + .../markitdown/src/markitdown/__main__.py | 32 +++- .../src/markitdown/converter_utils/images.py | 45 +++++ .../markitdown/converters/_docx_converter.py | 44 ++++- .../markitdown/converters/_epub_converter.py | 97 +++++++++-- .../markitdown/converters/_pdf_converter.py | 154 +++++++++++++++++- .../markitdown/converters/_pptx_converter.py | 26 ++- 7 files changed, 374 insertions(+), 28 deletions(-) create mode 100644 packages/markitdown/src/markitdown/converter_utils/images.py diff --git a/.gitignore b/.gitignore index 15613ea8a..49030756f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,8 @@ .vscode +debug/ +test_pdf*.md +images_*/ +my_*/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6085ad6bb..3e5c90418 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: MIT import argparse +import 
os +import re import sys import codecs from textwrap import dedent @@ -110,6 +112,17 @@ def main(): help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.", ) + parser.add_argument( + "--save-images", + nargs="?", + const=True, + default=False, + metavar="DIR", + help="Extract images from documents and save them to a directory. " + "If DIR is omitted, images are saved to ./images_{output_filename}/. " + "When omitted entirely, images are not included in the output (default).", + ) + parser.add_argument("filename", nargs="?") args = parser.parse_args() @@ -186,15 +199,32 @@ def main(): else: markitdown = MarkItDown(enable_plugins=args.use_plugins) + # Resolve the images directory path + save_images = args.save_images + if save_images is True: + # Auto-compute directory name from output filename, then input filename + if args.output: + stem = os.path.splitext(os.path.basename(args.output))[0] + elif args.filename: + stem = os.path.splitext(os.path.basename(args.filename))[0] + else: + stem = "output" + stem = re.sub(r"[^\w\-]", "_", stem) + save_images = f"images_{stem}" + if args.filename is None: result = markitdown.convert_stream( sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris, + save_images=save_images, ) else: result = markitdown.convert( - args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris + args.filename, + stream_info=stream_info, + keep_data_uris=args.keep_data_uris, + save_images=save_images, ) _handle_output(args, result) diff --git a/packages/markitdown/src/markitdown/converter_utils/images.py b/packages/markitdown/src/markitdown/converter_utils/images.py new file mode 100644 index 000000000..526fac9b3 --- /dev/null +++ b/packages/markitdown/src/markitdown/converter_utils/images.py @@ -0,0 +1,45 @@ +import os +import re + +from .._stream_info import StreamInfo + + +def resolve_images_dir( + save_images: bool | str, + stream_info: StreamInfo, + 
fallback_name: str, +) -> tuple[str, str]: + """Resolve the images directory and markdown prefix from a ``save_images`` kwarg. + + Parameters + ---------- + save_images: + - ``str`` — use this path directly as both the directory and the + markdown image prefix. + - ``True`` — auto-derive ``images_{stem}`` from *stream_info.filename*, + falling back to *fallback_name* when no filename is available. + stream_info: + Stream metadata; ``stream_info.filename`` is used for auto-naming. + fallback_name: + Format-specific fallback stem (e.g. ``"epub"``, ``"pdf"``) used when + no filename is available and *save_images* is ``True``. + + Returns + ------- + (actual_images_dir, md_images_prefix) + The directory to write images into, and the prefix to use in markdown + ``![alt](prefix/filename)`` references. The directory is created + (including any parents) before returning. + """ + if isinstance(save_images, str): + actual_images_dir = save_images + md_images_prefix = save_images + else: + file_stem = re.sub( + r"[^\w\-]", "_", os.path.splitext(stream_info.filename or fallback_name)[0] + ) + actual_images_dir = f"images_{file_stem}" + md_images_prefix = f"./images_{file_stem}" + + os.makedirs(actual_images_dir, exist_ok=True) + return actual_images_dir, md_images_prefix diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 3975107b1..2c3f92e4d 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -1,11 +1,17 @@ +import base64 +import mimetypes +import os +import re import sys import io from warnings import warn +from bs4 import BeautifulSoup from typing import BinaryIO, Any from ._html_converter import HtmlConverter from ..converter_utils.docx.pre_process import pre_process_docx +from ..converter_utils.images import resolve_images_dir from .._base_converter import 
DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -27,6 +33,9 @@
 ACCEPTED_FILE_EXTENSIONS = [".docx"]
 
+# Map mimetypes.guess_extension() quirks to sane extensions
+_EXT_FIXES = {".jpe": ".jpg", ".jpeg": ".jpg"}
+
 
 class DocxConverter(HtmlConverter):
     """
@@ -77,7 +86,34 @@ def convert(
         style_map = kwargs.get("style_map", None)
 
         pre_process_stream = pre_process_docx(file_stream)
-        return self._html_converter.convert_string(
-            mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
-            **kwargs,
-        )
+        html = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value
+
+        save_images = kwargs.get("save_images", False)
+        if save_images:
+            actual_images_dir, md_prefix = resolve_images_dir(
+                save_images, stream_info, "docx"
+            )
+            html = self._save_images(html, actual_images_dir, md_prefix)
+
+        return self._html_converter.convert_string(html, **kwargs)
+
+    def _save_images(self, html: str, images_dir: str, md_prefix: str) -> str:
+        """Extract base64 data URI images from mammoth HTML, save to *images_dir*,
+        and replace each src with a *md_prefix*/filename relative path."""
+        soup = BeautifulSoup(html, "html.parser")
+        for i, img in enumerate(soup.find_all("img")):
+            src = img.get("src", "")
+            if not src.startswith("data:"):
+                continue
+            try:
+                header, b64data = src.split(",", 1)
+                mime = header.split(":")[1].split(";")[0]
+                ext = mimetypes.guess_extension(mime) or ".bin"
+                ext = _EXT_FIXES.get(ext, ext)
+                filename = f"image_{i + 1}{ext}"
+                with open(os.path.join(images_dir, filename), "wb") as f:
+                    f.write(base64.b64decode(b64data))
+                img["src"] = f"{md_prefix}/{filename}"
+            except Exception:
+                continue
+        return str(soup)
diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py
index 3be65b016..c14c40318 100644
--- 
a/packages/markitdown/src/markitdown/converters/_epub_converter.py +++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py @@ -1,12 +1,19 @@ +import base64 +import io +import mimetypes import os +import posixpath +import re import zipfile from defusedxml import minidom from xml.dom.minidom import Document +from bs4 import BeautifulSoup from typing import BinaryIO, Any, Dict, List from ._html_converter import HtmlConverter from .._base_converter import DocumentConverterResult +from ..converter_utils.images import resolve_images_dir from .._stream_info import StreamInfo ACCEPTED_MIME_TYPE_PREFIXES = [ @@ -98,22 +105,45 @@ def convert( ] # Extract and convert the content + # images_dir: optional base directory where images will be saved. + # A subdirectory image_{stem} is created inside it per file, so + # converting multiple files into the same dir never mixes images. + # When omitted, images are embedded inline as base64 data URIs. + save_images = kwargs.get("save_images", False) + actual_images_dir: str | None = None + md_images_prefix: str | None = None + if save_images: + actual_images_dir, md_images_prefix = resolve_images_dir( + save_images, stream_info, "epub" + ) + + namelist_set = set(z.namelist()) markdown_content: List[str] = [] for file in spine: - if file in z.namelist(): + if file in namelist_set: with z.open(file) as f: - filename = os.path.basename(file) - extension = os.path.splitext(filename)[1].lower() - mimetype = MIME_TYPE_MAPPING.get(extension) - converted_content = self._html_converter.convert( - f, - StreamInfo( - mimetype=mimetype, - extension=extension, - filename=filename, - ), - ) - markdown_content.append(converted_content.markdown.strip()) + html_bytes = f.read() + + # Resolve relative image src attributes so that images survive + # the conversion to Markdown. 
+ html_bytes = self._resolve_images( + html_bytes, file, z, namelist_set, + actual_images_dir, md_images_prefix, + ) + + filename = os.path.basename(file) + extension = os.path.splitext(filename)[1].lower() + mimetype = MIME_TYPE_MAPPING.get(extension) + converted_content = self._html_converter.convert( + io.BytesIO(html_bytes), + StreamInfo( + mimetype=mimetype, + extension=extension, + filename=filename, + ), + keep_data_uris=actual_images_dir is None, + ) + markdown_content.append(converted_content.markdown.strip()) # Format and add the metadata metadata_markdown = [] @@ -129,6 +159,47 @@ def convert( markdown="\n\n".join(markdown_content), title=metadata["title"] ) + def _resolve_images( + self, + html_bytes: bytes, + html_path: str, + z: zipfile.ZipFile, + namelist_set: set, + images_dir: str | None, + md_images_prefix: str | None, + ) -> bytes: + """Rewrite attributes so images survive HTML-to-Markdown conversion. + + If *images_dir* is given, each image is extracted there and the src is + replaced with *md_images_prefix*/filename (a path relative to the markdown + file). Otherwise the image is embedded as a base64 data URI. 
+ """ + soup = BeautifulSoup(html_bytes, "html.parser") + changed = False + html_dir = posixpath.dirname(html_path) + + for img in soup.find_all("img"): + src = img.get("src", "") + if not src or src.startswith("data:") or src.startswith("http"): + continue + resolved = posixpath.normpath(posixpath.join(html_dir, src)) + if resolved not in namelist_set: + continue + img_bytes = z.read(resolved) + if images_dir: + img_filename = os.path.basename(resolved) + with open(os.path.join(images_dir, img_filename), "wb") as out: + out.write(img_bytes) + img["src"] = f"{md_images_prefix}/{img_filename}" + else: + mime, _ = mimetypes.guess_type(resolved) + mime = mime or "image/jpeg" + b64 = base64.b64encode(img_bytes).decode("ascii") + img["src"] = f"data:{mime};base64,{b64}" + changed = True + + return soup.encode("utf-8") if changed else html_bytes + def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None: """Convenience function to extract a single occurrence of a tag (e.g., title).""" texts = self._get_all_texts_from_nodes(dom, tag_name) diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index ffbcbd990..0e756ea8c 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,11 +1,13 @@ +import os +import re import sys import io -import re from typing import BinaryIO, Any from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE +from ..converter_utils.images import resolve_images_dir # Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10") PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$") @@ -492,6 +494,99 @@ def _extract_tables_from_words(page: Any) -> list[list[list[str]]]: return [table_rows] +def 
_iter_lt_images(layout: Any):
+    """Recursively yield LTImage objects from a pdfminer layout tree."""
+    from pdfminer.layout import LTFigure, LTImage
+
+    for elem in layout:
+        if isinstance(elem, LTFigure):
+            yield from _iter_lt_images(elem)
+        elif isinstance(elem, LTImage):
+            yield elem
+
+
+def _extract_region_text(region: Any) -> str:
+    """Extract text from a cropped page region, preserving table structure where present."""
+    form_text = _extract_form_content_from_words(region)
+    if form_text is not None:
+        return form_text.strip()
+    text = region.extract_text()
+    return text.strip() if text else ""
+
+
+def _extract_text_with_images(
+    page: Any, image_items: list, md_prefix: str
+) -> str:
+    """Extract text from *page* with images interleaved at their vertical positions.
+
+    *image_items* is a list of (pptop, ppbottom, filename) tuples sorted by pptop,
+    where coordinates are in pdfplumber's top-down system.
+
+    Each text region between images is passed through _extract_form_content_from_words
+    so that table structure is preserved even when images are present.
+    """
+    chunks = []
+    current_y = 0.0
+    page_h = page.height
+    page_w = page.width
+
+    for img_top, img_bottom, filename in image_items:
+        if img_top > current_y + 1:
+            text = _extract_region_text(page.crop((0, current_y, page_w, img_top)))
+            if text:
+                chunks.append(text)
+        chunks.append(f"![image]({md_prefix}/{filename})")
+        current_y = max(current_y, img_bottom)
+
+    if current_y < page_h - 1:
+        text = _extract_region_text(page.crop((0, current_y, page_w, page_h)))
+        if text:
+            chunks.append(text)
+
+    return "\n\n".join(chunks)
+
+
+def _save_lt_image(lt_img: Any, images_dir: str, index: int) -> str | None:
+    """Save an LTImage to *images_dir*. Returns the saved path, or None on failure.
+
+    JPEG images (DCTDecode) are written from their raw compressed bytes so no
+    additional dependencies are required. 
Other formats are decoded and saved + as PNG via Pillow; if Pillow is not installed those images are skipped. + """ + try: + filters = lt_img.stream.get_filters() or [] + filter_names = [f[0] if isinstance(f, tuple) else f for f in filters] + + if filter_names in (["DCTDecode"], ["JPXDecode"]): + ext = ".jpg" if "DCTDecode" in filter_names else ".jp2" + img_bytes = lt_img.stream.get_rawdata() + else: + try: + import PIL.Image as PILImage + + raw = lt_img.stream.get_data() + attrs = lt_img.stream.attrs + w, h = int(attrs.get("Width", 0)), int(attrs.get("Height", 0)) + if not w or not h: + return None + mode = "RGB" if "RGB" in str(attrs.get("ColorSpace", "")) else "L" + img = PILImage.frombytes(mode, (w, h), raw) + buf = io.BytesIO() + img.save(buf, format="PNG") + img_bytes = buf.getvalue() + ext = ".png" + except Exception: + return None + + filename = f"image_{index}{ext}" + dest = os.path.join(images_dir, filename) + with open(dest, "wb") as fout: + fout.write(img_bytes) + return dest + except Exception: + return None + + class PdfConverter(DocumentConverter): """ Converts PDFs to Markdown. 
@@ -536,6 +631,16 @@ def convert( assert isinstance(file_stream, io.IOBase) + save_images = kwargs.get("save_images", False) + actual_images_dir: str | None = None + md_images_prefix: str | None = None + img_count = 0 + + if save_images: + actual_images_dir, md_images_prefix = resolve_images_dir( + save_images, stream_info, "pdf" + ) + # Read file stream into BytesIO for compatibility with pdfplumber pdf_bytes = io.BytesIO(file_stream.read()) @@ -551,23 +656,56 @@ def convert( with pdfplumber.open(pdf_bytes) as pdf: for page_idx, page in enumerate(pdf.pages): + # Collect and save images before closing the page + page_image_items: list = [] + if actual_images_dir: + for lt_img in _iter_lt_images(page.layout): + dest = _save_lt_image( + lt_img, actual_images_dir, img_count + 1 + ) + if dest: + img_count += 1 + pptop = page.height - lt_img.y1 + ppbot = page.height - lt_img.y0 + page_image_items.append( + (pptop, ppbot, os.path.basename(dest)) + ) + page_image_items.sort(key=lambda x: x[0]) + page_content = _extract_form_content_from_words(page) if page_content is not None: form_page_count += 1 if page_content.strip(): - markdown_chunks.append(page_content) + if page_image_items: + # Images present: use crop-based extraction so + # images appear at their correct vertical position + # rather than being appended after the form content. 
+ chunk = _extract_text_with_images( + page, page_image_items, md_images_prefix + ) + else: + chunk = page_content + if chunk: + markdown_chunks.append(chunk) else: plain_page_indices.append(page_idx) - text = page.extract_text() - if text and text.strip(): - markdown_chunks.append(text.strip()) + if page_image_items: + chunk = _extract_text_with_images( + page, page_image_items, md_images_prefix + ) + else: + text = page.extract_text() + chunk = text.strip() if text else "" + if chunk: + markdown_chunks.append(chunk) page.close() # Free cached page data immediately - # If no pages had form-style content, use pdfminer for - # the whole document (better text spacing for prose). - if form_page_count == 0: + # If no pages had form-style content, use pdfminer for better + # text spacing — unless images were requested (need pdfplumber + # positions for interleaving). + if form_page_count == 0 and not save_images: pdf_bytes.seek(0) markdown = pdfminer.high_level.extract_text(pdf_bytes) else: diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index 360f17706..c381e1953 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -1,5 +1,6 @@ import sys import base64 +import mimetypes import os import io import re @@ -10,6 +11,7 @@ from ._html_converter import HtmlConverter from ._llm_caption import llm_caption +from ..converter_utils.images import resolve_images_dir from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE @@ -79,9 +81,19 @@ def convert( ) # Perform the conversion + save_images = kwargs.get("save_images", False) + if save_images: + actual_images_dir, md_images_prefix = resolve_images_dir( + save_images, stream_info, "pptx" + ) + else: + 
actual_images_dir = None + md_images_prefix = None + presentation = pptx.Presentation(file_stream) md_content = "" slide_num = 0 + img_count = 0 for slide in presentation.slides: slide_num += 1 @@ -140,8 +152,18 @@ def get_shape_content(shape, **kwargs): alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text) alt_text = re.sub(r"\s+", " ", alt_text).strip() - # If keep_data_uris is True, use base64 encoding for images - if kwargs.get("keep_data_uris", False): + # Emit the image reference + if actual_images_dir: + nonlocal img_count + img_count += 1 + content_type = shape.image.content_type or "image/png" + ext = mimetypes.guess_extension(content_type) or ".png" + ext = {".jpe": ".jpg", ".jpeg": ".jpg"}.get(ext, ext) + img_filename = f"image_{img_count}{ext}" + with open(os.path.join(actual_images_dir, img_filename), "wb") as f: + f.write(shape.image.blob) + md_content += f"\n![{alt_text}]({md_images_prefix}/{img_filename})\n" + elif kwargs.get("keep_data_uris", False): blob = shape.image.blob content_type = shape.image.content_type or "image/png" b64_string = base64.b64encode(blob).decode("utf-8")