From 47d58f2eef4e86740098e9266fdebc0bd32d9560 Mon Sep 17 00:00:00 2001
From: carljxlin <769338334@qq.com>
Date: Sun, 29 Mar 2026 19:11:18 +0800
Subject: [PATCH] Add --save-images flag for extracting images to disk across
converters
- Fix image loss in EPUB conversion: resolve relative
paths
inside the ZIP and embed them as base64 or save to files
- Add --save-images [DIR] CLI flag (and save_images kwarg for the API):
- No DIR: auto-creates images_{output_stem}/ next to the output file
- With DIR: saves images to the specified path
- Support image extraction for EPUB, DOCX, PPTX, and PDF converters
- PDF: interleave extracted images at their correct vertical position
in the text rather than appending them at the end; preserve table
structure in cropped page regions when images are present
- Extract shared dir-resolution logic into converter_utils/images.py
- Add debug/ and generated image dirs to .gitignore
Co-Authored-By: Claude Sonnet 4.6
---
.gitignore | 4 +
.../markitdown/src/markitdown/__main__.py | 32 +++-
.../src/markitdown/converter_utils/images.py | 45 +++++
.../markitdown/converters/_docx_converter.py | 44 ++++-
.../markitdown/converters/_epub_converter.py | 97 +++++++++--
.../markitdown/converters/_pdf_converter.py | 154 +++++++++++++++++-
.../markitdown/converters/_pptx_converter.py | 26 ++-
7 files changed, 374 insertions(+), 28 deletions(-)
create mode 100644 packages/markitdown/src/markitdown/converter_utils/images.py
diff --git a/.gitignore b/.gitignore
index 15613ea8a..49030756f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,8 @@
.vscode
+debug/
+test_pdf*.md
+images_*/
+my_*/
# Byte-compiled / optimized / DLL files
__pycache__/
diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index 6085ad6bb..3e5c90418 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: MIT
import argparse
+import os
+import re
import sys
import codecs
from textwrap import dedent
@@ -110,6 +112,17 @@ def main():
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)
+ parser.add_argument(
+ "--save-images",
+ nargs="?",
+ const=True,
+ default=False,
+ metavar="DIR",
+ help="Extract images from documents and save them to a directory. "
+ "If DIR is omitted, images are saved to ./images_{output_filename}/. "
+ "When omitted entirely, images are not included in the output (default).",
+ )
+
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
@@ -186,15 +199,32 @@ def main():
else:
markitdown = MarkItDown(enable_plugins=args.use_plugins)
+ # Resolve the images directory path
+ save_images = args.save_images
+ if save_images is True:
+ # Auto-compute directory name from output filename, then input filename
+ if args.output:
+ stem = os.path.splitext(os.path.basename(args.output))[0]
+ elif args.filename:
+ stem = os.path.splitext(os.path.basename(args.filename))[0]
+ else:
+ stem = "output"
+ stem = re.sub(r"[^\w\-]", "_", stem)
+ save_images = f"images_{stem}"
+
if args.filename is None:
result = markitdown.convert_stream(
sys.stdin.buffer,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
+ save_images=save_images,
)
else:
result = markitdown.convert(
- args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
+ args.filename,
+ stream_info=stream_info,
+ keep_data_uris=args.keep_data_uris,
+ save_images=save_images,
)
_handle_output(args, result)
diff --git a/packages/markitdown/src/markitdown/converter_utils/images.py b/packages/markitdown/src/markitdown/converter_utils/images.py
new file mode 100644
index 000000000..526fac9b3
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converter_utils/images.py
@@ -0,0 +1,45 @@
+import os
+import re
+
+from .._stream_info import StreamInfo
+
+
+def resolve_images_dir(
+ save_images: bool | str,
+ stream_info: StreamInfo,
+ fallback_name: str,
+) -> tuple[str, str]:
+ """Resolve the images directory and markdown prefix from a ``save_images`` kwarg.
+
+ Parameters
+ ----------
+ save_images:
+ - ``str`` — use this path directly as both the directory and the
+ markdown image prefix.
+ - ``True`` — auto-derive ``images_{stem}`` from *stream_info.filename*,
+ falling back to *fallback_name* when no filename is available.
+ stream_info:
+ Stream metadata; ``stream_info.filename`` is used for auto-naming.
+ fallback_name:
+ Format-specific fallback stem (e.g. ``"epub"``, ``"pdf"``) used when
+ no filename is available and *save_images* is ``True``.
+
+ Returns
+ -------
+ (actual_images_dir, md_images_prefix)
+ The directory to write images into, and the prefix to use in markdown
+ ``![alt](dir/filename)`` references. The directory is created
+ (including any parents) before returning.
+ """
+ if isinstance(save_images, str):
+ actual_images_dir = save_images
+ md_images_prefix = save_images
+ else:
+ file_stem = re.sub(
+ r"[^\w\-]", "_", os.path.splitext(stream_info.filename or fallback_name)[0]
+ )
+ actual_images_dir = f"images_{file_stem}"
+ md_images_prefix = f"./images_{file_stem}"
+
+ os.makedirs(actual_images_dir, exist_ok=True)
+ return actual_images_dir, md_images_prefix
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 3975107b1..2c3f92e4d 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -1,11 +1,17 @@
+import base64
+import mimetypes
+import os
+import re
import sys
import io
from warnings import warn
+from bs4 import BeautifulSoup
from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx
+from ..converter_utils.images import resolve_images_dir
from .._base_converter import DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -27,6 +33,9 @@
ACCEPTED_FILE_EXTENSIONS = [".docx"]
+# Map mimetypes.guess_extension() quirks to sane extensions
+_EXT_FIXES = {".jpe": ".jpg", ".jpeg": ".jpg"}
+
class DocxConverter(HtmlConverter):
"""
@@ -77,7 +86,34 @@ def convert(
style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream)
- return self._html_converter.convert_string(
- mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
- **kwargs,
- )
+ html = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value
+
+ save_images = kwargs.get("save_images", False)
+ if save_images:
+ actual_images_dir, md_prefix = resolve_images_dir(
+ save_images, stream_info, "docx"
+ )
+ html = self._save_images(html, actual_images_dir, md_prefix)
+
+ return self._html_converter.convert_string(html, **kwargs)
+
+ def _save_images(self, html: str, images_dir: str, md_prefix: str) -> str:
+ """Extract base64 data URI images from mammoth HTML, save to *images_dir*,
+ and replace each src with a *md_prefix*/filename relative path."""
+ soup = BeautifulSoup(html, "html.parser")
+ for i, img in enumerate(soup.find_all("img")):
+ src = img.get("src", "")
+ if not src.startswith("data:"):
+ continue
+ try:
+ header, b64data = src.split(",", 1)
+ mime = header.split(":")[1].split(";")[0]
+ ext = mimetypes.guess_extension(mime) or ".bin"
+ ext = _EXT_FIXES.get(ext, ext)
+ filename = f"image_{i + 1}{ext}"
+ with open(os.path.join(images_dir, filename), "wb") as f:
+ f.write(base64.b64decode(b64data))
+ img["src"] = f"{md_prefix}/{filename}"
+ except Exception:
+ continue
+ return str(soup)
diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py
index 3be65b016..c14c40318 100644
--- a/packages/markitdown/src/markitdown/converters/_epub_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py
@@ -1,12 +1,19 @@
+import base64
+import io
+import mimetypes
import os
+import posixpath
+import re
import zipfile
from defusedxml import minidom
from xml.dom.minidom import Document
+from bs4 import BeautifulSoup
from typing import BinaryIO, Any, Dict, List
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverterResult
+from ..converter_utils.images import resolve_images_dir
from .._stream_info import StreamInfo
ACCEPTED_MIME_TYPE_PREFIXES = [
@@ -98,22 +105,45 @@ def convert(
]
# Extract and convert the content
+ # save_images: False (default), True (auto-derive a directory name
+ # from the input filename), or a string path to save images into.
+ # When falsy, images are embedded inline as base64 data URIs.
+ save_images = kwargs.get("save_images", False)
+ actual_images_dir: str | None = None
+ md_images_prefix: str | None = None
+ if save_images:
+ actual_images_dir, md_images_prefix = resolve_images_dir(
+ save_images, stream_info, "epub"
+ )
+
+ namelist_set = set(z.namelist())
markdown_content: List[str] = []
for file in spine:
- if file in z.namelist():
+ if file in namelist_set:
with z.open(file) as f:
- filename = os.path.basename(file)
- extension = os.path.splitext(filename)[1].lower()
- mimetype = MIME_TYPE_MAPPING.get(extension)
- converted_content = self._html_converter.convert(
- f,
- StreamInfo(
- mimetype=mimetype,
- extension=extension,
- filename=filename,
- ),
- )
- markdown_content.append(converted_content.markdown.strip())
+ html_bytes = f.read()
+
+ # Resolve relative image src attributes so that images survive
+ # the conversion to Markdown.
+ html_bytes = self._resolve_images(
+ html_bytes, file, z, namelist_set,
+ actual_images_dir, md_images_prefix,
+ )
+
+ filename = os.path.basename(file)
+ extension = os.path.splitext(filename)[1].lower()
+ mimetype = MIME_TYPE_MAPPING.get(extension)
+ converted_content = self._html_converter.convert(
+ io.BytesIO(html_bytes),
+ StreamInfo(
+ mimetype=mimetype,
+ extension=extension,
+ filename=filename,
+ ),
+ keep_data_uris=actual_images_dir is None,
+ )
+ markdown_content.append(converted_content.markdown.strip())
# Format and add the metadata
metadata_markdown = []
@@ -129,6 +159,47 @@ def convert(
markdown="\n\n".join(markdown_content), title=metadata["title"]
)
+ def _resolve_images(
+ self,
+ html_bytes: bytes,
+ html_path: str,
+ z: zipfile.ZipFile,
+ namelist_set: set,
+ images_dir: str | None,
+ md_images_prefix: str | None,
+ ) -> bytes:
+ """Rewrite ``<img src>`` attributes so images survive HTML-to-Markdown conversion.
+
+ If *images_dir* is given, each image is extracted there and the src is
+ replaced with *md_images_prefix*/filename (a path relative to the markdown
+ file). Otherwise the image is embedded as a base64 data URI.
+ """
+ soup = BeautifulSoup(html_bytes, "html.parser")
+ changed = False
+ html_dir = posixpath.dirname(html_path)
+
+ for img in soup.find_all("img"):
+ src = img.get("src", "")
+ if not src or src.startswith("data:") or src.startswith("http"):
+ continue
+ resolved = posixpath.normpath(posixpath.join(html_dir, src))
+ if resolved not in namelist_set:
+ continue
+ img_bytes = z.read(resolved)
+ if images_dir:
+ img_filename = os.path.basename(resolved)
+ with open(os.path.join(images_dir, img_filename), "wb") as out:
+ out.write(img_bytes)
+ img["src"] = f"{md_images_prefix}/{img_filename}"
+ else:
+ mime, _ = mimetypes.guess_type(resolved)
+ mime = mime or "image/jpeg"
+ b64 = base64.b64encode(img_bytes).decode("ascii")
+ img["src"] = f"data:{mime};base64,{b64}"
+ changed = True
+
+ return soup.encode("utf-8") if changed else html_bytes
+
def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
texts = self._get_all_texts_from_nodes(dom, tag_name)
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index ffbcbd990..0e756ea8c 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,11 +1,13 @@
+import os
+import re
import sys
import io
-import re
from typing import BinaryIO, Any
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+from ..converter_utils.images import resolve_images_dir
# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
@@ -492,6 +494,99 @@ def _extract_tables_from_words(page: Any) -> list[list[list[str]]]:
return [table_rows]
+def _iter_lt_images(layout: Any):
+ """Recursively yield LTImage objects from a pdfminer layout tree."""
+ from pdfminer.layout import LTFigure, LTImage
+
+ for elem in layout:
+ if isinstance(elem, LTFigure):
+ yield from _iter_lt_images(elem)
+ elif isinstance(elem, LTImage):
+ yield elem
+
+
+def _extract_region_text(region: Any) -> str:
+ """Extract text from a cropped page region, preserving table structure where present."""
+ form_text = _extract_form_content_from_words(region)
+ if form_text is not None:
+ return form_text.strip()
+ text = region.extract_text()
+ return text.strip() if text else ""
+
+
+def _extract_text_with_images(
+ page: Any, image_items: list, md_prefix: str
+) -> str:
+ """Extract text from *page* with images interleaved at their vertical positions.
+
+ *image_items* is a list of (pptop, ppbottom, filename) tuples sorted by pptop,
+ where coordinates are in pdfplumber's top-down system.
+
+ Each text region between images is passed through _extract_form_content_from_words
+ so that table structure is preserved even when images are present.
+ """
+ chunks = []
+ current_y = 0.0
+ page_h = page.height
+ page_w = page.width
+
+ for img_top, img_bottom, filename in image_items:
+ if img_top > current_y + 1:
+ text = _extract_region_text(page.crop((0, current_y, page_w, img_top)))
+ if text:
+ chunks.append(text)
+ chunks.append(f"![]({md_prefix}/{filename})")
+ current_y = max(current_y, img_bottom)
+
+ if current_y < page_h - 1:
+ text = _extract_region_text(page.crop((0, current_y, page_w, page_h)))
+ if text:
+ chunks.append(text)
+
+ return "\n\n".join(chunks)
+
+
+def _save_lt_image(lt_img: Any, images_dir: str, index: int) -> str | None:
+ """Save an LTImage to *images_dir*. Returns the saved path, or None on failure.
+
+ JPEG images (DCTDecode) are written from their raw compressed bytes so no
+ additional dependencies are required. Other formats are decoded and saved
+ as PNG via Pillow; if Pillow is not installed those images are skipped.
+ """
+ try:
+ filters = lt_img.stream.get_filters() or []
+ filter_names = [f[0] if isinstance(f, tuple) else f for f in filters]
+
+ if filter_names in (["DCTDecode"], ["JPXDecode"]):
+ ext = ".jpg" if "DCTDecode" in filter_names else ".jp2"
+ img_bytes = lt_img.stream.get_rawdata()
+ else:
+ try:
+ import PIL.Image as PILImage
+
+ raw = lt_img.stream.get_data()
+ attrs = lt_img.stream.attrs
+ w, h = int(attrs.get("Width", 0)), int(attrs.get("Height", 0))
+ if not w or not h:
+ return None
+ mode = "RGB" if "RGB" in str(attrs.get("ColorSpace", "")) else "L"
+ img = PILImage.frombytes(mode, (w, h), raw)
+ buf = io.BytesIO()
+ img.save(buf, format="PNG")
+ img_bytes = buf.getvalue()
+ ext = ".png"
+ except Exception:
+ return None
+
+ filename = f"image_{index}{ext}"
+ dest = os.path.join(images_dir, filename)
+ with open(dest, "wb") as fout:
+ fout.write(img_bytes)
+ return dest
+ except Exception:
+ return None
+
+
class PdfConverter(DocumentConverter):
"""
Converts PDFs to Markdown.
@@ -536,6 +631,16 @@ def convert(
assert isinstance(file_stream, io.IOBase)
+ save_images = kwargs.get("save_images", False)
+ actual_images_dir: str | None = None
+ md_images_prefix: str | None = None
+ img_count = 0
+
+ if save_images:
+ actual_images_dir, md_images_prefix = resolve_images_dir(
+ save_images, stream_info, "pdf"
+ )
+
# Read file stream into BytesIO for compatibility with pdfplumber
pdf_bytes = io.BytesIO(file_stream.read())
@@ -551,23 +656,56 @@ def convert(
with pdfplumber.open(pdf_bytes) as pdf:
for page_idx, page in enumerate(pdf.pages):
+ # Collect and save images before closing the page
+ page_image_items: list = []
+ if actual_images_dir:
+ for lt_img in _iter_lt_images(page.layout):
+ dest = _save_lt_image(
+ lt_img, actual_images_dir, img_count + 1
+ )
+ if dest:
+ img_count += 1
+ pptop = page.height - lt_img.y1
+ ppbot = page.height - lt_img.y0
+ page_image_items.append(
+ (pptop, ppbot, os.path.basename(dest))
+ )
+ page_image_items.sort(key=lambda x: x[0])
+
page_content = _extract_form_content_from_words(page)
if page_content is not None:
form_page_count += 1
if page_content.strip():
- markdown_chunks.append(page_content)
+ if page_image_items:
+ # Images present: use crop-based extraction so
+ # images appear at their correct vertical position
+ # rather than being appended after the form content.
+ chunk = _extract_text_with_images(
+ page, page_image_items, md_images_prefix
+ )
+ else:
+ chunk = page_content
+ if chunk:
+ markdown_chunks.append(chunk)
else:
plain_page_indices.append(page_idx)
- text = page.extract_text()
- if text and text.strip():
- markdown_chunks.append(text.strip())
+ if page_image_items:
+ chunk = _extract_text_with_images(
+ page, page_image_items, md_images_prefix
+ )
+ else:
+ text = page.extract_text()
+ chunk = text.strip() if text else ""
+ if chunk:
+ markdown_chunks.append(chunk)
page.close() # Free cached page data immediately
- # If no pages had form-style content, use pdfminer for
- # the whole document (better text spacing for prose).
- if form_page_count == 0:
+ # If no pages had form-style content, use pdfminer for better
+ # text spacing — unless images were requested (need pdfplumber
+ # positions for interleaving).
+ if form_page_count == 0 and not save_images:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
else:
diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
index 360f17706..c381e1953 100644
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -1,5 +1,6 @@
import sys
import base64
+import mimetypes
import os
import io
import re
@@ -10,6 +11,7 @@
from ._html_converter import HtmlConverter
from ._llm_caption import llm_caption
+from ..converter_utils.images import resolve_images_dir
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -79,9 +81,19 @@ def convert(
)
# Perform the conversion
+ save_images = kwargs.get("save_images", False)
+ if save_images:
+ actual_images_dir, md_images_prefix = resolve_images_dir(
+ save_images, stream_info, "pptx"
+ )
+ else:
+ actual_images_dir = None
+ md_images_prefix = None
+
presentation = pptx.Presentation(file_stream)
md_content = ""
slide_num = 0
+ img_count = 0
for slide in presentation.slides:
slide_num += 1
@@ -140,8 +152,18 @@ def get_shape_content(shape, **kwargs):
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
alt_text = re.sub(r"\s+", " ", alt_text).strip()
- # If keep_data_uris is True, use base64 encoding for images
- if kwargs.get("keep_data_uris", False):
+ # Emit the image reference
+ if actual_images_dir:
+ nonlocal img_count
+ img_count += 1
+ content_type = shape.image.content_type or "image/png"
+ ext = mimetypes.guess_extension(content_type) or ".png"
+ ext = {".jpe": ".jpg", ".jpeg": ".jpg"}.get(ext, ext)
+ img_filename = f"image_{img_count}{ext}"
+ with open(os.path.join(actual_images_dir, img_filename), "wb") as f:
+ f.write(shape.image.blob)
+ md_content += f"![{alt_text}]({md_images_prefix}/{img_filename})\n\n"
+ elif kwargs.get("keep_data_uris", False):
blob = shape.image.blob
content_type = shape.image.content_type or "image/png"
b64_string = base64.b64encode(blob).decode("utf-8")