From 47d58f2eef4e86740098e9266fdebc0bd32d9560 Mon Sep 17 00:00:00 2001 From: carljxlin <769338334@qq.com> Date: Sun, 29 Mar 2026 19:11:18 +0800 Subject: [PATCH] Add --save-images flag for extracting images to disk across converters - Fix image loss in EPUB conversion: resolve relative paths inside the ZIP and embed them as base64 or save to files - Add --save-images [DIR] CLI flag (and save_images kwarg for the API): - No DIR: auto-creates images_{output_stem}/ next to the output file - With DIR: saves images to the specified path - Support image extraction for EPUB, DOCX, PPTX, and PDF converters - PDF: interleave extracted images at their correct vertical position in the text rather than appending them at the end; preserve table structure in cropped page regions when images are present - Extract shared dir-resolution logic into converter_utils/images.py - Add debug/ and generated image dirs to .gitignore Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 4 + .../markitdown/src/markitdown/__main__.py | 32 +++- .../src/markitdown/converter_utils/images.py | 45 +++++ .../markitdown/converters/_docx_converter.py | 44 ++++- .../markitdown/converters/_epub_converter.py | 97 +++++++++-- .../markitdown/converters/_pdf_converter.py | 154 +++++++++++++++++- .../markitdown/converters/_pptx_converter.py | 26 ++- 7 files changed, 374 insertions(+), 28 deletions(-) create mode 100644 packages/markitdown/src/markitdown/converter_utils/images.py diff --git a/.gitignore b/.gitignore index 15613ea8a..49030756f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,8 @@ .vscode +debug/ +test_pdf*.md +images_*/ +my_*/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6085ad6bb..3e5c90418 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: MIT import argparse +import 
os +import re import sys import codecs from textwrap import dedent @@ -110,6 +112,17 @@ def main(): help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.", ) + parser.add_argument( + "--save-images", + nargs="?", + const=True, + default=False, + metavar="DIR", + help="Extract images from documents and save them to a directory. " + "If DIR is omitted, images are saved to ./images_{output_filename}/. " + "When omitted entirely, images are not included in the output (default).", + ) + parser.add_argument("filename", nargs="?") args = parser.parse_args() @@ -186,15 +199,32 @@ def main(): else: markitdown = MarkItDown(enable_plugins=args.use_plugins) + # Resolve the images directory path + save_images = args.save_images + if save_images is True: + # Auto-compute directory name from output filename, then input filename + if args.output: + stem = os.path.splitext(os.path.basename(args.output))[0] + elif args.filename: + stem = os.path.splitext(os.path.basename(args.filename))[0] + else: + stem = "output" + stem = re.sub(r"[^\w\-]", "_", stem) + save_images = f"images_{stem}" + if args.filename is None: result = markitdown.convert_stream( sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris, + save_images=save_images, ) else: result = markitdown.convert( - args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris + args.filename, + stream_info=stream_info, + keep_data_uris=args.keep_data_uris, + save_images=save_images, ) _handle_output(args, result) diff --git a/packages/markitdown/src/markitdown/converter_utils/images.py b/packages/markitdown/src/markitdown/converter_utils/images.py new file mode 100644 index 000000000..526fac9b3 --- /dev/null +++ b/packages/markitdown/src/markitdown/converter_utils/images.py @@ -0,0 +1,45 @@ +import os +import re + +from .._stream_info import StreamInfo + + +def resolve_images_dir( + save_images: bool | str, + stream_info: StreamInfo, + 
fallback_name: str, +) -> tuple[str, str]: + """Resolve the images directory and markdown prefix from a ``save_images`` kwarg. + + Parameters + ---------- + save_images: + - ``str`` — use this path directly as both the directory and the + markdown image prefix. + - ``True`` — auto-derive ``images_{stem}`` from *stream_info.filename*, + falling back to *fallback_name* when no filename is available. + stream_info: + Stream metadata; ``stream_info.filename`` is used for auto-naming. + fallback_name: + Format-specific fallback stem (e.g. ``"epub"``, ``"pdf"``) used when + no filename is available and *save_images* is ``True``. + + Returns + ------- + (actual_images_dir, md_images_prefix) + The directory to write images into, and the prefix to use in markdown + ``![alt](prefix/filename)`` references. The directory is created + (including any parents) before returning. + """ + if isinstance(save_images, str): + actual_images_dir = save_images + md_images_prefix = save_images + else: + file_stem = re.sub( + r"[^\w\-]", "_", os.path.splitext(stream_info.filename or fallback_name)[0] + ) + actual_images_dir = f"images_{file_stem}" + md_images_prefix = f"./images_{file_stem}" + + os.makedirs(actual_images_dir, exist_ok=True) + return actual_images_dir, md_images_prefix diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 3975107b1..2c3f92e4d 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -1,11 +1,17 @@ +import base64 +import mimetypes +import os +import re import sys import io from warnings import warn +from bs4 import BeautifulSoup from typing import BinaryIO, Any from ._html_converter import HtmlConverter from ..converter_utils.docx.pre_process import pre_process_docx +from ..converter_utils.images import resolve_images_dir from .._base_converter import 
DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -27,6 +33,9 @@
 ACCEPTED_FILE_EXTENSIONS = [".docx"]
 
+# Map mimetypes.guess_extension() quirks to sane extensions
+_EXT_FIXES = {".jpe": ".jpg", ".jpeg": ".jpg"}
+
 
 class DocxConverter(HtmlConverter):
     """
@@ -77,7 +86,34 @@ def convert(
         style_map = kwargs.get("style_map", None)
 
         pre_process_stream = pre_process_docx(file_stream)
-        return self._html_converter.convert_string(
-            mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
-            **kwargs,
-        )
+        html = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value
+
+        save_images = kwargs.get("save_images", False)
+        if save_images:
+            actual_images_dir, md_prefix = resolve_images_dir(
+                save_images, stream_info, "docx"
+            )
+            html = self._save_images(html, actual_images_dir, md_prefix)
+
+        return self._html_converter.convert_string(html, **kwargs)
+
+    def _save_images(self, html: str, images_dir: str, md_prefix: str) -> str:
+        """Extract base64 data URI images from mammoth HTML, save to *images_dir*,
+        and replace each src with a *md_prefix*/filename relative path."""
+        soup = BeautifulSoup(html, "html.parser")
+        for i, img in enumerate(soup.find_all("img")):
+            src = img.get("src", "")
+            if not src.startswith("data:"):
+                continue
+            try:
+                header, b64data = src.split(",", 1)
+                mime = header.split(":")[1].split(";")[0]
+                ext = mimetypes.guess_extension(mime) or ".bin"
+                ext = _EXT_FIXES.get(ext, ext)
+                filename = f"image_{i + 1}{ext}"
+                with open(os.path.join(images_dir, filename), "wb") as f:
+                    f.write(base64.b64decode(b64data))
+                img["src"] = f"{md_prefix}/{filename}"
+            except Exception:
+                continue
+        return str(soup)
diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py
index 3be65b016..c14c40318 100644
--- 
a/packages/markitdown/src/markitdown/converters/_epub_converter.py +++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py @@ -1,12 +1,19 @@ +import base64 +import io +import mimetypes import os +import posixpath +import re import zipfile from defusedxml import minidom from xml.dom.minidom import Document +from bs4 import BeautifulSoup from typing import BinaryIO, Any, Dict, List from ._html_converter import HtmlConverter from .._base_converter import DocumentConverterResult +from ..converter_utils.images import resolve_images_dir from .._stream_info import StreamInfo ACCEPTED_MIME_TYPE_PREFIXES = [ @@ -98,22 +105,45 @@ def convert( ] # Extract and convert the content + # images_dir: optional base directory where images will be saved. + # A subdirectory image_{stem} is created inside it per file, so + # converting multiple files into the same dir never mixes images. + # When omitted, images are embedded inline as base64 data URIs. + save_images = kwargs.get("save_images", False) + actual_images_dir: str | None = None + md_images_prefix: str | None = None + if save_images: + actual_images_dir, md_images_prefix = resolve_images_dir( + save_images, stream_info, "epub" + ) + + namelist_set = set(z.namelist()) markdown_content: List[str] = [] for file in spine: - if file in z.namelist(): + if file in namelist_set: with z.open(file) as f: - filename = os.path.basename(file) - extension = os.path.splitext(filename)[1].lower() - mimetype = MIME_TYPE_MAPPING.get(extension) - converted_content = self._html_converter.convert( - f, - StreamInfo( - mimetype=mimetype, - extension=extension, - filename=filename, - ), - ) - markdown_content.append(converted_content.markdown.strip()) + html_bytes = f.read() + + # Resolve relative image src attributes so that images survive + # the conversion to Markdown. 
+ html_bytes = self._resolve_images( + html_bytes, file, z, namelist_set, + actual_images_dir, md_images_prefix, + ) + + filename = os.path.basename(file) + extension = os.path.splitext(filename)[1].lower() + mimetype = MIME_TYPE_MAPPING.get(extension) + converted_content = self._html_converter.convert( + io.BytesIO(html_bytes), + StreamInfo( + mimetype=mimetype, + extension=extension, + filename=filename, + ), + keep_data_uris=actual_images_dir is None, + ) + markdown_content.append(converted_content.markdown.strip()) # Format and add the metadata metadata_markdown = [] @@ -129,6 +159,47 @@ def convert( markdown="\n\n".join(markdown_content), title=metadata["title"] ) + def _resolve_images( + self, + html_bytes: bytes, + html_path: str, + z: zipfile.ZipFile, + namelist_set: set, + images_dir: str | None, + md_images_prefix: str | None, + ) -> bytes: + """Rewrite attributes so images survive HTML-to-Markdown conversion. + + If *images_dir* is given, each image is extracted there and the src is + replaced with *md_images_prefix*/filename (a path relative to the markdown + file). Otherwise the image is embedded as a base64 data URI. 
+ """ + soup = BeautifulSoup(html_bytes, "html.parser") + changed = False + html_dir = posixpath.dirname(html_path) + + for img in soup.find_all("img"): + src = img.get("src", "") + if not src or src.startswith("data:") or src.startswith("http"): + continue + resolved = posixpath.normpath(posixpath.join(html_dir, src)) + if resolved not in namelist_set: + continue + img_bytes = z.read(resolved) + if images_dir: + img_filename = os.path.basename(resolved) + with open(os.path.join(images_dir, img_filename), "wb") as out: + out.write(img_bytes) + img["src"] = f"{md_images_prefix}/{img_filename}" + else: + mime, _ = mimetypes.guess_type(resolved) + mime = mime or "image/jpeg" + b64 = base64.b64encode(img_bytes).decode("ascii") + img["src"] = f"data:{mime};base64,{b64}" + changed = True + + return soup.encode("utf-8") if changed else html_bytes + def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None: """Convenience function to extract a single occurrence of a tag (e.g., title).""" texts = self._get_all_texts_from_nodes(dom, tag_name) diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index ffbcbd990..0e756ea8c 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,11 +1,13 @@ +import os +import re import sys import io -import re from typing import BinaryIO, Any from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE +from ..converter_utils.images import resolve_images_dir # Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10") PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$") @@ -492,6 +494,99 @@ def _extract_tables_from_words(page: Any) -> list[list[list[str]]]: return [table_rows] +def 
_iter_lt_images(layout: Any):
+    """Recursively yield LTImage objects from a pdfminer layout tree."""
+    from pdfminer.layout import LTFigure, LTImage
+
+    for elem in layout:
+        if isinstance(elem, LTFigure):
+            yield from _iter_lt_images(elem)
+        elif isinstance(elem, LTImage):
+            yield elem
+
+
+def _extract_region_text(region: Any) -> str:
+    """Extract text from a cropped page region, preserving table structure where present."""
+    form_text = _extract_form_content_from_words(region)
+    if form_text is not None:
+        return form_text.strip()
+    text = region.extract_text()
+    return text.strip() if text else ""
+
+
+def _extract_text_with_images(
+    page: Any, image_items: list, md_prefix: str
+) -> str:
+    """Extract text from *page* with images interleaved at their vertical positions.
+
+    *image_items* is a list of (pptop, ppbottom, filename) tuples sorted by pptop,
+    where coordinates are in pdfplumber's top-down system.
+
+    Each text region between images is passed through _extract_form_content_from_words
+    so that table structure is preserved even when images are present.
+    """
+    chunks = []
+    current_y = 0.0
+    page_h = page.height
+    page_w = page.width
+
+    for img_top, img_bottom, filename in image_items:
+        if img_top > current_y + 1:
+            text = _extract_region_text(page.crop((0, current_y, page_w, img_top)))
+            if text:
+                chunks.append(text)
+        chunks.append(f"![image]({md_prefix}/{filename})")
+        current_y = max(current_y, img_bottom)
+
+    if current_y < page_h - 1:
+        text = _extract_region_text(page.crop((0, current_y, page_w, page_h)))
+        if text:
+            chunks.append(text)
+
+    return "\n\n".join(chunks)
+
+
+def _save_lt_image(lt_img: Any, images_dir: str, index: int) -> str | None:
+    """Save an LTImage to *images_dir*. Returns the saved path, or None on failure.
+
+    JPEG images (DCTDecode) are written from their raw compressed bytes so no
+    additional dependencies are required. 
Other formats are decoded and saved + as PNG via Pillow; if Pillow is not installed those images are skipped. + """ + try: + filters = lt_img.stream.get_filters() or [] + filter_names = [f[0] if isinstance(f, tuple) else f for f in filters] + + if filter_names in (["DCTDecode"], ["JPXDecode"]): + ext = ".jpg" if "DCTDecode" in filter_names else ".jp2" + img_bytes = lt_img.stream.get_rawdata() + else: + try: + import PIL.Image as PILImage + + raw = lt_img.stream.get_data() + attrs = lt_img.stream.attrs + w, h = int(attrs.get("Width", 0)), int(attrs.get("Height", 0)) + if not w or not h: + return None + mode = "RGB" if "RGB" in str(attrs.get("ColorSpace", "")) else "L" + img = PILImage.frombytes(mode, (w, h), raw) + buf = io.BytesIO() + img.save(buf, format="PNG") + img_bytes = buf.getvalue() + ext = ".png" + except Exception: + return None + + filename = f"image_{index}{ext}" + dest = os.path.join(images_dir, filename) + with open(dest, "wb") as fout: + fout.write(img_bytes) + return dest + except Exception: + return None + + class PdfConverter(DocumentConverter): """ Converts PDFs to Markdown. 
@@ -536,6 +631,16 @@ def convert( assert isinstance(file_stream, io.IOBase) + save_images = kwargs.get("save_images", False) + actual_images_dir: str | None = None + md_images_prefix: str | None = None + img_count = 0 + + if save_images: + actual_images_dir, md_images_prefix = resolve_images_dir( + save_images, stream_info, "pdf" + ) + # Read file stream into BytesIO for compatibility with pdfplumber pdf_bytes = io.BytesIO(file_stream.read()) @@ -551,23 +656,56 @@ def convert( with pdfplumber.open(pdf_bytes) as pdf: for page_idx, page in enumerate(pdf.pages): + # Collect and save images before closing the page + page_image_items: list = [] + if actual_images_dir: + for lt_img in _iter_lt_images(page.layout): + dest = _save_lt_image( + lt_img, actual_images_dir, img_count + 1 + ) + if dest: + img_count += 1 + pptop = page.height - lt_img.y1 + ppbot = page.height - lt_img.y0 + page_image_items.append( + (pptop, ppbot, os.path.basename(dest)) + ) + page_image_items.sort(key=lambda x: x[0]) + page_content = _extract_form_content_from_words(page) if page_content is not None: form_page_count += 1 if page_content.strip(): - markdown_chunks.append(page_content) + if page_image_items: + # Images present: use crop-based extraction so + # images appear at their correct vertical position + # rather than being appended after the form content. 
+ chunk = _extract_text_with_images( + page, page_image_items, md_images_prefix + ) + else: + chunk = page_content + if chunk: + markdown_chunks.append(chunk) else: plain_page_indices.append(page_idx) - text = page.extract_text() - if text and text.strip(): - markdown_chunks.append(text.strip()) + if page_image_items: + chunk = _extract_text_with_images( + page, page_image_items, md_images_prefix + ) + else: + text = page.extract_text() + chunk = text.strip() if text else "" + if chunk: + markdown_chunks.append(chunk) page.close() # Free cached page data immediately - # If no pages had form-style content, use pdfminer for - # the whole document (better text spacing for prose). - if form_page_count == 0: + # If no pages had form-style content, use pdfminer for better + # text spacing — unless images were requested (need pdfplumber + # positions for interleaving). + if form_page_count == 0 and not save_images: pdf_bytes.seek(0) markdown = pdfminer.high_level.extract_text(pdf_bytes) else: diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index 360f17706..c381e1953 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -1,5 +1,6 @@ import sys import base64 +import mimetypes import os import io import re @@ -10,6 +11,7 @@ from ._html_converter import HtmlConverter from ._llm_caption import llm_caption +from ..converter_utils.images import resolve_images_dir from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE @@ -79,9 +81,19 @@ def convert( ) # Perform the conversion + save_images = kwargs.get("save_images", False) + if save_images: + actual_images_dir, md_images_prefix = resolve_images_dir( + save_images, stream_info, "pptx" + ) + else: + 
actual_images_dir = None + md_images_prefix = None + presentation = pptx.Presentation(file_stream) md_content = "" slide_num = 0 + img_count = 0 for slide in presentation.slides: slide_num += 1 @@ -140,8 +152,18 @@ def get_shape_content(shape, **kwargs): alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text) alt_text = re.sub(r"\s+", " ", alt_text).strip() - # If keep_data_uris is True, use base64 encoding for images - if kwargs.get("keep_data_uris", False): + # Emit the image reference + if actual_images_dir: + nonlocal img_count + img_count += 1 + content_type = shape.image.content_type or "image/png" + ext = mimetypes.guess_extension(content_type) or ".png" + ext = {".jpe": ".jpg", ".jpeg": ".jpg"}.get(ext, ext) + img_filename = f"image_{img_count}{ext}" + with open(os.path.join(actual_images_dir, img_filename), "wb") as f: + f.write(shape.image.blob) + md_content += f"\n![{alt_text}]({md_images_prefix}/{img_filename})\n" + elif kwargs.get("keep_data_uris", False): blob = shape.image.blob content_type = shape.image.content_type or "image/png" b64_string = base64.b64encode(blob).decode("utf-8")