Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
.vscode
debug/
test_pdf*.md
images_*/
my_*/

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
32 changes: 31 additions & 1 deletion packages/markitdown/src/markitdown/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: MIT
import argparse
import os
import re
import sys
import codecs
from textwrap import dedent
Expand Down Expand Up @@ -110,6 +112,17 @@ def main():
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)

parser.add_argument(
"--save-images",
nargs="?",
const=True,
default=False,
metavar="DIR",
help="Extract images from documents and save them to a directory. "
"If DIR is omitted, images are saved to ./images_{output_filename}/. "
"When omitted entirely, images are not included in the output (default).",
)

parser.add_argument("filename", nargs="?")
args = parser.parse_args()

Expand Down Expand Up @@ -186,15 +199,32 @@ def main():
else:
markitdown = MarkItDown(enable_plugins=args.use_plugins)

# Resolve the images directory path
save_images = args.save_images
if save_images is True:
# Auto-compute directory name from output filename, then input filename
if args.output:
stem = os.path.splitext(os.path.basename(args.output))[0]
elif args.filename:
stem = os.path.splitext(os.path.basename(args.filename))[0]
else:
stem = "output"
stem = re.sub(r"[^\w\-]", "_", stem)
save_images = f"images_{stem}"

if args.filename is None:
result = markitdown.convert_stream(
sys.stdin.buffer,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
save_images=save_images,
)
else:
result = markitdown.convert(
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
args.filename,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
save_images=save_images,
)

_handle_output(args, result)
Expand Down
45 changes: 45 additions & 0 deletions packages/markitdown/src/markitdown/converter_utils/images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os
import re

from .._stream_info import StreamInfo


def resolve_images_dir(
    save_images: bool | str,
    stream_info: StreamInfo,
    fallback_name: str,
) -> tuple[str, str]:
    """Turn a ``save_images`` kwarg into an images directory and a markdown prefix.

    Parameters
    ----------
    save_images:
        Either an explicit directory path (``str``), used verbatim as both
        the output directory and the markdown image prefix, or ``True`` to
        auto-derive ``images_{stem}`` from ``stream_info.filename`` (with
        *fallback_name* standing in when no filename is known).
    stream_info:
        Stream metadata; only ``filename`` is consulted here.
    fallback_name:
        Format-specific stem (e.g. ``"epub"``, ``"pdf"``) used when the
        stream has no filename and *save_images* is ``True``.

    Returns
    -------
    tuple[str, str]
        ``(actual_images_dir, md_images_prefix)`` — where image files are
        written, and the prefix for ``![alt](prefix/filename)`` links. The
        directory (and any missing parents) is created before returning.
    """
    if isinstance(save_images, str):
        # Explicit path: same string serves as directory and link prefix.
        images_dir, md_prefix = save_images, save_images
    else:
        # Auto-name from the input file; sanitize anything that is not a
        # word character or hyphen so the result is filesystem-safe.
        base = stream_info.filename or fallback_name
        stem = re.sub(r"[^\w\-]", "_", os.path.splitext(base)[0])
        images_dir = f"images_{stem}"
        md_prefix = f"./images_{stem}"

    os.makedirs(images_dir, exist_ok=True)
    return images_dir, md_prefix
44 changes: 40 additions & 4 deletions packages/markitdown/src/markitdown/converters/_docx_converter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
import base64
import mimetypes
import os
import re
import sys
import io
from warnings import warn

from bs4 import BeautifulSoup
from typing import BinaryIO, Any

from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx
from ..converter_utils.images import resolve_images_dir
from .._base_converter import DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
Expand All @@ -27,6 +33,9 @@

ACCEPTED_FILE_EXTENSIONS = [".docx"]

# Map mimetypes.guess_extension() quirks to sane extensions
_EXT_FIXES = {".jpe": ".jpg", ".jpeg": ".jpg"}


class DocxConverter(HtmlConverter):
"""
Expand Down Expand Up @@ -77,7 +86,34 @@ def convert(

style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
**kwargs,
)
html = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value

save_images = kwargs.get("save_images", False)
if save_images:
actual_images_dir, md_prefix = resolve_images_dir(
save_images, stream_info, "docx"
)
html = self._save_images(html, actual_images_dir, md_prefix)

return self._html_converter.convert_string(html, **kwargs)

def _save_images(self, html: str, images_dir: str, md_prefix: str) -> str:
    """Extract base64 data-URI images from mammoth HTML and save them to disk.

    Parameters
    ----------
    html:
        HTML produced by mammoth, with images inlined as ``data:`` URIs.
    images_dir:
        Directory to write the decoded image files into (expected to exist;
        callers create it via ``resolve_images_dir``).
    md_prefix:
        Path prefix used in the rewritten ``src`` attributes, typically the
        path from the markdown output to *images_dir*.

    Returns
    -------
    str
        The HTML with each saved image's ``src`` replaced by
        ``{md_prefix}/{filename}``. Non-data-URI images and images that
        fail to decode are left untouched (best-effort).
    """
    soup = BeautifulSoup(html, "html.parser")
    for i, img in enumerate(soup.find_all("img")):
        src = img.get("src", "")
        if not src.startswith("data:"):
            continue
        try:
            # "data:image/png;base64,...." -> mime "image/png", payload after ","
            header, b64data = src.split(",", 1)
            mime = header.split(":")[1].split(";")[0]
            ext = mimetypes.guess_extension(mime) or ".bin"
            ext = _EXT_FIXES.get(ext, ext)
            filename = f"image_{i + 1}{ext}"
            with open(os.path.join(images_dir, filename), "wb") as f:
                f.write(base64.b64decode(b64data))
            # Fix: point the src at the file just written; the previous code
            # emitted a literal "(unknown)" placeholder, breaking every link.
            img["src"] = f"{md_prefix}/{filename}"
        except Exception:
            # Best-effort: a malformed data URI must not abort conversion.
            continue
    return str(soup)
97 changes: 84 additions & 13 deletions packages/markitdown/src/markitdown/converters/_epub_converter.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
import base64
import io
import mimetypes
import os
import posixpath
import re
import zipfile
from defusedxml import minidom
from xml.dom.minidom import Document

from bs4 import BeautifulSoup
from typing import BinaryIO, Any, Dict, List

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverterResult
from ..converter_utils.images import resolve_images_dir
from .._stream_info import StreamInfo

ACCEPTED_MIME_TYPE_PREFIXES = [
Expand Down Expand Up @@ -98,22 +105,45 @@ def convert(
]

# Extract and convert the content
# save_images: False (default) embeds images inline as base64 data URIs;
# a string is used directly as the directory to extract images into; True
# auto-derives an images_{stem} directory from the input filename (see
# resolve_images_dir).
save_images = kwargs.get("save_images", False)
actual_images_dir: str | None = None
md_images_prefix: str | None = None
if save_images:
actual_images_dir, md_images_prefix = resolve_images_dir(
save_images, stream_info, "epub"
)

namelist_set = set(z.namelist())
markdown_content: List[str] = []
for file in spine:
if file in z.namelist():
if file in namelist_set:
with z.open(file) as f:
filename = os.path.basename(file)
extension = os.path.splitext(filename)[1].lower()
mimetype = MIME_TYPE_MAPPING.get(extension)
converted_content = self._html_converter.convert(
f,
StreamInfo(
mimetype=mimetype,
extension=extension,
filename=filename,
),
)
markdown_content.append(converted_content.markdown.strip())
html_bytes = f.read()

# Resolve relative image src attributes so that images survive
# the conversion to Markdown.
html_bytes = self._resolve_images(
html_bytes, file, z, namelist_set,
actual_images_dir, md_images_prefix,
)

filename = os.path.basename(file)
extension = os.path.splitext(filename)[1].lower()
mimetype = MIME_TYPE_MAPPING.get(extension)
converted_content = self._html_converter.convert(
io.BytesIO(html_bytes),
StreamInfo(
mimetype=mimetype,
extension=extension,
filename=filename,
),
keep_data_uris=actual_images_dir is None,
)
markdown_content.append(converted_content.markdown.strip())

# Format and add the metadata
metadata_markdown = []
Expand All @@ -129,6 +159,47 @@ def convert(
markdown="\n\n".join(markdown_content), title=metadata["title"]
)

def _resolve_images(
    self,
    html_bytes: bytes,
    html_path: str,
    z: zipfile.ZipFile,
    namelist_set: set,
    images_dir: str | None,
    md_images_prefix: str | None,
) -> bytes:
    """Rewrite <img src> attributes so images survive HTML-to-Markdown conversion.

    If *images_dir* is given, each image is extracted there and the src is
    replaced with *md_images_prefix*/filename (a path relative to the markdown
    file). Otherwise the image is embedded as a base64 data URI.

    Parameters
    ----------
    html_bytes:
        Raw HTML of one spine document.
    html_path:
        The document's path inside the EPUB zip, used to resolve relative
        image paths.
    z:
        The open EPUB zip archive.
    namelist_set:
        Set of all entry names in *z*, for O(1) membership tests.
    images_dir:
        Directory to extract images into, or ``None`` to inline them.
    md_images_prefix:
        Prefix for rewritten src attributes; only used when *images_dir* is set.

    Returns
    -------
    bytes
        The rewritten HTML re-encoded as UTF-8, or the original *html_bytes*
        unchanged when no image needed rewriting.
    """
    soup = BeautifulSoup(html_bytes, "html.parser")
    changed = False
    html_dir = posixpath.dirname(html_path)

    for img in soup.find_all("img"):
        src = img.get("src", "")
        # Skip empty, already-inlined, and remote (http/https) sources.
        if not src or src.startswith("data:") or src.startswith("http"):
            continue
        resolved = posixpath.normpath(posixpath.join(html_dir, src))
        if resolved not in namelist_set:
            continue
        img_bytes = z.read(resolved)
        if images_dir:
            # Fix: flatten the full zip path into the filename. Using only
            # the basename let two images with the same name in different
            # EPUB folders (e.g. ch1/cover.jpg and ch2/cover.jpg) overwrite
            # each other; zip entry paths are unique, so this cannot collide.
            img_filename = resolved.replace("/", "_")
            with open(os.path.join(images_dir, img_filename), "wb") as out:
                out.write(img_bytes)
            img["src"] = f"{md_images_prefix}/{img_filename}"
        else:
            mime, _ = mimetypes.guess_type(resolved)
            mime = mime or "image/jpeg"
            b64 = base64.b64encode(img_bytes).decode("ascii")
            img["src"] = f"data:{mime};base64,{b64}"
        changed = True

    return soup.encode("utf-8") if changed else html_bytes

def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
texts = self._get_all_texts_from_nodes(dom, tag_name)
Expand Down
Loading