From ca951502cb3321c6201779646e920380a3a51796 Mon Sep 17 00:00:00 2001 From: Bruce-anle <840596168@qq.com> Date: Sat, 9 May 2026 01:08:44 +0800 Subject: [PATCH] fix: convert VML pict images Background: DOCX files can contain legacy VML images under w:pict/v:imagedata. The markdown converter only handled w:drawing images, so those images were skipped. Changes: detect w:pict VML image relationships and emit markdown image references using the existing image relationship map. Verification: /home/brucean/doc4agent/.venv/bin/python -m pytest tests -q -p no:cacheprovider passed. --- .../converters/markdown_converter.py | 16 ++++++++++ tests/test_markdown_vml_pict_images.py | 29 +++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 tests/test_markdown_vml_pict_images.py diff --git a/docx2everything/converters/markdown_converter.py b/docx2everything/converters/markdown_converter.py index 0ff408f..5b2f36d 100644 --- a/docx2everything/converters/markdown_converter.py +++ b/docx2everything/converters/markdown_converter.py @@ -347,6 +347,22 @@ def parse_paragraph_to_markdown(p_elem, numbering_info=None, hyperlinks=None, im para_text += '\n![' + img_filename + '](' + img_md_path + ')\n' elif img_path: para_text += '\n![' + os.path.basename(img_path) + '](' + img_path + ')\n' + + # Handle legacy VML images. 
+ for pict in p_elem.findall('.//' + qn('w:pict')): + image_data = pict.find('.//{urn:schemas-microsoft-com:vml}imagedata') + if image_data is None: + continue + + rel_id = image_data.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id') + if rel_id and images: + img_path = images.get(rel_id, '') + if img_dir and img_path: + img_filename = os.path.basename(img_path) + img_md_path = os.path.join(img_dir, img_filename) + para_text += '\n![' + img_filename + '](' + img_md_path + ')\n' + elif img_path: + para_text += '\n![' + os.path.basename(img_path) + '](' + img_path + ')\n' para_text = para_text.strip() diff --git a/tests/test_markdown_vml_pict_images.py b/tests/test_markdown_vml_pict_images.py new file mode 100644 index 0000000..8f4737f --- /dev/null +++ b/tests/test_markdown_vml_pict_images.py @@ -0,0 +1,29 @@ +import xml.etree.ElementTree as ET + +from docx2everything.converters.markdown_converter import parse_paragraph_to_markdown + + +W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" +R_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships" +V_NS = "urn:schemas-microsoft-com:vml" + + +def test_vml_pict_image_is_converted_to_markdown_image(): + paragraph = ET.fromstring(f""" + + + + + + + + + + """) + + markdown = parse_paragraph_to_markdown( + paragraph, + images={"rIdImage1": "media/image1.png"}, + ) + + assert markdown == "![image1.png](media/image1.png)"