From ca951502cb3321c6201779646e920380a3a51796 Mon Sep 17 00:00:00 2001
From: Bruce-anle <840596168@qq.com>
Date: Sat, 9 May 2026 01:08:44 +0800
Subject: [PATCH] fix: convert VML pict images

Background: DOCX files can contain legacy VML images under w:pict/v:imagedata. The markdown converter only handled w:drawing images, so those images were skipped.

Changes: detect w:pict VML image relationships and emit markdown image references using the existing image relationship map.

Verification: /home/brucean/doc4agent/.venv/bin/python -m pytest tests -q -p no:cacheprovider passed.
---
 .../converters/markdown_converter.py          | 16 ++++++++++
 tests/test_markdown_vml_pict_images.py        | 29 +++++++++++++++++++
 2 files changed, 45 insertions(+)
 create mode 100644 tests/test_markdown_vml_pict_images.py

diff --git a/docx2everything/converters/markdown_converter.py b/docx2everything/converters/markdown_converter.py
index 0ff408f..5b2f36d 100644
--- a/docx2everything/converters/markdown_converter.py
+++ b/docx2everything/converters/markdown_converter.py
@@ -347,6 +347,22 @@ def parse_paragraph_to_markdown(p_elem, numbering_info=None, hyperlinks=None, im
                     para_text += '\n![' + img_filename + '](' + img_md_path + ')\n'
                 elif img_path:
                     para_text += '\n![' + os.path.basename(img_path) + '](' + img_path + ')\n'
+
+    # Handle legacy VML images (w:pict containing v:imagedata); these are not w:drawing elements.
+    for pict in p_elem.findall('.//' + qn('w:pict')):
+        image_data = pict.find('.//{urn:schemas-microsoft-com:vml}imagedata')
+        if image_data is None:
+            continue
+
+        rel_id = image_data.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id')
+        if rel_id and images:
+            img_path = images.get(rel_id, '')
+            if img_dir and img_path:
+                img_filename = os.path.basename(img_path)
+                img_md_path = os.path.join(img_dir, img_filename)
+                para_text += '\n![' + img_filename + '](' + img_md_path + ')\n'
+            elif img_path:
+                para_text += '\n![' + os.path.basename(img_path) + '](' + img_path + ')\n'
     
     para_text = para_text.strip()
     
diff --git a/tests/test_markdown_vml_pict_images.py b/tests/test_markdown_vml_pict_images.py
new file mode 100644
index 0000000..8f4737f
--- /dev/null
+++ b/tests/test_markdown_vml_pict_images.py
@@ -0,0 +1,29 @@
+import xml.etree.ElementTree as ET
+
+from docx2everything.converters.markdown_converter import parse_paragraph_to_markdown
+
+
+W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+R_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
+V_NS = "urn:schemas-microsoft-com:vml"
+
+
+def test_vml_pict_image_is_converted_to_markdown_image():
+    paragraph = ET.fromstring(f"""
+    <w:p xmlns:w="{W_NS}" xmlns:r="{R_NS}" xmlns:v="{V_NS}">
+      <w:r>
+        <w:pict>
+          <v:shape>
+            <v:imagedata r:id="rIdImage1"/>
+          </v:shape>
+        </w:pict>
+      </w:r>
+    </w:p>
+    """)
+    # img_dir is omitted, so the link should be the mapped path unchanged (assumes a falsy default).
+    markdown = parse_paragraph_to_markdown(
+        paragraph,
+        images={"rIdImage1": "media/image1.png"},
+    )
+    # The converter strips the newlines it adds around the image reference.
+    assert markdown == "![image1.png](media/image1.png)"