From f0d3acaf357103b46e102f658d16b2052b215310 Mon Sep 17 00:00:00 2001
From: Viktor Lesyk <vilesyk@microsoft.com>
Date: Thu, 8 Jan 2026 19:35:23 +0100
Subject: [PATCH 1/4] Fix: PDF parsing doesn't support partially numbered lists

---
 .../markitdown/src/markitdown/__about__.py    |   2 +-
 .../markitdown/converters/_pdf_converter.py   |  68 +++++++
 .../masterformat_partial_numbering.pdf        | Bin 0 -> 2114 bytes
 .../markitdown/tests/test_pdf_masterformat.py | 172 ++++++++++++++++++
 4 files changed, 241 insertions(+), 1 deletion(-)
 create mode 100644 packages/markitdown/tests/test_files/masterformat_partial_numbering.pdf
 create mode 100644 packages/markitdown/tests/test_pdf_masterformat.py

diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py
index 4c8b68f6d..3de6ec29f 100644
--- a/packages/markitdown/src/markitdown/__about__.py
+++ b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.4"
+__version__ = "0.1.5"
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index b692f169f..76d17a670 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,11 +1,62 @@
 import sys
 import io
+import re
 from typing import BinaryIO, Any
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
+# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
+PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
+
+
+def _merge_partial_numbering_lines(text: str) -> str:
+    """Re-attach MasterFormat partial numbering markers to their text.
+
+    MasterFormat documents number list items as ``.1``, ``.2``, ...; some
+    PDF extractors emit the marker on its own line, apart from the item
+    text (possibly with blank lines in between)::
+
+        .1
+        The intent of this Request for Proposal...
+
+    Args:
+        text: Extracted text; lines are separated by newline characters.
+
+    Returns:
+        The text with each marker-only line joined by one space to the next
+        non-empty line (blank lines in between are dropped). A marker whose
+        next non-empty line is itself a marker, or that has no following
+        text, is kept as is so the later marker still gets its own text.
+    """
+    lines = text.split("\n")
+    result_lines: list[str] = []
+    i = 0
+
+    while i < len(lines):
+        line = lines[i]
+        stripped = line.strip()
+        i += 1
+        if not PARTIAL_NUMBERING_PATTERN.match(stripped):
+            result_lines.append(line)
+            continue
+
+        # Find the next non-empty line: the candidate text for this marker.
+        j = i
+        while j < len(lines) and not lines[j].strip():
+            j += 1
+
+        candidate = lines[j].strip() if j < len(lines) else ""
+        if candidate and not PARTIAL_NUMBERING_PATTERN.match(candidate):
+            result_lines.append(f"{stripped} {candidate}")
+            i = j + 1  # Skip the blank lines and the merged text line
+        else:
+            result_lines.append(line)
+
+    return "\n".join(result_lines)
+
+
 # Load dependencies
 _dependency_exc_info = None
 try:
@@ -117,6 +168,14 @@ def _extract_form_content_from_words(page: Any) -> str | None:
         # Determine row type
         is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60
 
+        # Check for MasterFormat-style partial numbering (e.g., ".1", ".2")
+        # These should be treated as list items, not table rows
+        has_partial_numbering = False
+        if row_words:
+            first_word = row_words[0]["text"].strip()
+            if PARTIAL_NUMBERING_PATTERN.match(first_word):
+                has_partial_numbering = True
+
         row_info.append(
             {
                 "y_key": y_key,
@@ -125,6 +184,7 @@ def _extract_form_content_from_words(page: Any) -> str | None:
                 "x_groups": x_groups,
                 "is_paragraph": is_paragraph,
                 "num_columns": len(x_groups),
+                "has_partial_numbering": has_partial_numbering,
             }
         )
 
@@ -156,6 +216,11 @@ def _extract_form_content_from_words(page: Any) -> str | None:
             info["is_table_row"] = False
             continue
 
+        # Rows with partial numbering (e.g., ".1", ".2") are list items, not table rows
+        if info["has_partial_numbering"]:
+            info["is_table_row"] = False
+            continue
+
         # Count how many global columns this row's words align with
         aligned_columns: set[int] = set()
         for word in info["words"]:
@@ -469,4 +534,7 @@ def convert(
             pdf_bytes.seek(0)
             markdown = pdfminer.high_level.extract_text(pdf_bytes)
 
+        # Post-process to merge MasterFormat-style partial numbering with following text
+        markdown = _merge_partial_numbering_lines(markdown)
+
         return DocumentConverterResult(markdown=markdown)
diff --git a/packages/markitdown/tests/test_files/masterformat_partial_numbering.pdf b/packages/markitdown/tests/test_files/masterformat_partial_numbering.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..246639a831813b1b6081aa5bdc0c0838c1bf3dfe
GIT binary patch
literal 2114
zcmb7F$$H{O5Wedv+AL;A8(OeI7y)9(?ATz8wX}q=kYp|KIETDIa?3r>nigZv@XaKd
zi4N+n>e~9R>Z;_pDHX^fjV1s5<M+S76uOQV3}gkgp$$DbfEqv&Ky%da2<-sugTTGQ
z@z>YaqIZ(DWTmJ&M~nmnaFhd9sbIK70t%5+fJ+phqa7v)7__xWAUgRF6lEVu4l;{(
zp!Ew3OjQPW%T^uDv<-l-OuKIT=2yl@K0;)a1o$jef;SftbM!}aw4f7Y>&#W;`afJ1
zM90#8#vk>Q{z_;58YmP5m=E~q{s13B&6Gvw6WAb{04zxXu1tX~z{j!|A<KQ1n?m0S
zJr(-kgk%Ix1TC^K_8fHv17rY?Qi9LnCj?ygC>v0GUmNdh7i<v>0G~PmN+D9m=bmhz
zOh$YRd;m5^%4dKvNc01+=xFF@2-SrTM~;m)5zGGw^fTfOIgl;KIJ3c?*{FxX-tj<M
zwjKNJ=!AX-;0+JTf$7-ID6(|(iXteMAPJr<Qw+`J2r@+wsSmkG`D;x^*B+RPCwp(8
z0KTh1q|}FRua2krAdLtV{g0d=Otnyj|IbSVAF96tR0EI>ZQq5eshdzkF6U-op<9J^
z5N%EzPj+4Oif~D`{Bx|Me1BlwaF7PdGu}5*>>Cgj(ln}#&mf<0KZ7_bsSj1jv;q`A
zDvJ87(e3g~6Qbu6y#)&a{7xWC185t;9<Uq{8S(>^g(KFMgG8WRckiv#R4=${&-K?h
z#Vt3T-KyoR;t!0!oKao67i??S)7(>H=x6n!z*eeseY_a$=f}yQQkL?r6uPN?`?)%w
z=D9}!4=>`+^;XWht}Q0KWTbP2TjBbVDRox8?DL&wrIwoC-QGVY3hi`zHJ)b7_7JWV
zgU(2)+(BQt4JGC+YFdr2z3jY;7Bxx{{xH)Ms&~!e!3b?JG~|bsSsLCYR!dyTkkXCP
z;712OO=L5%b)R6x*^3pDiOnO;)vs~6Up?x^i;=iAGkNN^bD4a6h>2uUe9*MrQb;hj
z!PqlPp4x)+z!4X4z)tF8+nsp1X(1r=?Rs{v*2H<eKdIF(bG>*kJIYqs>s0Ng;bsLn
zJP@(`zB!|nPHD4W`t)rzcBwpHE^dSzuacK4Wi}0VnoYhW&DQ!QRk)BwGm(!svMJYL
z%TqovOJ5bMg~W(WUF<d_eppr#H%ofGiy7L5aWU)-=b7VtF)FF6!(uWyY&-MaaZ6;U
z)q7)pRIQt5A*<jsQHAAN@t9XyX1q4;URAvv+b+uC@KOrKdEu6uC)bsP9<Lf~n#yT6
z*L*FO;8m?l^Xs*+e_<4A=T7dgh7ZNQqvc&Iv%ceGuC1nq`YtsS$vZkb*c2F2=AZVZ
zO#jfE90vLkr{`Tbxvx>V+GEjs?2Kgn&~YnFo*Y(Jsgc0@N>vl%QiB;jaf31$6B{=T
zy`?^n;nR@0y^3E*heUttr`ta7tr9|qGy1;?Xkfh8>HFyT<Ut)HKp7*xe1K&bngKfa
z!sAfak3L|3;gKYIP~Y$<3boa5cr^XJE<>S)_Kh4<{@xy6Ms58YxpJxel^1xjX+iHa
zAiGU~Z7NKO<#bhJh%&8{JV8(jWM!S!6_QpVtJ1=cCAL`dY-St(JhKZ0pqZ$1qi*$Y
v<2!?4c-Ph)5Owb<h6aZdfM^<^{mlFoBFs-+L-vAG5=f3HW65OeR>J-U#ll@}

literal 0
HcmV?d00001

diff --git a/packages/markitdown/tests/test_pdf_masterformat.py b/packages/markitdown/tests/test_pdf_masterformat.py
new file mode 100644
index 000000000..2b3e2c3a0
--- /dev/null
+++ b/packages/markitdown/tests/test_pdf_masterformat.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3 -m pytest
+"""Tests for MasterFormat-style partial numbering in PDF conversion."""
+
+import os
+import re
+import pytest
+
+from markitdown import MarkItDown
+
+TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
+
+
+class TestMasterFormatPartialNumbering:
+    """Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""
+
+    def test_partial_numbering_pattern_regex(self):
+        """Test that the partial numbering regex pattern correctly matches."""
+        from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN
+
+        # Should match partial numbering patterns
+        assert PARTIAL_NUMBERING_PATTERN.match(".1") is not None
+        assert PARTIAL_NUMBERING_PATTERN.match(".2") is not None
+        assert PARTIAL_NUMBERING_PATTERN.match(".10") is not None
+        assert PARTIAL_NUMBERING_PATTERN.match(".99") is not None
+
+        # Should NOT match other patterns
+        assert PARTIAL_NUMBERING_PATTERN.match("1.") is None
+        assert PARTIAL_NUMBERING_PATTERN.match("1.2") is None
+        assert PARTIAL_NUMBERING_PATTERN.match(".1.2") is None
+        assert PARTIAL_NUMBERING_PATTERN.match("text") is None
+        assert PARTIAL_NUMBERING_PATTERN.match(".a") is None
+        assert PARTIAL_NUMBERING_PATTERN.match("") is None
+
+    def test_masterformat_partial_numbering_not_split(self):
+        """Test that MasterFormat partial numbering stays with associated text.
+
+        MasterFormat documents use partial numbering like:
+            .1  The intent of this Request for Proposal...
+            .2  Available information relative to...
+
+        These should NOT be split into separate table columns, but kept
+        as coherent text lines with the number followed by its description.
+        """
+        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
+
+        markitdown = MarkItDown()
+        result = markitdown.convert(pdf_path)
+        text_content = result.text_content
+
+        # Partial numberings should NOT appear isolated on their own lines
+        # If they're isolated, it means the parser incorrectly split them from their text
+        lines = text_content.split("\n")
+        isolated_numberings = []
+        for line in lines:
+            stripped = line.strip()
+            # Check if line contains ONLY a partial numbering (with possible whitespace/pipes)
+            cleaned = stripped.replace("|", "").strip()
+            if cleaned in [".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".10"]:
+                isolated_numberings.append(stripped)
+
+        assert len(isolated_numberings) == 0, (
+            f"Partial numberings should not be isolated from their text. "
+            f"Found isolated: {isolated_numberings}"
+        )
+
+        # Verify that partial numberings appear WITH following text on the same line
+        # Look for patterns like ".1 The intent" or ".1  Some text"
+        partial_with_text = re.findall(r"\.\d+\s+\w+", text_content)
+        assert len(partial_with_text) > 0, (
+            "Expected to find partial numberings followed by text on the same line"
+        )
+
+    def test_masterformat_content_preserved(self):
+        """Test that MasterFormat document content is fully preserved."""
+        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
+
+        markitdown = MarkItDown()
+        result = markitdown.convert(pdf_path)
+        text_content = result.text_content
+
+        # Verify key content from the MasterFormat document is preserved
+        expected_content = [
+            "RFP for Construction Management Services",
+            "Section 00 00 43",
+            "Instructions to Respondents",
+            "Ken Sargent House",
+            "INTENT",
+            "Request for Proposal",
+            "KEN SARGENT HOUSE",
+            "GRANDE PRAIRIE, ALBERTA",
+            "Section 00 00 45",
+        ]
+
+        for content in expected_content:
+            assert content in text_content, (
+                f"Expected content '{content}' not found in extracted text"
+            )
+
+        # Verify partial numbering is followed by text on the same line
+        # .1 should be followed by "The intent" on the same line
+        assert re.search(r"\.1\s+The intent", text_content), (
+            "Partial numbering .1 should be followed by 'The intent' text"
+        )
+
+        # .2 should be followed by "Available information" on the same line
+        assert re.search(r"\.2\s+Available information", text_content), (
+            "Partial numbering .2 should be followed by 'Available information' text"
+        )
+
+        # Ensure text content is not empty and has reasonable length
+        assert len(text_content.strip()) > 100, (
+            "MasterFormat document should have substantial text content"
+        )
+
+    def test_merge_partial_numbering_with_empty_lines_between(self):
+        """Test that partial numberings merge correctly even with empty lines between.
+
+        When PDF extractors produce output like:
+            .1
+
+            The intent of this Request...
+
+        the converter must still join the numbering with its text, no matter
+        how many blank lines separate them.
+        """
+        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
+
+        markitdown = MarkItDown()
+        result = markitdown.convert(pdf_path)
+        lines = result.text_content.split("\n")
+
+        for i, line in enumerate(lines):
+            stripped = line.strip()
+            # Any partial numbering (.1, .10, ...) alone on a line is a candidate
+            if not re.match(r"^\.\d+$", stripped):
+                continue
+            # Scan every later line (not just the next two): any text found
+            # there should have been merged onto the numbering's line.
+            following = next(
+                (later.strip() for later in lines[i + 1 :] if later.strip()), None
+            )
+            if following is not None:
+                pytest.fail(
+                    f"Partial numbering '{stripped}' on line {i} was not "
+                    f"merged with following text '{following[:30]}...'"
+                )
+
+    def test_multiple_partial_numberings_all_merged(self):
+        """Test that all partial numberings in a document are properly merged."""
+        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
+
+        markitdown = MarkItDown()
+        result = markitdown.convert(pdf_path)
+        text_content = result.text_content
+
+        # Count occurrences of merged partial numberings (number followed by text)
+        merged_count = len(re.findall(r"\.\d+\s+[A-Za-z]", text_content))
+
+        # Count isolated partial numberings (number alone on a line)
+        isolated_count = 0
+        for line in text_content.split("\n"):
+            stripped = line.strip()
+            if re.match(r"^\.\d+$", stripped):
+                isolated_count += 1
+
+        assert merged_count >= 2, (
+            f"Expected at least 2 merged partial numberings, found {merged_count}"
+        )
+        assert isolated_count == 0, (
+            f"Found {isolated_count} isolated partial numberings that weren't merged"
+        )
+

From d7a696c11bf7c106fc94ae249b1186260d1a01c0 Mon Sep 17 00:00:00 2001
From: Viktor Lesyk <vilesyk@microsoft.com>
Date: Thu, 8 Jan 2026 19:41:24 +0100
Subject: [PATCH 2/4] Refactor: Move import of PARTIAL_NUMBERING_PATTERN to the
 top of the test file

---
 packages/markitdown/tests/test_pdf_masterformat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/markitdown/tests/test_pdf_masterformat.py b/packages/markitdown/tests/test_pdf_masterformat.py
index 2b3e2c3a0..4b1c5d727 100644
--- a/packages/markitdown/tests/test_pdf_masterformat.py
+++ b/packages/markitdown/tests/test_pdf_masterformat.py
@@ -6,6 +6,7 @@
 import pytest
 
 from markitdown import MarkItDown
+from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN
 
 TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
 
@@ -15,7 +16,6 @@ class TestMasterFormatPartialNumbering:
 
     def test_partial_numbering_pattern_regex(self):
         """Test that the partial numbering regex pattern correctly matches."""
-        from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN
 
         # Should match partial numbering patterns
         assert PARTIAL_NUMBERING_PATTERN.match(".1") is not None

From c0c7fe2949465e014b26d222f2d2dbfd2b2c4845 Mon Sep 17 00:00:00 2001
From: Viktor Lesyk <vilesyk@microsoft.com>
Date: Thu, 8 Jan 2026 19:44:57 +0100
Subject: [PATCH 3/4] Refactor: Improve assertion formatting in partial
 numbering tests

---
 .../markitdown/tests/test_pdf_masterformat.py | 43 +++++++++----------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/packages/markitdown/tests/test_pdf_masterformat.py b/packages/markitdown/tests/test_pdf_masterformat.py
index 4b1c5d727..8d3eb0739 100644
--- a/packages/markitdown/tests/test_pdf_masterformat.py
+++ b/packages/markitdown/tests/test_pdf_masterformat.py
@@ -66,9 +66,9 @@ def test_masterformat_partial_numbering_not_split(self):
         # Verify that partial numberings appear WITH following text on the same line
         # Look for patterns like ".1 The intent" or ".1  Some text"
         partial_with_text = re.findall(r"\.\d+\s+\w+", text_content)
-        assert len(partial_with_text) > 0, (
-            "Expected to find partial numberings followed by text on the same line"
-        )
+        assert (
+            len(partial_with_text) > 0
+        ), "Expected to find partial numberings followed by text on the same line"
 
     def test_masterformat_content_preserved(self):
         """Test that MasterFormat document content is fully preserved."""
@@ -92,25 +92,25 @@ def test_masterformat_content_preserved(self):
         ]
 
         for content in expected_content:
-            assert content in text_content, (
-                f"Expected content '{content}' not found in extracted text"
-            )
+            assert (
+                content in text_content
+            ), f"Expected content '{content}' not found in extracted text"
 
         # Verify partial numbering is followed by text on the same line
         # .1 should be followed by "The intent" on the same line
-        assert re.search(r"\.1\s+The intent", text_content), (
-            "Partial numbering .1 should be followed by 'The intent' text"
-        )
+        assert re.search(
+            r"\.1\s+The intent", text_content
+        ), "Partial numbering .1 should be followed by 'The intent' text"
 
         # .2 should be followed by "Available information" on the same line
-        assert re.search(r"\.2\s+Available information", text_content), (
-            "Partial numbering .2 should be followed by 'Available information' text"
-        )
+        assert re.search(
+            r"\.2\s+Available information", text_content
+        ), "Partial numbering .2 should be followed by 'Available information' text"
 
         # Ensure text content is not empty and has reasonable length
-        assert len(text_content.strip()) > 100, (
-            "MasterFormat document should have substantial text content"
-        )
+        assert (
+            len(text_content.strip()) > 100
+        ), "MasterFormat document should have substantial text content"
 
     def test_merge_partial_numbering_with_empty_lines_between(self):
         """Test that partial numberings merge correctly even with empty lines between.
@@ -163,10 +163,9 @@ def test_multiple_partial_numberings_all_merged(self):
             if re.match(r"^\.\d+$", stripped):
                 isolated_count += 1
 
-        assert merged_count >= 2, (
-            f"Expected at least 2 merged partial numberings, found {merged_count}"
-        )
-        assert isolated_count == 0, (
-            f"Found {isolated_count} isolated partial numberings that weren't merged"
-        )
-
+        assert (
+            merged_count >= 2
+        ), f"Expected at least 2 merged partial numberings, found {merged_count}"
+        assert (
+            isolated_count == 0
+        ), f"Found {isolated_count} isolated partial numberings that weren't merged"

From 2574b0d2ec30b74e5a3e80964e34e864fd8b2dbb Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Thu, 8 Jan 2026 15:13:11 -0800
Subject: [PATCH 4/4] Move to beta channel.

---
 packages/markitdown/src/markitdown/__about__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py
index 3de6ec29f..659cf6ba2 100644
--- a/packages/markitdown/src/markitdown/__about__.py
+++ b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.5"
+__version__ = "0.1.5b1"