From f0d3acaf357103b46e102f658d16b2052b215310 Mon Sep 17 00:00:00 2001 From: Viktor Lesyk Date: Thu, 8 Jan 2026 19:35:23 +0100 Subject: [PATCH 1/4] Fix: PDF parsing doesn't support partially numbered lists --- .../markitdown/src/markitdown/__about__.py | 2 +- .../markitdown/converters/_pdf_converter.py | 68 +++++++ .../masterformat_partial_numbering.pdf | Bin 0 -> 2114 bytes .../markitdown/tests/test_pdf_masterformat.py | 172 ++++++++++++++++++ 4 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 packages/markitdown/tests/test_files/masterformat_partial_numbering.pdf create mode 100644 packages/markitdown/tests/test_pdf_masterformat.py diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py index 4c8b68f6d..3de6ec29f 100644 --- a/packages/markitdown/src/markitdown/__about__.py +++ b/packages/markitdown/src/markitdown/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.1.4" +__version__ = "0.1.5" diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index b692f169f..76d17a670 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,11 +1,62 @@ import sys import io +import re from typing import BinaryIO, Any from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE +# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10") +PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$") + + +def _merge_partial_numbering_lines(text: str) -> str: + """ + Post-process extracted text to merge MasterFormat-style partial numbering + with the following text line. 
+ + MasterFormat documents use partial numbering like: + .1 The intent of this Request for Proposal... + .2 Available information relative to... + + Some PDF extractors split these into separate lines: + .1 + The intent of this Request for Proposal... + + This function merges them back together. + """ + lines = text.split("\n") + result_lines: list[str] = [] + i = 0 + + while i < len(lines): + line = lines[i] + stripped = line.strip() + + # Check if this line is ONLY a partial numbering + if PARTIAL_NUMBERING_PATTERN.match(stripped): + # Look for the next non-empty line to merge with + j = i + 1 + while j < len(lines) and not lines[j].strip(): + j += 1 + + if j < len(lines): + # Merge the partial numbering with the next line + next_line = lines[j].strip() + result_lines.append(f"{stripped} {next_line}") + i = j + 1 # Skip past the merged line + else: + # No next line to merge with, keep as is + result_lines.append(line) + i += 1 + else: + result_lines.append(line) + i += 1 + + return "\n".join(result_lines) + + # Load dependencies _dependency_exc_info = None try: @@ -117,6 +168,14 @@ def _extract_form_content_from_words(page: Any) -> str | None: # Determine row type is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60 + # Check for MasterFormat-style partial numbering (e.g., ".1", ".2") + # These should be treated as list items, not table rows + has_partial_numbering = False + if row_words: + first_word = row_words[0]["text"].strip() + if PARTIAL_NUMBERING_PATTERN.match(first_word): + has_partial_numbering = True + row_info.append( { "y_key": y_key, @@ -125,6 +184,7 @@ def _extract_form_content_from_words(page: Any) -> str | None: "x_groups": x_groups, "is_paragraph": is_paragraph, "num_columns": len(x_groups), + "has_partial_numbering": has_partial_numbering, } ) @@ -156,6 +216,11 @@ def _extract_form_content_from_words(page: Any) -> str | None: info["is_table_row"] = False continue + # Rows with partial numbering (e.g., ".1", ".2") are list 
items, not table rows + if info["has_partial_numbering"]: + info["is_table_row"] = False + continue + # Count how many global columns this row's words align with aligned_columns: set[int] = set() for word in info["words"]: @@ -469,4 +534,7 @@ def convert( pdf_bytes.seek(0) markdown = pdfminer.high_level.extract_text(pdf_bytes) + # Post-process to merge MasterFormat-style partial numbering with following text + markdown = _merge_partial_numbering_lines(markdown) + return DocumentConverterResult(markdown=markdown) diff --git a/packages/markitdown/tests/test_files/masterformat_partial_numbering.pdf b/packages/markitdown/tests/test_files/masterformat_partial_numbering.pdf new file mode 100644 index 0000000000000000000000000000000000000000..246639a831813b1b6081aa5bdc0c0838c1bf3dfe GIT binary patch literal 2114 zcmb7F$$H{O5Wedv+AL;A8(OeI7y)9(?ATz8wX}q=kYp|KIETDIa?3r>nigZv@XaKd zi4N+n>e~9R>Z;_pDHX^fjV1s5YaqIZ(DWTmJ&M~nmnaFhd9sbIK70t%5+fJ+phqa7v)7__xWAUgRF6lEVu4l;{( zp!Ew3OjQPW%T^uDv<-l-OuKIT=2yl@K0;)a1o$jef;SftbM!}aw4f7Y>&#W;`afJ1 zM90#8#vk>Q{z_;58YmP5m=E~q{s13B&6Gvw6WAb{04zxXu1tX~z{j!|Av0GUmNdh7i0G~PmN+D9m=bmhz zOh$YRd;m5^%4dKvNc01+=xFF@2-SrTM~;m)5zGGw^fTfOIgl;KIJ3c?*{FxX-tjZQq5eshdzkF6U-op<9J^ z5N%EzPj+4Oif~D`{Bx|Me1BlwaF7PdGu}5*>>Cgj(ln}#&mf<0KZ7_bsSj1jv;q`A zDvJ87(e3g~6Qbu6y#)&a{7xWC185t;9^g(KFMgG8WRckiv#R4=${&-K?h z#Vt3T-KyoR;t!0!oKao67i??S)7(>H=x6n!z*eeseY_a$=f}yQQkL?r6uPN?`?)%w z=D9}!4=>`+^;XWht}Q0KWTbP2TjBbVDRox8?DL&wrIwoC-QGVY3hi`zHJ)b7_7JWV zgU(2)+(BQt4JGC+YFdr2z3jY;7Bxx{{xH)Ms&~!e!3b?JG~|bsSsLCYR!dyTkkXCP z;712OO=L5%b)R6x*^3pDiOnO;)vs~6Up?x^i;=iAGkNN^bD4a6h>2uUe9*MrQb;hj z!PqlPp4x)+z!4X4z)tF8+nsp1X(1r=?Rs{v*2H*kJIYqs>s0Ng;bsLn zJP@(`zB!|nPHD4W`t)rzcBwpHE^dSzuacK4Wi}0VnoYhW&DQ!QRk)BwGm(!svMJYL z%TqovOJ5bMg~W(WUFV+GEjs?2Kgn&~YnFo*Y(Jsgc0@N>vl%QiB;jaf31$6B{=T zy`?^n;nR@0y^3E*heUttr`ta7tr9|qGy1;?Xkfh8>HFyTS)_Kh4<{@xy6Ms58YxpJxel^1xjX+iHa zAiGU~Z7NKO<#bhJh%&8{JV8(jWM!S!6_QpVtJ1=cCAL`dY-St(JhKZ0pqZ$1qi*$Y v<2!?4c-Ph)5OwbJ-U#ll@} literal 0 HcmV?d00001 diff --git 
class TestMasterFormatPartialNumbering:
    """Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""

    @staticmethod
    def _convert_sample() -> str:
        """Convert the MasterFormat sample PDF and return the extracted text."""
        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
        return MarkItDown().convert(pdf_path).text_content

    @staticmethod
    def _isolated_numberings(text_content: str) -> list[str]:
        """Return every line that consists solely of a partial numbering.

        Table pipes and surrounding whitespace are ignored, so a table cell
        such as "| .1 |" counts as isolated too. The converter's own
        pattern is used instead of a hard-coded list, so markers of any
        magnitude (".11", ".42", ...) are detected.
        """
        from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN

        isolated = []
        for line in text_content.split("\n"):
            stripped = line.strip()
            cleaned = stripped.replace("|", "").strip()
            if PARTIAL_NUMBERING_PATTERN.match(cleaned):
                isolated.append(stripped)
        return isolated

    def test_partial_numbering_pattern_regex(self):
        """Test that the partial numbering regex pattern correctly matches."""
        from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN

        # Should match partial numbering patterns
        for candidate in (".1", ".2", ".10", ".99"):
            assert (
                PARTIAL_NUMBERING_PATTERN.match(candidate) is not None
            ), f"Expected '{candidate}' to match"

        # Should NOT match other patterns
        for candidate in ("1.", "1.2", ".1.2", "text", ".a", ""):
            assert (
                PARTIAL_NUMBERING_PATTERN.match(candidate) is None
            ), f"Expected '{candidate}' not to match"

    def test_masterformat_partial_numbering_not_split(self):
        """Test that MasterFormat partial numbering stays with associated text.

        MasterFormat documents use partial numbering like:
            .1  The intent of this Request for Proposal...
            .2  Available information relative to...

        These should NOT be split into separate table columns, but kept
        as coherent text lines with the number followed by its description.
        """
        text_content = self._convert_sample()

        # Partial numberings should NOT appear isolated on their own lines.
        # If they do, the parser split them from their text.
        isolated_numberings = self._isolated_numberings(text_content)
        assert len(isolated_numberings) == 0, (
            f"Partial numberings should not be isolated from their text. "
            f"Found isolated: {isolated_numberings}"
        )

        # Verify that partial numberings appear WITH following text on the
        # same line, e.g. ".1 The intent" or ".1 Some text"
        partial_with_text = re.findall(r"\.\d+\s+\w+", text_content)
        assert (
            len(partial_with_text) > 0
        ), "Expected to find partial numberings followed by text on the same line"

    def test_masterformat_content_preserved(self):
        """Test that MasterFormat document content is fully preserved."""
        text_content = self._convert_sample()

        # Verify key content from the MasterFormat document is preserved
        expected_content = [
            "RFP for Construction Management Services",
            "Section 00 00 43",
            "Instructions to Respondents",
            "Ken Sargent House",
            "INTENT",
            "Request for Proposal",
            "KEN SARGENT HOUSE",
            "GRANDE PRAIRIE, ALBERTA",
            "Section 00 00 45",
        ]

        for content in expected_content:
            assert (
                content in text_content
            ), f"Expected content '{content}' not found in extracted text"

        # .1 should be followed by "The intent" on the same line
        assert re.search(
            r"\.1\s+The intent", text_content
        ), "Partial numbering .1 should be followed by 'The intent' text"

        # .2 should be followed by "Available information" on the same line
        assert re.search(
            r"\.2\s+Available information", text_content
        ), "Partial numbering .2 should be followed by 'Available information' text"

        # Ensure text content is not empty and has reasonable length
        assert (
            len(text_content.strip()) > 100
        ), "MasterFormat document should have substantial text content"

    def test_merge_partial_numbering_with_empty_lines_between(self):
        """Test that partial numberings merge correctly even with empty lines between.

        When PDF extractors produce output like:
            .1

            The intent of this Request...

        The merge logic should still combine them properly. The merge helper
        is called directly on synthetic input so the blank-line case is
        exercised regardless of how the sample PDF happens to be extracted.
        """
        from markitdown.converters._pdf_converter import (
            _merge_partial_numbering_lines,
        )

        # One blank line between the marker and its text
        assert (
            _merge_partial_numbering_lines(".1\n\nThe intent of this Request...")
            == ".1 The intent of this Request..."
        )

        # Several blank lines, indented text, and a second adjacent item
        assert (
            _merge_partial_numbering_lines(
                "INTENT\n.1\n\n\n  The intent\n.2\nAvailable"
            )
            == "INTENT\n.1 The intent\n.2 Available"
        )

        # A trailing marker with nothing after it is left untouched
        assert _merge_partial_numbering_lines("text\n.3\n") == "text\n.3\n"

        # The converted sample must not contain any unmerged marker either
        isolated_numberings = self._isolated_numberings(self._convert_sample())
        if isolated_numberings:
            pytest.fail(
                f"Partial numberings {isolated_numberings} were not merged "
                f"with their following text"
            )

    def test_multiple_partial_numberings_all_merged(self):
        """Test that all partial numberings in a document are properly merged."""
        text_content = self._convert_sample()

        # Count occurrences of merged partial numberings (number followed by text)
        merged_count = len(re.findall(r"\.\d+\s+[A-Za-z]", text_content))

        # Count isolated partial numberings (number alone on a line)
        isolated_count = len(self._isolated_numberings(text_content))

        assert (
            merged_count >= 2
        ), f"Expected at least 2 merged partial numberings, found {merged_count}"
        assert (
            isolated_count == 0
        ), f"Found {isolated_count} isolated partial numberings that weren't merged"
in partial numbering tests --- .../markitdown/tests/test_pdf_masterformat.py | 43 +++++++++---------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/packages/markitdown/tests/test_pdf_masterformat.py b/packages/markitdown/tests/test_pdf_masterformat.py index 4b1c5d727..8d3eb0739 100644 --- a/packages/markitdown/tests/test_pdf_masterformat.py +++ b/packages/markitdown/tests/test_pdf_masterformat.py @@ -66,9 +66,9 @@ def test_masterformat_partial_numbering_not_split(self): # Verify that partial numberings appear WITH following text on the same line # Look for patterns like ".1 The intent" or ".1 Some text" partial_with_text = re.findall(r"\.\d+\s+\w+", text_content) - assert len(partial_with_text) > 0, ( - "Expected to find partial numberings followed by text on the same line" - ) + assert ( + len(partial_with_text) > 0 + ), "Expected to find partial numberings followed by text on the same line" def test_masterformat_content_preserved(self): """Test that MasterFormat document content is fully preserved.""" @@ -92,25 +92,25 @@ def test_masterformat_content_preserved(self): ] for content in expected_content: - assert content in text_content, ( - f"Expected content '{content}' not found in extracted text" - ) + assert ( + content in text_content + ), f"Expected content '{content}' not found in extracted text" # Verify partial numbering is followed by text on the same line # .1 should be followed by "The intent" on the same line - assert re.search(r"\.1\s+The intent", text_content), ( - "Partial numbering .1 should be followed by 'The intent' text" - ) + assert re.search( + r"\.1\s+The intent", text_content + ), "Partial numbering .1 should be followed by 'The intent' text" # .2 should be followed by "Available information" on the same line - assert re.search(r"\.2\s+Available information", text_content), ( - "Partial numbering .2 should be followed by 'Available information' text" - ) + assert re.search( + r"\.2\s+Available information", text_content + 
), "Partial numbering .2 should be followed by 'Available information' text" # Ensure text content is not empty and has reasonable length - assert len(text_content.strip()) > 100, ( - "MasterFormat document should have substantial text content" - ) + assert ( + len(text_content.strip()) > 100 + ), "MasterFormat document should have substantial text content" def test_merge_partial_numbering_with_empty_lines_between(self): """Test that partial numberings merge correctly even with empty lines between. @@ -163,10 +163,9 @@ def test_multiple_partial_numberings_all_merged(self): if re.match(r"^\.\d+$", stripped): isolated_count += 1 - assert merged_count >= 2, ( - f"Expected at least 2 merged partial numberings, found {merged_count}" - ) - assert isolated_count == 0, ( - f"Found {isolated_count} isolated partial numberings that weren't merged" - ) - + assert ( + merged_count >= 2 + ), f"Expected at least 2 merged partial numberings, found {merged_count}" + assert ( + isolated_count == 0 + ), f"Found {isolated_count} isolated partial numberings that weren't merged" From 2574b0d2ec30b74e5a3e80964e34e864fd8b2dbb Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Thu, 8 Jan 2026 15:13:11 -0800 Subject: [PATCH 4/4] Move to beta channel. --- packages/markitdown/src/markitdown/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py index 3de6ec29f..659cf6ba2 100644 --- a/packages/markitdown/src/markitdown/__about__.py +++ b/packages/markitdown/src/markitdown/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.1.5" +__version__ = "0.1.5b1"