import re

# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10").
# Anchored at both ends so it only matches a string that is nothing but the
# number: "1.2", ".1.2" and ".a" do not match.
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")


def _merge_partial_numbering_lines(text: str) -> str:
    """Join lone MasterFormat partial numbers with the text line that follows.

    MasterFormat documents number clauses as ``.1``, ``.2``, ...:

        .1  The intent of this Request for Proposal...
        .2  Available information relative to...

    Some PDF extractors emit the number and its text on separate lines,
    optionally with blank lines in between:

        .1

        The intent of this Request for Proposal...

    Every line that consists only of a partial number is joined with the next
    non-blank line as ``"<number> <text>"`` (both stripped of surrounding
    whitespace); the blank lines between the two are dropped.

    A lone number is left exactly as it was when:

    * no non-blank line follows it, or
    * the next non-blank line is itself a lone partial number. Gluing two
      numbers together (".1 .2") would consume the second number and leave
      its own text unmerged, so the second number is given the chance to
      pair with the text that follows it instead.

    Args:
        text: Extracted text; lines are split on the newline character.

    Returns:
        The text with number/text pairs merged. Lines that are not involved
        in a merge are returned unchanged, including their whitespace.
    """
    lines = text.split("\n")
    result_lines: list[str] = []
    i = 0

    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Ordinary line: copy through untouched.
        if not PARTIAL_NUMBERING_PATTERN.match(stripped):
            result_lines.append(line)
            i += 1
            continue

        # Locate the next non-blank line, which is the merge candidate.
        j = i + 1
        while j < len(lines) and not lines[j].strip():
            j += 1

        # Nothing to merge with, or the candidate is another lone number:
        # keep this line as-is and continue with the very next line so any
        # blank lines and the following number are processed normally.
        if j >= len(lines) or PARTIAL_NUMBERING_PATTERN.match(lines[j].strip()):
            result_lines.append(line)
            i += 1
            continue

        result_lines.append(f"{stripped} {lines[j].strip()}")
        i = j + 1  # Skip the blank lines and the line that was merged in.

    return "\n".join(result_lines)
#!/usr/bin/env python3 -m pytest
"""Tests for MasterFormat-style partial numbering in PDF conversion."""

import os
import re

import pytest

from markitdown import MarkItDown
from markitdown.converters._pdf_converter import (
    PARTIAL_NUMBERING_PATTERN,
    _merge_partial_numbering_lines,
)

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")


class TestMasterFormatPartialNumbering:
    """Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""

    @pytest.fixture(scope="class")
    def text_content(self) -> str:
        """Convert the sample MasterFormat PDF once for the whole class."""
        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
        return MarkItDown().convert(pdf_path).text_content

    def test_partial_numbering_pattern_regex(self):
        """Test that the partial numbering regex pattern correctly matches."""
        # Should match partial numbering patterns
        for candidate in (".1", ".2", ".10", ".99"):
            assert PARTIAL_NUMBERING_PATTERN.match(candidate) is not None, candidate

        # Should NOT match other patterns
        for candidate in ("1.", "1.2", ".1.2", "text", ".a", ""):
            assert PARTIAL_NUMBERING_PATTERN.match(candidate) is None, candidate

    def test_masterformat_partial_numbering_not_split(self, text_content):
        """Test that MasterFormat partial numbering stays with associated text.

        MasterFormat documents use partial numbering like:
            .1  The intent of this Request for Proposal...
            .2  Available information relative to...

        These should NOT be split into separate table columns, but kept
        as coherent text lines with the number followed by its description.
        """
        # A line that is only a partial numbering (ignoring table pipes and
        # whitespace) means the parser split the number from its text.
        isolated_numberings = [
            line.strip()
            for line in text_content.split("\n")
            if PARTIAL_NUMBERING_PATTERN.match(line.replace("|", "").strip())
        ]

        assert len(isolated_numberings) == 0, (
            f"Partial numberings should not be isolated from their text. "
            f"Found isolated: {isolated_numberings}"
        )

        # Verify that partial numberings appear WITH following text on the same
        # line, e.g. ".1 The intent" or ".1 Some text"
        partial_with_text = re.findall(r"\.\d+\s+\w+", text_content)
        assert (
            len(partial_with_text) > 0
        ), "Expected to find partial numberings followed by text on the same line"

    def test_masterformat_content_preserved(self, text_content):
        """Test that MasterFormat document content is fully preserved."""
        # Verify key content from the MasterFormat document is preserved
        expected_content = [
            "RFP for Construction Management Services",
            "Section 00 00 43",
            "Instructions to Respondents",
            "Ken Sargent House",
            "INTENT",
            "Request for Proposal",
            "KEN SARGENT HOUSE",
            "GRANDE PRAIRIE, ALBERTA",
            "Section 00 00 45",
        ]

        for content in expected_content:
            assert (
                content in text_content
            ), f"Expected content '{content}' not found in extracted text"

        # .1 should be followed by "The intent" on the same line
        assert re.search(
            r"\.1\s+The intent", text_content
        ), "Partial numbering .1 should be followed by 'The intent' text"

        # .2 should be followed by "Available information" on the same line
        assert re.search(
            r"\.2\s+Available information", text_content
        ), "Partial numbering .2 should be followed by 'Available information' text"

        # Ensure text content is not empty and has reasonable length
        assert (
            len(text_content.strip()) > 100
        ), "MasterFormat document should have substantial text content"

    def test_merge_partial_numbering_with_empty_lines_between(self, text_content):
        """Test that partial numberings merge correctly even with empty lines between.

        When PDF extractors produce output like:
            .1

            The intent of this Request...

        The merge logic should still combine them properly.
        """
        # Exercise the merge helper directly: the sample PDF alone cannot
        # guarantee that the blank-line case is ever hit.
        assert (
            _merge_partial_numbering_lines(".1\n\nThe intent of this Request...")
            == ".1 The intent of this Request..."
        )
        assert (
            _merge_partial_numbering_lines("Heading\n.2\n\n\nAvailable information")
            == "Heading\n.2 Available information"
        )

        # In the converted document, a lone numbering with text on one of the
        # next two lines (e.g. ".1\n\nThe intent") means the merge failed.
        lines = text_content.split("\n")
        for i, line in enumerate(lines):
            stripped = line.strip()
            if not PARTIAL_NUMBERING_PATTERN.match(stripped):
                continue
            following = next(
                (nxt.strip() for nxt in lines[i + 1 : i + 3] if nxt.strip()), None
            )
            assert following is None, (
                f"Partial numbering '{stripped}' on line {i} was not "
                f"merged with following text '{following[:30]}...'"
            )

    def test_multiple_partial_numberings_all_merged(self, text_content):
        """Test that all partial numberings in a document are properly merged."""
        # Count occurrences of merged partial numberings (number followed by text)
        merged_count = len(re.findall(r"\.\d+\s+[A-Za-z]", text_content))

        # Count isolated partial numberings (number alone on a line)
        isolated_count = sum(
            1
            for line in text_content.split("\n")
            if PARTIAL_NUMBERING_PATTERN.match(line.strip())
        )

        assert (
            merged_count >= 2
        ), f"Expected at least 2 merged partial numberings, found {merged_count}"
        assert (
            isolated_count == 0
        ), f"Found {isolated_count} isolated partial numberings that weren't merged"