From 76a254a37551a0f01f78283b7e8bfcca40245af0 Mon Sep 17 00:00:00 2001
From: Viktor Lesyk <vilesyk@microsoft.com>
Date: Tue, 10 Feb 2026 12:20:52 +0100
Subject: [PATCH 1/5] feat: enhance PDF table extraction to support complex
 forms and add new test cases

---
 .gitattributes                                |   3 +
 .../markitdown/converters/_pdf_converter.py   |   5 +-
 .../test_files/movie-theater-booking-2024.pdf | Bin 0 -> 3798 bytes
 packages/markitdown/tests/test_pdf_tables.py  | 134 ++++++++++++++++++
 4 files changed, 140 insertions(+), 2 deletions(-)
 create mode 100644 packages/markitdown/tests/test_files/movie-theater-booking-2024.pdf
diff --git a/.gitattributes b/.gitattributes
index f787c0e47..304de55dd 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1,5 @@
 packages/markitdown/tests/test_files/** linguist-vendored
 packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored
+
+# Treat PDF files as binary to prevent line ending conversion
+*.pdf binary
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index b692f169f..8e01628c7 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -142,11 +142,12 @@ def _extract_form_content_from_words(page: Any) -> str | None:
     all_table_x_positions.sort()
     global_columns: list[float] = []
     for x in all_table_x_positions:
-        if not global_columns or x - global_columns[-1] > 30:
+        if not global_columns or x - global_columns[-1] > 35:
             global_columns.append(x)
 
     # Too many columns suggests dense text, not a form
-    if len(global_columns) > 8:
+    # Increased limit to support complex forms with many columns
+    if len(global_columns) > 20:
         return None
 
     # Now classify each row as table row or not
diff --git a/packages/markitdown/tests/test_files/movie-theater-booking-2024.pdf b/packages/markitdown/tests/test_files/movie-theater-booking-2024.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..8555bb833d5edb63aee640e9c201288fbbeb9d46
GIT binary patch
literal 3798
zcma)9*|w_66@A}dfel+xkw#Edut8MB9vc+w=v0(mNbb!;ejs_vd%iVz?m0JmoSTt^
z2diq;T5~ODEf_Vy1vh}Pa^(a3$KU_<Pe3$%FN`Lt1`JKt4AsaqfW!kZJUw<yHv;mW
zZ@Mop4s{dyzyK8Rw8sZce;@-4z>}p2AOjqzyk{S+)`#gDZ-@8J4@XvgnFB9@LU_~u
zlNnjM3Lt$~_YBLu0f=O|UH4`E+b~ldGXcc_WEE@CuYR&vC%?)1JtR8`)|XW8)c++F
z%zC!*|K!8}&wRv}e7&TUgrpZhz5hReOija5S??D>N!BP(#c-fj$AKe&EY$GLX6-jW
z(fp0k7x0bAH#u)=6NE+R=_@lz0Fd`FAZzoNgt(ET-c0uU*!w<?fMY@ffQVj{99t5O
ztV7lP2Klys06z#@B*9+@BNNvjguS<>o{_M+d6_fMO(t*C>qnu#G2T-n)%I>*FsffL
zx^Z;(LLjfYp8Kog#V-gzdZDRCmgjQs%JN(TNAW6(V<?7W6#_4!SPn&VKm2|U9-F`Z
z*$a&qkWW04>i!o<!kH24W>UhxyZKGt#L_}F{1t$K!<Q~TK+rdClID7tdeY#r`|?fQ
zx>}}@<Xc;jos=BA(R=dE5~{wRycWnnwO?O}d>`fyB70sEksKm2wvxgX;N2zS``~|L
z@EfUL_<Q%ukI;c-M@jL?iFa4M(TE~BGkG`OjczCGeP1L<6Vts#cK|0a;*Io*5{{jZ
zp&BtxJF^<YpJB<K(9t|i>7GjqO|lm>@z)Wf#Ly6Hdj)FkoeW*UI$`f6Q1!A@4%ABA
zrttfQKtisnIp-5z_Q=Dq+Tm#uISeVX{Mo>v!?k1|FNo7^z$iZ`q)ugUZ=mfkq|Xzs
zki#C-^1zIQU7VYx8Gh0luXd+nrBc5ap%u7d6^_~*aFz-uXJ4!xrn~t}0(n!a*NY|e
z&S$%u%xS#udNNd9z<93ekou!$%W>{G<d+TPAY!Gd3f}uRZtjD8V<M;FOJ)9yrVYcP
zol;Cg2^6x+*yT1lpC9D&aZi-%thb8C!+B_^11&fqP~m(7Zw~c3(6<e(g61(a^Eb&k
z^U4X;+&t4ncVPZ};=ydQxH4&V3Qfv`Q2Jyx8@;fr@Mdl|ogaI*MP*T-kJ(H-U(9eh
z;~wV6PVIcmWM#fQ3_r)RbDkEfM`}dc`>ifd_!6}3Od4#@(!eN|jFMmIrqQ9sRBxl_
zsJJJ~3a#>I?qW^V@oedP<qdyj2FJ*QH&%7GQkKo-{h)KqWS!YBhrU}6Uz@Gd2*fl7
z`fj+ARb*-v7R4*^XtPx~-jaRiEC=2VI(5WopbTKu2)e7K23ktq9E}1QI`_7H-XC(e
zk-t-vcu5FVZgwSiryiRVy2X-l%Cr&X8bA;JRBVKrA+`279#IkO)IHcThX+%wM1pYE
zpQ{oHgY3gKhN4XOwx8|wDeNqy0hHo1H*T!UkG6u`x@%T$GCLuAR-5hi01hHP&#TK_
zTgq6YNd~LYdz|d{6m#Jr{#-dSI+{OZkVjnvCFGbjyauU(Ws?9`sGMu=1qVqD=k-Sf
z^OV#Q)?;#1(qP>UeLaQqC@aes3=|70o09uEK@|5hfffk%l6oMH8u(nRAO_h@Ls<vl
zBvhOt#}97>V{z!4twUzJ_lU|#Y(2Ym3D*#5(|97&40ytuZm(D}b4FM1&;52pj-O4U
zSiTa|O$T<aHLJHi<Mr1Wyp_Q5`NmSIr?kSUHMp+k;&XO5&IP-|cG0S}$hIM!>2`Lx
zxTV}Qq_6Wdtmp{*>}#vsrNHh*a>dryi#?4!0xYvc`jX!h!aeh9YRfBsV&Zj!TZ@%4
zh0IzK6jJBapinDbn*@Q8Zl%=kS@@#*sXZJQt3}0NLp8Z0bXZ1ZYB{l>_EWf&qoVzk
za(lw0YE>J?4GY@0%ay|zqx0w}>|Y&>Zp`&l+^(^WM$nwQS%EzFYHK<M(HhoWX(^e*
z;@aLw-!7xvWg)Z*g;pfWp}|#yiqs*OS52aD0*^<0=rMV7TUcP@$G+q}h)DYMG_7?C
z;87fcgG?84ZZTEd7TSGUGy1*i7A?xP`RZhsvl4N4;-K-G8UnL=7V898Sw<UHZgjno
z#dg70B{LKZksI`U8A?7bvSUT#`*g(RK}!ccDHCyEUkn5PSu(haZKwKG2Qr}sUS%h<
zZA!AlZNbPF7rveIvtV2JoCfBWX5yJ}CQHQPJnJ=V>3SK(l|x)Swf0A2Gk<>SjDw%(
z{dSA%8!u2UG%dl$>3Y;7(Xky@!#OE#wsQJ@$)pL@h$1|Ayl@S}?@KGy5m=uPY`4|G
zxJb_zQZsf_;LL6`xyMrcDNLln(rPrHYNp-gs@rn+thC(hx>P<=1`}PcLaCkOwBC3L
zv+g)pyXOWr8R}G(+#Ux5i|KX;7jAf$@`EFO^4hJfHk>t@ObUKAO|^EM*oSM;v}9^0
zYiPldeUK`Q&;GTDrSa=Ho@4%O4A!payt7zy9rV+FthQ}0OS1mPvCaMKG*E-{YMOiO
z%HF{!RKZ8SAMMMtw1cy`en%<q&lGDGojgf7*@K}6PPSb_tq4MxdZ+#ruL9eg)wp_e
z$PMU5M$axN{IVGbx7jiK+#E@BvXG}DgKBdnTs+f<_C0RB)~E5{c41p7-?3p;896Ic
z@>6HK@=Envx4}3d+*>2<GP8z5)<XU<gND-y&8XNjTKbzc)o3eRL$4KwP6rQ~@eEs5
zpu2Cb?<V}IWG>^qOqgfJJ3gP-m`6?zuaTzFvnvY81@(DQFu{&=x>Obd{7^VD-K{>d
z_L;Ud?XE&S%lZ)EFf(H`7d}foQ>3Ro=2Sbj*IH|u=Bu)?TkmjIZ4P}#fSqHuM9IAk
zQcLO-N+4EN5YBd=e8V4&w%Pf;+-yW`(yWyA*GU;Rc*bm6Exp)c8L0PZ*VfRAW^E&V
zmLN8|jq=FgGBKN*$+DChOj9SC8b8~Us&;M-DZz)=!)9uWVFZ!E>md#{U08Hxd^j}3
zRIU2Sr^Pb|A1`_u8}x3nT}YQ|tIakJbktLQZpoZ3?jti8JUck3jFl}xW>e$kh*y~A
zYU8i_)H=1C*jA4r;rh{iE`4@u@Y5=MgQ7j#UBvSsJOxizk?usVS)EXIp~<z#j=rB}
z@jZeQFWsG4l21IO`&H-;PWm&|8RfT&ZQ)h&jpt(Q(((;nS4N;wey!~E=?V`WrVv`&
zQY(L`)zF6)pX}@5jul6IOK-AXb79<?scRR?J)K)NeOOey9fQnBB;44MRvE*rA+E(H
zEI|UMaazB9qo?CxQy*XKHb>a4a%tYFT1N80I|@6NKS-|(s$_B$ObW|M3(^1DO61M=
zd)IfN^Gj%6KTx3lf&TsgRg$cbz!mu2R!uJV?+0*yw_#`^k$$q_X#Eei3jW7f5=(BE
zKgE*NAMq$6(MCV{)xYMALe;X(@SB_)alnT{n-#2Xka`6rO+%w}^r}@gvu+ZEUd3wE
zRc-xmkK+gUMOEJZv#Nw)z_1dX@sqy%Kt}G>1K#RF{78h27XfdzlFV1uU(AH%Z}(m`
SjK0xcA<)_f2=?cLkADHZ|4*9$

literal 0
HcmV?d00001

diff --git a/packages/markitdown/tests/test_pdf_tables.py b/packages/markitdown/tests/test_pdf_tables.py
index 92a79ddc9..2f8d96d01 100644
--- a/packages/markitdown/tests/test_pdf_tables.py
+++ b/packages/markitdown/tests/test_pdf_tables.py
@@ -650,6 +650,140 @@ def test_scanned_pdf_handling(self, markitdown):
             result.text_content.strip() == ""
         ), f"Scanned PDF should have empty extraction, got: '{result.text_content[:100]}...'"
 
+    def test_broadcast_order_pdf_extraction(self, markitdown):
+        """Test extraction of broadcast order PDF with many columns.
+
+        Expected output: Pipe-separated format with order details, agencies,
+        advertisers, and line items in structured tables.
+        """
+        pdf_path = os.path.join(
+            TEST_FILES_DIR, "bf57756e-868b-4a9e-294a-81494d3ab2f1.pdf"
+        )
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        result = markitdown.convert(pdf_path)
+        text_content = result.text_content
+
+        # Validate pipe-separated table format
+        assert "|" in text_content, "Broadcast order should contain pipe separators"
+
+        # Validate key order information
+        expected_strings = [
+            "ORDER",
+            "1940495",  # Order number
+            "MIKE BLOOMBERG 2020 INC",  # Product description
+            "02/17/20 - 02/21/20",  # Flight dates
+            "WOC12391815",  # Alt order number
+        ]
+        validate_strings(result, expected_strings)
+
+        # Validate agency information
+        agency_strings = [
+            "Assembly / POL",  # Agency name
+            "Heather Goldsmith",  # Buying contact
+            "WTLV-TV",  # Station
+            "Jim Quinn",  # Primary AE
+        ]
+        validate_strings(result, agency_strings)
+
+        # Validate advertiser information
+        advertiser_strings = [
+            "POL/ Michael Bloomberg",
+            "A18-49",  # Demographic
+            "PL-Presidential",  # Product codes
+        ]
+        validate_strings(result, advertiser_strings)
+
+        # Validate bill plan totals
+        billing_strings = [
+            "$2,750.00",  # Gross amount
+            "$2,337.50",  # Net amount
+            "February 2020",  # Month
+        ]
+        validate_strings(result, billing_strings)
+
+        # Validate line item details
+        line_item_strings = [
+            "WTLV",  # Channel
+            "Local News @ 6p M-F",
+            "Lincoln Rhyme",
+            "$400.00",  # Rate
+            "$800.00",  # Rate
+            "$1,200.00",  # Amount
+        ]
+        validate_strings(result, line_item_strings)
+
+    def test_movie_theater_booking_pdf_extraction(self, markitdown):
+        """Test extraction of movie theater booking PDF with complex tables.
+
+        Expected output: Pipe-separated format with booking details, agency info,
+        customer details, and show schedules in structured tables.
+        """
+        pdf_path = os.path.join(
+            TEST_FILES_DIR, "movie-theater-booking-2024.pdf"
+        )
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        result = markitdown.convert(pdf_path)
+        text_content = result.text_content
+
+        # Validate pipe-separated table format
+        assert "|" in text_content, "Booking order should contain pipe separators"
+
+        # Validate key booking information
+        expected_strings = [
+            "BOOKING ORDER",
+            "2024-12-5678",  # Order number
+            "Holiday Movie Marathon Package",  # Product description
+            "12/20/2024 - 12/31/2024",  # Booking dates
+            "SC-WINTER-2024",  # Alt order number
+            "STARLIGHT CINEMAS",  # Cinema brand
+        ]
+        validate_strings(result, expected_strings)
+
+        # Validate agency information
+        agency_strings = [
+            "Premier Entertainment Group",  # Agency name
+            "Michael Chen",  # Contact
+            "Sarah Johnson",  # Primary contact
+            "Downtown Multiplex",  # Cinema name
+        ]
+        validate_strings(result, agency_strings)
+
+        # Validate customer information
+        customer_strings = [
+            "Universal Studios Distribution",  # Customer name
+            "Film Distributor",  # Category
+            "CUST-98765",  # Customer ID
+        ]
+        validate_strings(result, customer_strings)
+
+        # Validate booking summary totals
+        booking_strings = [
+            "$12,500.00",  # Gross amount
+            "$11,250.00",  # Net amount
+            "December 2024",  # Month
+            "48",  # Number of shows
+        ]
+        validate_strings(result, booking_strings)
+
+        # Validate show schedule details
+        show_strings = [
+            "Holiday Spectacular",  # Movie title
+            "Winter Wonderland",  # Movie title
+            "New Year Mystery",  # Movie title
+            "IMAX 3D",  # Format
+            "$250",  # Rate
+            "$300",  # Rate
+            "$3,000",  # Revenue
+            "$3,600",  # Revenue
+        ]
+        validate_strings(result, show_strings)
+
 
 class TestPdfTableMarkdownFormat:
     """Test that extracted tables have proper markdown formatting."""

From be94561a617b5ba3072c692097cb7cec50303af1 Mon Sep 17 00:00:00 2001
From: Viktor Lesyk <vilesyk@microsoft.com>
Date: Tue, 10 Feb 2026 12:21:36 +0100
Subject: [PATCH 2/5] chore: update version to 0.1.6b1

---
 packages/markitdown/src/markitdown/__about__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py
index 4c8b68f6d..ff0280657 100644
--- a/packages/markitdown/src/markitdown/__about__.py
+++ b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.4"
+__version__ = "0.1.6b1"

From a50f2bb98e19db00e45de190a3b8d084d8af6fa2 Mon Sep 17 00:00:00 2001
From: Viktor Lesyk <vilesyk@microsoft.com>
Date: Tue, 10 Feb 2026 12:49:38 +0100
Subject: [PATCH 3/5] feat: enhance PDF table extraction with adaptive column
 clustering and add comprehensive test cases

---
 .../markitdown/converters/_pdf_converter.py   |  56 ++-
 ...EDRPT-2024-PAT-3847_medical_report_scan.md |   0
 .../RECEIPT-2024-TXN-98765_retail_purchase.md |  81 +++++
 .../REPAIR-2022-INV-001_multipage.md          |  76 ++++
 .../SPARSE-2024-INV-1234_borderless_table.md  |  44 +++
 .../movie-theater-booking-2024.md             |  62 ++++
 .../tests/test_files/expected_outputs/test.md |  65 ++++
 packages/markitdown/tests/test_pdf_tables.py  | 324 ++++++++++++++----
 8 files changed, 638 insertions(+), 70 deletions(-)
 create mode 100644 packages/markitdown/tests/test_files/expected_outputs/MEDRPT-2024-PAT-3847_medical_report_scan.md
 create mode 100644 packages/markitdown/tests/test_files/expected_outputs/RECEIPT-2024-TXN-98765_retail_purchase.md
 create mode 100644 packages/markitdown/tests/test_files/expected_outputs/REPAIR-2022-INV-001_multipage.md
 create mode 100644 packages/markitdown/tests/test_files/expected_outputs/SPARSE-2024-INV-1234_borderless_table.md
 create mode 100644 packages/markitdown/tests/test_files/expected_outputs/movie-theater-booking-2024.md
 create mode 100644 packages/markitdown/tests/test_files/expected_outputs/test.md

diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index 8e01628c7..5134a1e35 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -138,16 +138,62 @@ def _extract_form_content_from_words(page: Any) -> str | None:
     if not all_table_x_positions:
         return None
 
-    # Compute global column boundaries
+    # Compute adaptive column clustering tolerance based on gap analysis
     all_table_x_positions.sort()
+
+    # Calculate gaps between consecutive x-positions
+    gaps = []
+    for i in range(len(all_table_x_positions) - 1):
+        gap = all_table_x_positions[i + 1] - all_table_x_positions[i]
+        if gap > 5:  # Only significant gaps
+            gaps.append(gap)
+
+    # Determine optimal tolerance using statistical analysis
+    if gaps and len(gaps) >= 3:
+        # Use 70th percentile of gaps as threshold (balances precision/recall)
+        sorted_gaps = sorted(gaps)
+        percentile_70_idx = int(len(sorted_gaps) * 0.70)
+        adaptive_tolerance = sorted_gaps[percentile_70_idx]
+
+        # Clamp tolerance to reasonable range [25, 50]
+        adaptive_tolerance = max(25, min(50, adaptive_tolerance))
+    else:
+        # Fallback to conservative value
+        adaptive_tolerance = 35
+
+    # Compute global column boundaries using adaptive tolerance
     global_columns: list[float] = []
     for x in all_table_x_positions:
-        if not global_columns or x - global_columns[-1] > 35:
+        if not global_columns or x - global_columns[-1] > adaptive_tolerance:
             global_columns.append(x)
 
-    # Too many columns suggests dense text, not a form
-    # Increased limit to support complex forms with many columns
-    if len(global_columns) > 20:
+    # Adaptive max column check based on page characteristics
+    # Calculate average column width
+    if len(global_columns) > 1:
+        content_width = global_columns[-1] - global_columns[0]
+        avg_col_width = content_width / len(global_columns)
+
+        # Very narrow columns on average (< 30pt) suggest dense text, not a form
+        if avg_col_width < 30:
+            return None
+
+        # Column density per inch (72pt = 1in); this equals 72 / avg_col_width
+        # NOTE(review): avg_col_width >= 30 here, so this is at most 2.4
+        columns_per_inch = len(global_columns) / (content_width / 72)
+
+        # NOTE(review): unreachable after the width check above (density <= 2.4)
+        if columns_per_inch > 10:
+            return None
+
+        # Adaptive max: allow more columns for wider pages
+        # Standard letter is 612pt wide, so scale accordingly
+        adaptive_max_columns = int(20 * (page_width / 612))
+        adaptive_max_columns = max(15, adaptive_max_columns)  # At least 15
+
+        if len(global_columns) > adaptive_max_columns:
+            return None
+    else:
+        # Single column, not a form
         return None
 
     # Now classify each row as table row or not
diff --git a/packages/markitdown/tests/test_files/expected_outputs/MEDRPT-2024-PAT-3847_medical_report_scan.md b/packages/markitdown/tests/test_files/expected_outputs/MEDRPT-2024-PAT-3847_medical_report_scan.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/packages/markitdown/tests/test_files/expected_outputs/RECEIPT-2024-TXN-98765_retail_purchase.md b/packages/markitdown/tests/test_files/expected_outputs/RECEIPT-2024-TXN-98765_retail_purchase.md
new file mode 100644
index 000000000..379de4df7
--- /dev/null
+++ b/packages/markitdown/tests/test_files/expected_outputs/RECEIPT-2024-TXN-98765_retail_purchase.md
@@ -0,0 +1,81 @@
+TECHMART ELECTRONICS
+4567 Innovation Blvd
+San Francisco, CA 94103
+(415) 555-0199
+
+===================================
+
+Store #0342 - Downtown SF
+11/23/2024 14:32:18 PST
+TXN: TXN-98765-2024
+Cashier: Emily Rodriguez
+Register: POS-07
+
+-----------------------------------
+
+Wireless Noise-Cancelling
+Headphones - Premium Black
+AUDIO-5521 1 @ $349.99
+Member Discount $-50.00
+$299.99
+USB-C Hub 7-in-1 Adapter
+with HDMI & Ethernet
+ACC-8834 2 @ $79.99
+$159.98
+Portable SSD 2TB
+Thunderbolt 3 Compatible
+STOR-2241 1 @ $289.00
+Member Discount $-29.00
+$260.00
+Ergonomic Wireless Mouse
+Rechargeable Battery
+ACC-9012 1 @ $59.99
+$59.99
+Screen Cleaning Kit
+Professional Grade
+CARE-1156 3 @ $12.99
+$38.97
+HDMI 2.1 Cable 6ft
+8K Resolution Support
+CABLE-7789 2 @ $24.99
+Member Discount $-5.00
+$44.98
+-----------------------------------
+
+SUBTOTAL $863.91
+Member Discount (15%)-$84.00
+Sales Tax (8.5%) $66.23
+Rewards Applied -$25.00
+===================================
+TOTAL $821.14
+===================================
+
+PAYMENT METHOD
+Visa Card ending in 4782
+Auth: 847392
+Ref: REF-20241123-98765
+
+-----------------------------------
+
+REWARDS MEMBER
+Sarah Mitchell
+ID: TM-447821
+Points Earned: 821
+Total Points: 3,247
+Next Reward: $50 gift card
+at 5,000 pts (1,753 to go)
+
+-----------------------------------
+
+RETURN POLICY
+Returns within 30 days
+Receipt required
+Electronics must be unopened
+
+*TXN98765202411231432*
+
+Thank you for shopping!
+www.techmart.example.com
+
+===================================
+
diff --git a/packages/markitdown/tests/test_files/expected_outputs/REPAIR-2022-INV-001_multipage.md b/packages/markitdown/tests/test_files/expected_outputs/REPAIR-2022-INV-001_multipage.md
new file mode 100644
index 000000000..e80967b45
--- /dev/null
+++ b/packages/markitdown/tests/test_files/expected_outputs/REPAIR-2022-INV-001_multipage.md
@@ -0,0 +1,76 @@
+ZAVA AUTO REPAIR
+Certified Collision Repair
+123 Main Street, Redmond, WA 98052
+Phone: (425) 000-0000
+Preliminary Estimate (ID: EST-1008)
+| Customer Information |                     |     | Vehicle Information |                   |
+| -------------------- | ------------------- | --- | ------------------- | ----------------- |
+| Insured name         | Gabriel Diaz        |     | Year                | 2022              |
+| Claim #              | SF-1008             |     | Make                | Jeep              |
+| Policy #             | POL-2022-555        |     | Model               | Grand Cherokee    |
+| Phone                | (425) 111-1111      |     | Trim                | Limited           |
+| Email                | gabriel@contoso.com |     | VIN                 | 1C4RJFBG2NC123456 |
+|                      |                     |     | Color               | White             |
+|                      |                     |     | Odometer            | 9,800             |
+| Repair Order #       | RO-20221108         |     | Estimator           | Ellis Turner      |
+Estimate Totals
+|                  |     | Hours | Rate | Cost  |
+| ---------------- | --- | ----- | ---- | ----- |
+| Parts            |     |       |      | 2,100 |
+| Body Labor       |     | 2     | 150  | 300   |
+| Paint Labor      |     | 1.5   | 150  | 225   |
+| Mechanical Labor |     | -     | -    | -     |
+Supplies
+|               | Paint Supplies           |     |        | 60     |
+| ------------- | ------------------------ | --- | ------ | ------ |
+|               | Body Supplies            |     |        | 30     |
+| Other Charges |                          |     |        | 15     |
+| Subtotal      |                          |     |        | 2,730  |
+| Sales Tax     |                          |     | 10.20% | 278.46 |
+| GRAND TOTAL   |                          |     |        | 5,738  |
+| Note          | Minor rear bumper repair |     |        |        |
+This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
+after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
+present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
+models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
+any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
+deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
+
+ZAVA AUTO REPAIR
+Certified Collision Repair
+123 Main Street, Redmond, WA 98052
+Phone: (425) 000-0000
+Preliminary Estimate (ID: EST-1008)
+Customer Information Vehicle Information
+| Insured name   | Bruce Wayne                |     | Year      | 2025         |
+| -------------- | -------------------------- | --- | --------- | ------------ |
+| Claim #        |                            | 999 | Make      | Batman       |
+| Policy #       | IM-BATMAN                  |     | Model     | Batmobile    |
+| Phone          | (416) 555-1234             |     | Trim      | Limited      |
+| Email          | batman@wayneindustries.com |     | VIN       | XXX          |
+|                |                            |     | Color     | Black        |
+|                |                            |     | Odometer  | 1            |
+| Repair Order # | RO-20221108                |     | Estimator | Ellis Turner |
+Estimate Totals
+|                  |     | Hours | Rate | Cost   |
+| ---------------- | --- | ----- | ---- | ------ |
+| Parts            |     |       |      | 99,999 |
+| Body Labor       |     | 2     | 150  | 300    |
+| Paint Labor      |     | 1.5   | 150  | 225    |
+| Mechanical Labor |     | -     | -    | -      |
+Supplies
+|               | Paint Supplies           |     |        | 60        |
+| ------------- | ------------------------ | --- | ------ | --------- |
+|               | Body Supplies            |     |        | 30        |
+| Other Charges |                          |     |        | 15        |
+| Subtotal      |                          |     |        | 100,629   |
+| Sales Tax     |                          |     | 10.20% | 10264.158 |
+| GRAND TOTAL   |                          |     |        | 211,522   |
+| Note          | Minor rear bumper repair |     |        |           |
+
+This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
+after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
+present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
+models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
+any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
+deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
\ No newline at end of file
diff --git a/packages/markitdown/tests/test_files/expected_outputs/SPARSE-2024-INV-1234_borderless_table.md b/packages/markitdown/tests/test_files/expected_outputs/SPARSE-2024-INV-1234_borderless_table.md
new file mode 100644
index 000000000..797496452
--- /dev/null
+++ b/packages/markitdown/tests/test_files/expected_outputs/SPARSE-2024-INV-1234_borderless_table.md
@@ -0,0 +1,44 @@
+INVENTORY RECONCILIATION REPORT
+Report ID: SPARSE-2024-INV-1234
+Warehouse: Distribution Center East
+Report Date: 2024-11-15
+Prepared By: Sarah Martinez
+| Product Code | Location | Expected | Actual | Variance | Status   |
+| ------------ | -------- | -------- | ------ | -------- | -------- |
+| SKU-8847     | A-12     | 450      |        |          |          |
+|              | B-07     |          | 289    | -23      |          |
+| SKU-9201     |          | 780      | 778    |          | OK       |
+|              | C-15     |          |        | +15      |          |
+| SKU-4563     | D-22     |          | 156    |          | CRITICAL |
+|              |          | 180      |        | -24      |          |
+| SKU-7728     | A-08     | 920      |        |          |          |
+|              |          |          | 935    | +15      | OK       |
+Variance Analysis:
+Summary Statistics:
+Total Variance Cost: $4,287.50
+Critical Items: 1
+Overall Accuracy: 97.2%
+Detailed Analysis by Category:
+The inventory reconciliation reveals several key findings. The primary variance driver is SKU-4563,
+which shows a -24 unit discrepancy requiring immediate investigation. Location B-07 handling of
+SKU-8847 also demonstrates significant variance. Cross-location verification protocols should be
+
+reviewed to prevent future discrepancies. The overall accuracy rate of 97.2% meets our target
+threshold, but critical items require expedited resolution to maintain operational efficiency.
+Extended Inventory Review:
+| Product Code | Category    | Unit Cost | Total Value | Last Audit | Notes      |
+| ------------ | ----------- | --------- | ----------- | ---------- | ---------- |
+| SKU-8847     | Electronics | $45.00    | $13,005.00  | 2024-10-15 |            |
+| SKU-9201     | Hardware    | $32.50    | $25,285.00  | 2024-10-22 | Verified   |
+| SKU-4563     | Software    | $120.00   | $18,720.00  |            | Critical   |
+| SKU-7728     | Accessories | $15.75    | $14,726.25  | 2024-11-01 |            |
+| SKU-3345     | Electronics | $67.00    | $22,445.00  | 2024-10-18 |            |
+| SKU-5512     | Hardware    | $89.00    | $31,150.00  |            | Pending    |
+| SKU-6678     | Software    | $200.00   | $42,000.00  | 2024-10-25 | High Value |
+| SKU-7789     | Accessories | $8.50     | $5,950.00   | 2024-11-05 |            |
+| SKU-2234     | Electronics | $125.00   | $35,000.00  |            |            |
+| SKU-1123     | Hardware    | $55.00    | $27,500.00  | 2024-10-30 | Verified   |
+Recommendations:
+1. Immediate review of SKU-4563 handling procedures. 2. Implement additional verification for critical
+items. 3. Schedule follow-up audit for high-value products (SKU-6678, SKU-2234).
+Approval:
\ No newline at end of file
diff --git a/packages/markitdown/tests/test_files/expected_outputs/movie-theater-booking-2024.md b/packages/markitdown/tests/test_files/expected_outputs/movie-theater-booking-2024.md
new file mode 100644
index 000000000..371cee776
--- /dev/null
+++ b/packages/markitdown/tests/test_files/expected_outputs/movie-theater-booking-2024.md
@@ -0,0 +1,62 @@
+BOOKING ORDER
+Print Date 12/15/2024 14:30:22
+Page 1 of 1
+STARLIGHT CINEMAS
+Orders
+| Order / Rev: | 2024-12-5678   |     |     | Cinema:          |     | Downtown Multiplex |
+| ------------ | -------------- | --- | --- | ---------------- | --- | ------------------ |
+| Alt Order #: | SC-WINTER-2024 |     |     | Primary Contact: |     | Sarah Johnson      |
+Product Desc: Holiday Movie Marathon Package Location: NYC-01
+| Estimate:            | EST-456                 |     |     | Region: |     | NORTHEAST |
+| -------------------- | ----------------------- | --- | --- | ------- | --- | --------- |
+| Booking Dates:       | 12/20/2024 - 12/31/2024 |     |     |         |     |           |
+| Original Date / Rev: | 12/01/24 / 12/10/24     |     |     |         |     |           |
+| Order Type:          | Premium Package         |     |     |         |     |           |
+Booking Agency
+| Name:            | Premier Entertainment Group |     |     |                |     |           |
+| ---------------- | --------------------------- | --- | --- | -------------- | --- | --------- |
+|                  |                             |     |     | Billing Type:  |     | Net 30    |
+| Contact:         | Michael Chen                |     |     |                |     |           |
+|                  |                             |     |     | Payment Terms: |     | Corporate |
+| Billing Contact: | accounting@premierent.com   |     |     |                |     |           |
+|                  |                             |     |     | Commission:    |     | 10%       |
+555 Broadway Suite 1200
+New York, NY 10012
+Customer
+| Name:          | Universal Studios Distribution |     |     |     |     |     |
+| -------------- | ------------------------------ | --- | --- | --- | --- | --- |
+| Category:      | Film Distributor               |     |     |     |     |     |
+| Contact Email: | bookings@universalstudios.com  |     |     |     |     |     |
+| Customer ID:   | CUST-98765                     |     |     |     |     |     |
+| Revenue Code:  | FILM-PREMIUM                   |     |     |     |     |     |
+Booking Summary
+| Start Date | End Date | # Shows | Gross Amount | Net Amount |     |     |
+| ---------- | -------- | ------- | ------------ | ---------- | --- | --- |
+| 12/20/24   | 12/31/24 | 48      | $12,500.00   | $11,250.00 |     |     |
+Totals
+| Month         | # Shows | Gross Amount |     | Net Amount |     | Occupancy |
+| ------------- | ------- | ------------ | --- | ---------- | --- | --------- |
+| December 2024 | 48      | $12,500.00   |     | $11,250.00 |     | 85%       |
+| Totals        | 48      | $12,500.00   |     | $11,250.00 |     | 85%       |
+Account Representatives
+Representative Territory Region Start Date / End Date Commission %
+| Sarah Johnson | NYC Metro | NORTHEAST | 12/20/24 - 12/31/24 |     | 100% |     |
+| ------------- | --------- | --------- | ------------------- | --- | ---- | --- |
+Show Schedule Details
+Ln Screen Start End Movie Title Format Showtime Days Shows Rate Type Total
+1 SCR-1 12/20/24 12/25/24 Holiday Spectacular IMAX 3D 7:00 PM Daily 12 $250 PM $3,000
+(Runtime: 142 min); Holiday Season Premium
+2 SCR-2 12/20/24 12/31/24 Winter Wonderland Standard 4:30 PM Daily 24 $150 MT $3,600
+(Runtime: 98 min); Matinee Special
+3 SCR-1 12/26/24 12/31/24 New Year Mystery 4DX 9:30 PM Daily 12 $300 PM $3,600
+(Runtime: 116 min); Premium Experience
+Show Details
+| Show Screen | Date Range | Title | Showtime | Days Type | Rate | Revenue |
+| ----------- | ---------- | ----- | -------- | --------- | ---- | ------- |
+1 SCR-1 12/20-12/25 Holiday Spectacular 7:00 PM Daily PM $250 $3,000
+This booking order is subject to cinema availability and standard terms.
+2 SCR-2 12/20-12/31 Winter Wonderland 4:30 PM Daily MT $150 $3,600
+All showtimes are approximate and subject to change.
+3 SCR-1 12/26-12/31 New Year Mystery 9:30 PM Daily PM $300 $3,600
+| Total Revenue: |     |     |     |     |     | $12,500.00 |
+| -------------- | --- | --- | --- | --- | --- | ---------- |
\ No newline at end of file
diff --git a/packages/markitdown/tests/test_files/expected_outputs/test.md b/packages/markitdown/tests/test_files/expected_outputs/test.md
new file mode 100644
index 000000000..2d9c90c07
--- /dev/null
+++ b/packages/markitdown/tests/test_files/expected_outputs/test.md
@@ -0,0 +1,65 @@
+1
+
+Introduction
+
+Large language models (LLMs) are becoming a crucial building block in developing powerful agents
+that utilize LLMs for reasoning, tool usage, and adapting to new observations (Yao et al., 2022; Xi
+et al., 2023; Wang et al., 2023b) in many real-world tasks. Given the expanding tasks that could
+benefit from LLMs and the growing task complexity, an intuitive approach to scale up the power of
+agents is to use multiple agents that cooperate. Prior work suggests that multiple agents can help
+encourage divergent thinking (Liang et al., 2023), improve factuality and reasoning (Du et al., 2023),
+and provide validation (Wu et al., 2023). In light of the intuition and early evidence of promise, it is
+intriguing to ask the following question: how can we facilitate the development of LLM applications
+that could span a broad spectrum of domains and complexities based on the multi-agent approach?
+
+Our insight is to use multi-agent conversations to achieve it. There are at least three reasons con-
+firming its general feasibility and utility thanks to recent advances in LLMs: First, because chat-
+optimized LLMs (e.g., GPT-4) show the ability to incorporate feedback, LLM agents can cooperate
+through conversations with each other or human(s), e.g., a dialog where agents provide and seek rea-
+soning, observations, critiques, and validation. Second, because a single LLM can exhibit a broad
+range of capabilities (especially when configured with the correct prompt and inference settings),
+conversations between differently configured agents can help combine these broad LLM capabilities
+in a modular and complementary manner. Third, LLMs have demonstrated ability to solve complex
+tasks when the tasks are broken into simpler subtasks. Multi-agent conversations can enable this
+partitioning and integration in an intuitive manner. How can we leverage the above insights and
+support different applications with the common requirement of coordinating multiple agents, poten-
+tially backed by LLMs, humans, or tools exhibiting different capacities? We desire a multi-agent
+conversation framework with generic abstraction and effective implementation that has the flexibil-
+ity to satisfy different application needs. Achieving this requires addressing two critical questions:
+(1) How can we design individual agents that are capable, reusable, customizable, and effective in
+multi-agent collaboration? (2) How can we develop a straightforward, unified interface that can
+accommodate a wide range of agent conversation patterns? In practice, applications of varying
+complexities may need distinct sets of agents with specific capabilities, and may require different
+conversation patterns, such as single- or multi-turn dialogs, different human involvement modes, and
+static vs. dynamic conversation. Moreover, developers may prefer the flexibility to program agent
+interactions in natural language or code. Failing to adequately address these two questions would
+limit the framework’s scope of applicability and generality.
+While there is contemporaneous exploration of multi-agent approaches,3 we present AutoGen, a
+generalized multi-agent conversation framework (Figure 1), based on the following new concepts.
+1 Customizable and conversable agents. AutoGen uses a generic design of agents that can lever-
+age LLMs, human inputs, tools, or a combination of them. The result is that developers can
+easily and quickly create agents with different roles (e.g., agents to write code, execute code,
+wire in human feedback, validate outputs, etc.) by selecting and configuring a subset of built-in
+capabilities. The agent’s backend can also be readily extended to allow more custom behaviors.
+To make these agents suitable for multi-agent conversation, every agent is made conversable –
+they can receive, react, and respond to messages. When configured properly, an agent can hold
+multiple turns of conversations with other agents autonomously or solicit human inputs at cer-
+tain rounds, enabling human agency and automation. The conversable agent design leverages the
+strong capability of the most advanced LLMs in taking feedback and making progress via chat
+and also allows combining capabilities of LLMs in a modular fashion. (Section 2.1)
+
+2 Conversation programming. A fundamental insight of AutoGen is to simplify and unify com-
+plex LLM application workflows as multi-agent conversations. So AutoGen adopts a program-
+ming paradigm centered around these inter-agent conversations. We refer to this paradigm as
+conversation programming, which streamlines the development of intricate applications via two
+primary steps: (1) defining a set of conversable agents with specific capabilities and roles (as
+described above); (2) programming the interaction behavior between agents via conversation-
+centric computation and control. Both steps can be achieved via a fusion of natural and pro-
+gramming languages to build applications with a wide range of conversation patterns and agent
+behaviors. AutoGen provides ready-to-use implementations and also allows easy extension and
+experimentation for both steps. (Section 2.2)
+
+3We refer to Appendix A for a detailed discussion.
+
+2
+
diff --git a/packages/markitdown/tests/test_pdf_tables.py b/packages/markitdown/tests/test_pdf_tables.py
index 2f8d96d01..75ac6d381 100644
--- a/packages/markitdown/tests/test_pdf_tables.py
+++ b/packages/markitdown/tests/test_pdf_tables.py
@@ -650,71 +650,6 @@ def test_scanned_pdf_handling(self, markitdown):
             result.text_content.strip() == ""
         ), f"Scanned PDF should have empty extraction, got: '{result.text_content[:100]}...'"
 
-    def test_broadcast_order_pdf_extraction(self, markitdown):
-        """Test extraction of broadcast order PDF with many columns.
-
-        Expected output: Pipe-separated format with order details, agencies,
-        advertisers, and line items in structured tables.
-        """
-        pdf_path = os.path.join(
-            TEST_FILES_DIR, "bf57756e-868b-4a9e-294a-81494d3ab2f1.pdf"
-        )
-
-        if not os.path.exists(pdf_path):
-            pytest.skip(f"Test file not found: {pdf_path}")
-
-        result = markitdown.convert(pdf_path)
-        text_content = result.text_content
-
-        # Validate pipe-separated table format
-        assert "|" in text_content, "Broadcast order should contain pipe separators"
-
-        # Validate key order information
-        expected_strings = [
-            "ORDER",
-            "1940495",  # Order number
-            "MIKE BLOOMBERG 2020 INC",  # Product description
-            "02/17/20 - 02/21/20",  # Flight dates
-            "WOC12391815",  # Alt order number
-        ]
-        validate_strings(result, expected_strings)
-
-        # Validate agency information
-        agency_strings = [
-            "Assembly / POL",  # Agency name
-            "Heather Goldsmith",  # Buying contact
-            "WTLV-TV",  # Station
-            "Jim Quinn",  # Primary AE
-        ]
-        validate_strings(result, agency_strings)
-
-        # Validate advertiser information
-        advertiser_strings = [
-            "POL/ Michael Bloomberg",
-            "A18-49",  # Demographic
-            "PL-Presidential",  # Product codes
-        ]
-        validate_strings(result, advertiser_strings)
-
-        # Validate bill plan totals
-        billing_strings = [
-            "$2,750.00",  # Gross amount
-            "$2,337.50",  # Net amount
-            "February 2020",  # Month
-        ]
-        validate_strings(result, billing_strings)
-
-        # Validate line item details
-        line_item_strings = [
-            "WTLV",  # Channel
-            "Local News @ 6p M-F",
-            "Lincoln Rhyme",
-            "$400.00",  # Rate
-            "$800.00",  # Rate
-            "$1,200.00",  # Amount
-        ]
-        validate_strings(result, line_item_strings)
-
     def test_movie_theater_booking_pdf_extraction(self, markitdown):
         """Test extraction of movie theater booking PDF with complex tables.
 
@@ -785,6 +720,265 @@ def test_movie_theater_booking_pdf_extraction(self, markitdown):
         validate_strings(result, show_strings)
 
 
+class TestPdfFullOutputComparison:
+    """Loosely compare PDF extraction results with expected-output fixtures."""
+
+    @pytest.fixture
+    def markitdown(self):
+        """Return a MarkItDown converter built with default arguments."""
+        return MarkItDown()
+
+    def test_movie_theater_full_output(self, markitdown):
+        """Loosely check movie theater booking PDF output (line count, key text)."""
+        pdf_path = os.path.join(TEST_FILES_DIR, "movie-theater-booking-2024.pdf")
+        expected_path = os.path.join(
+            TEST_FILES_DIR, "expected_outputs", "movie-theater-booking-2024.md"
+        )
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        if not os.path.exists(expected_path):
+            pytest.skip(f"Expected output not found: {expected_path}")
+
+        result = markitdown.convert(pdf_path)
+        actual_output = result.text_content
+
+        with open(expected_path, "r", encoding="utf-8") as f:
+            expected_output = f.read()
+
+        # Normalize trailing whitespace; expected_lines is only used for its length
+        actual_lines = [line.rstrip() for line in actual_output.split("\n")]
+        expected_lines = [line.rstrip() for line in expected_output.split("\n")]
+
+        # Allow the line count to drift by at most 2 lines from the expected file
+        assert abs(len(actual_lines) - len(expected_lines)) <= 2, (
+            f"Line count mismatch: actual={len(actual_lines)}, "
+            f"expected={len(expected_lines)}"
+        )
+
+        # Check structural elements
+        assert actual_output.count("|") > 80, "Should have many pipe separators"
+        assert actual_output.count("---") > 8, "Should have table separators"
+
+        # Validate critical sections
+        for section in [
+            "BOOKING ORDER",
+            "STARLIGHT CINEMAS",
+            "2024-12-5678",
+            "Holiday Spectacular",
+            "$12,500.00",
+        ]:
+            assert section in actual_output, f"Missing section: {section}"
+
+        # Check table structure
+        table_rows = [line for line in actual_lines if line.startswith("|")]
+        assert len(table_rows) > 15, f"Should have >15 table rows, got {len(table_rows)}"
+
+    def test_sparse_borderless_table_full_output(self, markitdown):
+        """Test complete output for SPARSE borderless table PDF."""
+        pdf_path = os.path.join(
+            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
+        )
+        expected_path = os.path.join(
+            TEST_FILES_DIR,
+            "expected_outputs",
+            "SPARSE-2024-INV-1234_borderless_table.md",
+        )
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        if not os.path.exists(expected_path):
+            pytest.skip(f"Expected output not found: {expected_path}")
+
+        result = markitdown.convert(pdf_path)
+        actual_output = result.text_content
+
+        with open(expected_path, "r", encoding="utf-8") as f:
+            expected_output = f.read()
+
+        # Strip trailing whitespace per line; only the line counts are compared
+        actual_lines = [line.rstrip() for line in actual_output.split("\n")]
+        expected_lines = [line.rstrip() for line in expected_output.split("\n")]
+
+        # Check line count is close
+        assert abs(len(actual_lines) - len(expected_lines)) <= 2, (
+            f"Line count mismatch: actual={len(actual_lines)}, "
+            f"expected={len(expected_lines)}"
+        )
+
+        # Check structural elements
+        assert actual_output.count("|") > 50, "Should have many pipe separators"
+
+        # Validate critical sections
+        for section in [
+            "INVENTORY RECONCILIATION REPORT",
+            "SPARSE-2024-INV-1234",
+            "SKU-8847",
+            "SKU-9201",
+            "Variance Analysis",
+        ]:
+            assert section in actual_output, f"Missing section: {section}"
+
+    def test_repair_multipage_full_output(self, markitdown):
+        """Loosely check REPAIR multipage invoice PDF output (line count, key text)."""
+        pdf_path = os.path.join(TEST_FILES_DIR, "REPAIR-2022-INV-001_multipage.pdf")
+        expected_path = os.path.join(
+            TEST_FILES_DIR, "expected_outputs", "REPAIR-2022-INV-001_multipage.md"
+        )
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        if not os.path.exists(expected_path):
+            pytest.skip(f"Expected output not found: {expected_path}")
+
+        result = markitdown.convert(pdf_path)
+        actual_output = result.text_content
+
+        with open(expected_path, "r", encoding="utf-8") as f:
+            expected_output = f.read()
+
+        # Compare outputs
+        actual_lines = [line.rstrip() for line in actual_output.split("\n")]
+        expected_lines = [line.rstrip() for line in expected_output.split("\n")]
+
+        # Check line count is close
+        assert abs(len(actual_lines) - len(expected_lines)) <= 2, (
+            f"Line count mismatch: actual={len(actual_lines)}, "
+            f"expected={len(expected_lines)}"
+        )
+
+        # Check structural elements
+        assert actual_output.count("|") > 40, "Should have many pipe separators"
+
+        # Validate critical sections
+        for section in [
+            "ZAVA AUTO REPAIR",
+            "Gabriel Diaz",
+            "Jeep",
+            "Grand Cherokee",
+            "GRAND TOTAL",
+        ]:
+            assert section in actual_output, f"Missing section: {section}"
+
+    def test_receipt_full_output(self, markitdown):
+        """Loosely check RECEIPT retail purchase PDF output (line count, key text)."""
+        pdf_path = os.path.join(
+            TEST_FILES_DIR, "RECEIPT-2024-TXN-98765_retail_purchase.pdf"
+        )
+        expected_path = os.path.join(
+            TEST_FILES_DIR,
+            "expected_outputs",
+            "RECEIPT-2024-TXN-98765_retail_purchase.md",
+        )
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        if not os.path.exists(expected_path):
+            pytest.skip(f"Expected output not found: {expected_path}")
+
+        result = markitdown.convert(pdf_path)
+        actual_output = result.text_content
+
+        with open(expected_path, "r", encoding="utf-8") as f:
+            expected_output = f.read()
+
+        # Compare outputs
+        actual_lines = [line.rstrip() for line in actual_output.split("\n")]
+        expected_lines = [line.rstrip() for line in expected_output.split("\n")]
+
+        # Check line count is close
+        assert abs(len(actual_lines) - len(expected_lines)) <= 2, (
+            f"Line count mismatch: actual={len(actual_lines)}, "
+            f"expected={len(expected_lines)}"
+        )
+
+        # Validate critical sections
+        for section in [
+            "TECHMART ELECTRONICS",
+            "TXN-98765-2024",
+            "Sarah Mitchell",
+            "$821.14",
+            "RETURN POLICY",
+        ]:
+            assert section in actual_output, f"Missing section: {section}"
+
+    def test_academic_paper_full_output(self, markitdown):
+        """Loosely check academic paper PDF output (line count, no pipes, key text)."""
+        pdf_path = os.path.join(TEST_FILES_DIR, "test.pdf")
+        expected_path = os.path.join(TEST_FILES_DIR, "expected_outputs", "test.md")
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        if not os.path.exists(expected_path):
+            pytest.skip(f"Expected output not found: {expected_path}")
+
+        result = markitdown.convert(pdf_path)
+        actual_output = result.text_content
+
+        with open(expected_path, "r", encoding="utf-8") as f:
+            expected_output = f.read()
+
+        # Compare outputs
+        actual_lines = [line.rstrip() for line in actual_output.split("\n")]
+        expected_lines = [line.rstrip() for line in expected_output.split("\n")]
+
+        # Check line count is close
+        assert abs(len(actual_lines) - len(expected_lines)) <= 2, (
+            f"Line count mismatch: actual={len(actual_lines)}, "
+            f"expected={len(expected_lines)}"
+        )
+
+        # Academic paper should not have pipe separators
+        assert (
+            actual_output.count("|") == 0
+        ), "Academic paper should not have pipe separators"
+
+        # Validate critical sections
+        for section in [
+            "Introduction",
+            "Large language models",
+            "agents",
+            "multi-agent",
+        ]:
+            assert section in actual_output, f"Missing section: {section}"
+
+    def test_medical_scan_full_output(self, markitdown):
+        """Check that the scanned medical report PDF and its fixture are both empty."""
+        pdf_path = os.path.join(
+            TEST_FILES_DIR, "MEDRPT-2024-PAT-3847_medical_report_scan.pdf"
+        )
+        expected_path = os.path.join(
+            TEST_FILES_DIR,
+            "expected_outputs",
+            "MEDRPT-2024-PAT-3847_medical_report_scan.md",
+        )
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        if not os.path.exists(expected_path):
+            pytest.skip(f"Expected output not found: {expected_path}")
+
+        result = markitdown.convert(pdf_path)
+        actual_output = result.text_content
+
+        with open(expected_path, "r", encoding="utf-8") as f:
+            expected_output = f.read()
+
+        # Both should be empty (scanned PDF with no text layer)
+        assert (
+            actual_output.strip() == ""
+        ), "Scanned PDF should produce empty output"
+        assert (
+            expected_output.strip() == ""
+        ), "Expected output should be empty for scanned PDF"
+
+
 class TestPdfTableMarkdownFormat:
     """Test that extracted tables have proper markdown formatting."""
 

From bd20acd7c646a19c4987dce712d2fced5c2f47c5 Mon Sep 17 00:00:00 2001
From: Viktor Lesyk <vilesyk@microsoft.com>
Date: Tue, 10 Feb 2026 12:54:36 +0100
Subject: [PATCH 4/5] fix: correct formatting and improve assertions in PDF
 table tests

---
 packages/markitdown/src/markitdown/__about__.py |  2 +-
 packages/markitdown/tests/test_pdf_tables.py    | 13 ++++++-------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py
index d499fa78b..ff0280657 100644
--- a/packages/markitdown/src/markitdown/__about__.py
+++ b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.6b1"
\ No newline at end of file
+__version__ = "0.1.6b1"
diff --git a/packages/markitdown/tests/test_pdf_tables.py b/packages/markitdown/tests/test_pdf_tables.py
index 75ac6d381..d26de0cb9 100644
--- a/packages/markitdown/tests/test_pdf_tables.py
+++ b/packages/markitdown/tests/test_pdf_tables.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3 -m pytest
 """Tests for PDF table extraction functionality."""
+
 import os
 import re
 import pytest
@@ -656,9 +657,7 @@ def test_movie_theater_booking_pdf_extraction(self, markitdown):
         Expected output: Pipe-separated format with booking details, agency info,
         customer details, and show schedules in structured tables.
         """
-        pdf_path = os.path.join(
-            TEST_FILES_DIR, "movie-theater-booking-2024.pdf"
-        )
+        pdf_path = os.path.join(TEST_FILES_DIR, "movie-theater-booking-2024.pdf")
 
         if not os.path.exists(pdf_path):
             pytest.skip(f"Test file not found: {pdf_path}")
@@ -773,7 +772,9 @@ def test_movie_theater_full_output(self, markitdown):
 
         # Check table structure
         table_rows = [line for line in actual_lines if line.startswith("|")]
-        assert len(table_rows) > 15, f"Should have >15 table rows, got {len(table_rows)}"
+        assert (
+            len(table_rows) > 15
+        ), f"Should have >15 table rows, got {len(table_rows)}"
 
     def test_sparse_borderless_table_full_output(self, markitdown):
         """Test complete output for SPARSE borderless table PDF."""
@@ -971,9 +972,7 @@ def test_medical_scan_full_output(self, markitdown):
             expected_output = f.read()
 
         # Both should be empty (scanned PDF with no text layer)
-        assert (
-            actual_output.strip() == ""
-        ), "Scanned PDF should produce empty output"
+        assert actual_output.strip() == "", "Scanned PDF should produce empty output"
         assert (
             expected_output.strip() == ""
         ), "Expected output should be empty for scanned PDF"

From 51869c5155a1fe6761553254fcc8c52b077884af Mon Sep 17 00:00:00 2001
From: Viktor Lesyk <vilesyk@microsoft.com>
Date: Thu, 12 Feb 2026 09:53:45 +0100
Subject: [PATCH 5/5] chore: revert version to 0.1.5b2 in __about__.py

---
 packages/markitdown/src/markitdown/__about__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py
index ff0280657..e49b8c4d6 100644
--- a/packages/markitdown/src/markitdown/__about__.py
+++ b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.6b1"
+__version__ = "0.1.5b2"