From 76a254a37551a0f01f78283b7e8bfcca40245af0 Mon Sep 17 00:00:00 2001 From: Viktor Lesyk Date: Tue, 10 Feb 2026 12:20:52 +0100 Subject: [PATCH 1/5] feat: enhance PDF table extraction to support complex forms and add new test cases --- .gitattributes | 3 + .../markitdown/converters/_pdf_converter.py | 5 +- .../test_files/movie-theater-booking-2024.pdf | Bin 0 -> 3798 bytes packages/markitdown/tests/test_pdf_tables.py | 134 ++++++++++++++++++ 4 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 packages/markitdown/tests/test_files/movie-theater-booking-2024.pdf diff --git a/.gitattributes b/.gitattributes index f787c0e47..304de55dd 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,5 @@ packages/markitdown/tests/test_files/** linguist-vendored packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored + +# Treat PDF files as binary to prevent line ending conversion +*.pdf binary diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index b692f169f..8e01628c7 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -142,11 +142,12 @@ def _extract_form_content_from_words(page: Any) -> str | None: all_table_x_positions.sort() global_columns: list[float] = [] for x in all_table_x_positions: - if not global_columns or x - global_columns[-1] > 30: + if not global_columns or x - global_columns[-1] > 35: global_columns.append(x) # Too many columns suggests dense text, not a form - if len(global_columns) > 8: + # Increased limit to support complex forms with many columns + if len(global_columns) > 20: return None # Now classify each row as table row or not diff --git a/packages/markitdown/tests/test_files/movie-theater-booking-2024.pdf b/packages/markitdown/tests/test_files/movie-theater-booking-2024.pdf new file mode 100644 index 0000000000000000000000000000000000000000..8555bb833d5edb63aee640e9c201288fbbeb9d46 GIT binary patch literal 3798 zcma)9*|w_66@A}dfel+xkw#Edut8MB9vc+w=v0(mNbb!;ejs_vd%iVz?m0JmoSTt^ z2diq;T5~ODEf_Vy1vh}Pa^(a3$KU_}p2AOjqzyk{S+)`#gDZ-@8J4@XvgnFB9@LU_~u zlNnjM3Lt$~_YBLu0f=O|UH4`E+b~ldGXcc_WEE@CuYR&vC%?)1JtR8`)|XW8)c++F z%zC!*|K!8}&wRv}e7&TUgrpZhz5hReOija5S??D>N!BP(#c-fj$AKe&EY$GLX6-jW z(fp0k7x0bAH#u)=6NE+R=_@lz0Fd`FAZzoNgt(ET-c0uU*!wo{_M+d6_fMO(t*C>qnu#G2T-n)%I>*FsffL zx^Z;(LLjfYp8Kog#V-gzdZDRCmgjQs%JN(TNAW6(Vi!oUhxyZKGt#L_}F{1t$K!}}@`fyB70sEksKm2wvxgX;N2zS``~|L z@EfUL_7GjqO|lm>@z)Wf#Ly6Hdj)FkoeW*UI$`f6Q1!A@4%ABA zrttfQKtisnIp-5z_Q=Dq+Tm#uISeVX{Mo>v!?k1|FNo7^z$iZ`q)ugUZ=mfkq|Xzs zki#C-^1zIQU7VYx8Gh0luXd+nrBc5ap%u7d6^_~*aFz-uXJ4!xrn~t}0(n!a*NY|e z&S$%u%xS#udNNd9z<93ekou!$%W>{Gifd_!6}3Od4#@(!eN|jFMmIrqQ9sRBxl_ zsJJJ~3a#>I?qW^V@oedPUeLaQqC@aes3=|70o09uEK@|5hfffk%l6oMH8u(nRAO_h@LsV{z!4twUzJ_lU|#Y(2Ym3D*#5(|97&40ytuZm(D}b4FM1&;52pj-O4U zSiTa|O$T2`Lx zxTV}Qq_6Wdtmp{*>}#vsrNHh*a>dryi#?4!0xYvc`jX!h!aeh9YRfBsV&Zj!TZ@%4 zh0IzK6jJBapinDbn*@Q8Zl%=kS@@#*sXZJQt3}0NLp8Z0bXZ1ZYB{l>_EWf&qoVzk za(lw0YE>J?4GY@0%ay|zqx0w}>|Y&>Zp`&l+^(^WM$nwQS%EzFYHK3SK(l|x)Swf0A2Gk<>SjDw%( z{dSA%8!u2UG%dl$>3Y;7(Xky@!#OE#wsQJ@$)pL@h$1|Ayl@S}?@KGy5m=uPY`4|G zxJb_zQZsf_;LL6`xyMrcDNLln(rPrHYNp-gs@rn+thC(hx>P<=1`}PcLaCkOwBC3L zv+g)pyXOWr8R}G(+#Ux5i|KX;7jAf$@`EFO^4hJfHk>t@ObUKAO|^EM*oSM;v}9^0 zYiPldeUK`Q&;GTDrSa=Ho@4%O4A!payt7zy9rV+FthQ}0OS1mPvCaMKG*E-{YMOiO z%HF{!RKZ8SAMMMtw1cy`en%6HK@=Envx4}3d+*>2^qOqgfJJ3gP-m`6?zuaTzFvnvY81@(DQFu{&=x>Obd{7^VD-K{>d z_L;Ud?XE&S%lZ)EFf(H`7d}foQ>3Ro=2Sbj*IH|u=Bu)?TkmjIZ4P}#fSqHuM9IAk zQcLO-N+4EN5YBd=e8V4&w%Pf;+-yW`(yWyA*GU;Rc*bm6Exp)c8L0PZ*VfRAW^E&V zmLN8|jq=FgGBKN*$+DChOj9SC8b8~Us&;M-DZz)=!)9uWVFZ!E>md#{U08Hxd^j}3 zRIU2Sr^Pb|A1`_u8}x3nT}YQ|tIakJbktLQZpoZ3?jti8JUck3jFl}xW>e$kh*y~A zYU8i_)H=1C*jA4r;rh{iE`4@u@Y5=MgQ7j#UBvSsJOxizk?usVS)EXIp~a4a%tYFT1N80I|@6NKS-|(s$_B$ObW|M3(^1DO61M= zd)IfN^Gj%6KTx3lf&TsgRg$cbz!mu2R!uJV?+0*yw_#`^k$$q_X#Eei3jW7f5=(BE zKgE*NAMq$6(MCV{)xYMALe;X(@SB_)alnT{n-#2Xka`6rO+%w}^r}@gvu+ZEUd3wE zRc-xmkK+gUMOEJZv#Nw)z_1dX@sqy%Kt}G>1K#RF{78h27XfdzlFV1uU(AH%Z}(m` SjK0xcA<)_f2=?cLkADHZ|4*9$ literal 0 HcmV?d00001 diff --git a/packages/markitdown/tests/test_pdf_tables.py b/packages/markitdown/tests/test_pdf_tables.py index 92a79ddc9..2f8d96d01 100644 --- a/packages/markitdown/tests/test_pdf_tables.py +++ b/packages/markitdown/tests/test_pdf_tables.py @@ -650,6 +650,140 @@ def test_scanned_pdf_handling(self, markitdown): result.text_content.strip() == "" ), f"Scanned PDF should have empty extraction, got: '{result.text_content[:100]}...'" + def test_broadcast_order_pdf_extraction(self, markitdown): + """Test extraction of broadcast order PDF with many columns. + + Expected output: Pipe-separated format with order details, agencies, + advertisers, and line items in structured tables. + """ + pdf_path = os.path.join( + TEST_FILES_DIR, "bf57756e-868b-4a9e-294a-81494d3ab2f1.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Validate pipe-separated table format + assert "|" in text_content, "Broadcast order should contain pipe separators" + + # Validate key order information + expected_strings = [ + "ORDER", + "1940495", # Order number + "MIKE BLOOMBERG 2020 INC", # Product description + "02/17/20 - 02/21/20", # Flight dates + "WOC12391815", # Alt order number + ] + validate_strings(result, expected_strings) + + # Validate agency information + agency_strings = [ + "Assembly / POL", # Agency name + "Heather Goldsmith", # Buying contact + "WTLV-TV", # Station + "Jim Quinn", # Primary AE + ] + validate_strings(result, agency_strings) + + # Validate advertiser information + advertiser_strings = [ + "POL/ Michael Bloomberg", + "A18-49", # Demographic + "PL-Presidential", # Product codes + ] + validate_strings(result, advertiser_strings) + + # Validate bill plan totals + billing_strings = [ + "$2,750.00", # Gross amount + "$2,337.50", # Net amount + "February 2020", # Month + ] + validate_strings(result, billing_strings) + + # Validate line item details + line_item_strings = [ + "WTLV", # Channel + "Local News @ 6p M-F", + "Lincoln Rhyme", + "$400.00", # Rate + "$800.00", # Rate + "$1,200.00", # Amount + ] + validate_strings(result, line_item_strings) + + def test_movie_theater_booking_pdf_extraction(self, markitdown): + """Test extraction of movie theater booking PDF with complex tables. + + Expected output: Pipe-separated format with booking details, agency info, + customer details, and show schedules in structured tables. + """ + pdf_path = os.path.join( + TEST_FILES_DIR, "movie-theater-booking-2024.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Validate pipe-separated table format + assert "|" in text_content, "Booking order should contain pipe separators" + + # Validate key booking information + expected_strings = [ + "BOOKING ORDER", + "2024-12-5678", # Order number + "Holiday Movie Marathon Package", # Product description + "12/20/2024 - 12/31/2024", # Booking dates + "SC-WINTER-2024", # Alt order number + "STARLIGHT CINEMAS", # Cinema brand + ] + validate_strings(result, expected_strings) + + # Validate agency information + agency_strings = [ + "Premier Entertainment Group", # Agency name + "Michael Chen", # Contact + "Sarah Johnson", # Primary contact + "Downtown Multiplex", # Cinema name + ] + validate_strings(result, agency_strings) + + # Validate customer information + customer_strings = [ + "Universal Studios Distribution", # Customer name + "Film Distributor", # Category + "CUST-98765", # Customer ID + ] + validate_strings(result, customer_strings) + + # Validate booking summary totals + booking_strings = [ + "$12,500.00", # Gross amount + "$11,250.00", # Net amount + "December 2024", # Month + "48", # Number of shows + ] + validate_strings(result, booking_strings) + + # Validate show schedule details + show_strings = [ + "Holiday Spectacular", # Movie title + "Winter Wonderland", # Movie title + "New Year Mystery", # Movie title + "IMAX 3D", # Format + "$250", # Rate + "$300", # Rate + "$3,000", # Revenue + "$3,600", # Revenue + ] + validate_strings(result, show_strings) + class TestPdfTableMarkdownFormat: """Test that extracted tables have proper markdown formatting.""" From be94561a617b5ba3072c692097cb7cec50303af1 Mon Sep 17 00:00:00 2001 From: Viktor Lesyk Date: Tue, 10 Feb 2026 12:21:36 +0100 Subject: [PATCH 2/5] chore: update version to 0.1.6b1 --- packages/markitdown/src/markitdown/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py index 4c8b68f6d..ff0280657 100644 --- a/packages/markitdown/src/markitdown/__about__.py +++ b/packages/markitdown/src/markitdown/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.1.4" +__version__ = "0.1.6b1" From a50f2bb98e19db00e45de190a3b8d084d8af6fa2 Mon Sep 17 00:00:00 2001 From: Viktor Lesyk Date: Tue, 10 Feb 2026 12:49:38 +0100 Subject: [PATCH 3/5] feat: enhance PDF table extraction with adaptive column clustering and add comprehensive test cases --- .../markitdown/converters/_pdf_converter.py | 56 ++- ...EDRPT-2024-PAT-3847_medical_report_scan.md | 0 .../RECEIPT-2024-TXN-98765_retail_purchase.md | 81 +++++ .../REPAIR-2022-INV-001_multipage.md | 76 ++++ .../SPARSE-2024-INV-1234_borderless_table.md | 44 +++ .../movie-theater-booking-2024.md | 62 ++++ .../tests/test_files/expected_outputs/test.md | 65 ++++ packages/markitdown/tests/test_pdf_tables.py | 324 ++++++++++++++---- 8 files changed, 638 insertions(+), 70 deletions(-) create mode 100644 packages/markitdown/tests/test_files/expected_outputs/MEDRPT-2024-PAT-3847_medical_report_scan.md create mode 100644 packages/markitdown/tests/test_files/expected_outputs/RECEIPT-2024-TXN-98765_retail_purchase.md create mode 100644 packages/markitdown/tests/test_files/expected_outputs/REPAIR-2022-INV-001_multipage.md create mode 100644 packages/markitdown/tests/test_files/expected_outputs/SPARSE-2024-INV-1234_borderless_table.md create mode 100644 packages/markitdown/tests/test_files/expected_outputs/movie-theater-booking-2024.md create mode 100644 packages/markitdown/tests/test_files/expected_outputs/test.md diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 8e01628c7..5134a1e35 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -138,16 +138,62 @@ def _extract_form_content_from_words(page: Any) -> str | None: if not all_table_x_positions: return None - # Compute global column boundaries + # Compute adaptive column clustering tolerance based on gap analysis all_table_x_positions.sort() + + # Calculate gaps between consecutive x-positions + gaps = [] + for i in range(len(all_table_x_positions) - 1): + gap = all_table_x_positions[i + 1] - all_table_x_positions[i] + if gap > 5: # Only significant gaps + gaps.append(gap) + + # Determine optimal tolerance using statistical analysis + if gaps and len(gaps) >= 3: + # Use 70th percentile of gaps as threshold (balances precision/recall) + sorted_gaps = sorted(gaps) + percentile_70_idx = int(len(sorted_gaps) * 0.70) + adaptive_tolerance = sorted_gaps[percentile_70_idx] + + # Clamp tolerance to reasonable range [25, 50] + adaptive_tolerance = max(25, min(50, adaptive_tolerance)) + else: + # Fallback to conservative value + adaptive_tolerance = 35 + + # Compute global column boundaries using adaptive tolerance global_columns: list[float] = [] for x in all_table_x_positions: - if not global_columns or x - global_columns[-1] > 35: + if not global_columns or x - global_columns[-1] > adaptive_tolerance: global_columns.append(x) - # Too many columns suggests dense text, not a form - # Increased limit to support complex forms with many columns - if len(global_columns) > 20: + # Adaptive max column check based on page characteristics + # Calculate average column width + if len(global_columns) > 1: + content_width = global_columns[-1] - global_columns[0] + avg_col_width = content_width / len(global_columns) + + # Forms with very narrow columns (< 30px) are likely dense text + if avg_col_width < 30: + return None + + # Compute adaptive max based on columns per inch + # Typical forms have 3-8 columns per inch + columns_per_inch = len(global_columns) / (content_width / 72) + + # If density is too high (> 10 cols/inch), likely not a form + if columns_per_inch > 10: + return None + + # Adaptive max: allow more columns for wider pages + # Standard letter is 612pt wide, so scale accordingly + adaptive_max_columns = int(20 * (page_width / 612)) + adaptive_max_columns = max(15, adaptive_max_columns) # At least 15 + + if len(global_columns) > adaptive_max_columns: + return None + else: + # Single column, not a form return None # Now classify each row as table row or not diff --git a/packages/markitdown/tests/test_files/expected_outputs/MEDRPT-2024-PAT-3847_medical_report_scan.md b/packages/markitdown/tests/test_files/expected_outputs/MEDRPT-2024-PAT-3847_medical_report_scan.md new file mode 100644 index 000000000..e69de29bb diff --git a/packages/markitdown/tests/test_files/expected_outputs/RECEIPT-2024-TXN-98765_retail_purchase.md b/packages/markitdown/tests/test_files/expected_outputs/RECEIPT-2024-TXN-98765_retail_purchase.md new file mode 100644 index 000000000..379de4df7 --- /dev/null +++ b/packages/markitdown/tests/test_files/expected_outputs/RECEIPT-2024-TXN-98765_retail_purchase.md @@ -0,0 +1,81 @@ +TECHMART ELECTRONICS +4567 Innovation Blvd +San Francisco, CA 94103 +(415) 555-0199 + +=================================== + +Store #0342 - Downtown SF +11/23/2024 14:32:18 PST +TXN: TXN-98765-2024 +Cashier: Emily Rodriguez +Register: POS-07 + +----------------------------------- + +Wireless Noise-Cancelling +Headphones - Premium Black +AUDIO-5521 1 @ $349.99 +Member Discount $-50.00 +$299.99 +USB-C Hub 7-in-1 Adapter +with HDMI & Ethernet +ACC-8834 2 @ $79.99 +$159.98 +Portable SSD 2TB +Thunderbolt 3 Compatible +STOR-2241 1 @ $289.00 +Member Discount $-29.00 +$260.00 +Ergonomic Wireless Mouse +Rechargeable Battery +ACC-9012 1 @ $59.99 +$59.99 +Screen Cleaning Kit +Professional Grade +CARE-1156 3 @ $12.99 +$38.97 +HDMI 2.1 Cable 6ft +8K Resolution Support +CABLE-7789 2 @ $24.99 +Member Discount $-5.00 +$44.98 +----------------------------------- + +SUBTOTAL $863.91 +Member Discount (15%)-$84.00 +Sales Tax (8.5%) $66.23 +Rewards Applied -$25.00 +=================================== +TOTAL $821.14 +=================================== + +PAYMENT METHOD +Visa Card ending in 4782 +Auth: 847392 +Ref: REF-20241123-98765 + +----------------------------------- + +REWARDS MEMBER +Sarah Mitchell +ID: TM-447821 +Points Earned: 821 +Total Points: 3,247 +Next Reward: $50 gift card +at 5,000 pts (1,753 to go) + +----------------------------------- + +RETURN POLICY +Returns within 30 days +Receipt required +Electronics must be unopened + +*TXN98765202411231432* + +Thank you for shopping! +www.techmart.example.com + +=================================== + diff --git a/packages/markitdown/tests/test_files/expected_outputs/REPAIR-2022-INV-001_multipage.md b/packages/markitdown/tests/test_files/expected_outputs/REPAIR-2022-INV-001_multipage.md new file mode 100644 index 000000000..e80967b45 --- /dev/null +++ b/packages/markitdown/tests/test_files/expected_outputs/REPAIR-2022-INV-001_multipage.md @@ -0,0 +1,76 @@ +ZAVA AUTO REPAIR +Certified Collision Repair +123 Main Street, Redmond, WA 98052 +Phone: (425) 000-0000 +Preliminary Estimate (ID: EST-1008) +| Customer Information | | | Vehicle Information | | +| -------------------- | ------------------- | --- | ------------------- | ----------------- | +| Insured name | Gabriel Diaz | | Year | 2022 | +| Claim # | SF-1008 | | Make | Jeep | +| Policy # | POL-2022-555 | | Model | Grand Cherokee | +| Phone | (425) 111-1111 | | Trim | Limited | +| Email | gabriel@contoso.com | | VIN | 1C4RJFBG2NC123456 | +| | | | Color | White | +| | | | Odometer | 9,800 | +| Repair Order # | RO-20221108 | | Estimator | Ellis Turner | +Estimate Totals +| | | Hours | Rate | Cost | +| ---------------- | --- | ----- | ---- | ----- | +| Parts | | | | 2,100 | +| Body Labor | | 2 | 150 | 300 | +| Paint Labor | | 1.5 | 150 | 225 | +| Mechanical Labor | | - | - | - | +Supplies +| | Paint Supplies | | | 60 | +| ------------- | ------------------------ | --- | ------ | ------ | +| | Body Supplies | | | 30 | +| Other Charges | | | | 15 | +| Subtotal | | | | 2,730 | +| Sales Tax | | | 10.20% | 278.46 | +| GRAND TOTAL | | | | 5,738 | +| Note | Minor rear bumper repair | | | | +This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found +after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be +present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to +models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for +any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is +deemed not related to an insurance claim, vehicle owner will be reponsible for charges. + +ZAVA AUTO REPAIR +Certified Collision Repair +123 Main Street, Redmond, WA 98052 +Phone: (425) 000-0000 +Preliminary Estimate (ID: EST-1008) +Customer Information Vehicle Information +| Insured name | Bruce Wayne | | Year | 2025 | +| -------------- | -------------------------- | --- | --------- | ------------ | +| Claim # | | 999 | Make | Batman | +| Policy # | IM-BATMAN | | Model | Batmobile | +| Phone | (416) 555-1234 | | Trim | Limited | +| Email | batman@wayneindustries.com | | VIN | XXX | +| | | | Color | Black | +| | | | Odometer | 1 | +| Repair Order # | RO-20221108 | | Estimator | Ellis Turner | +Estimate Totals +| | | Hours | Rate | Cost | +| ---------------- | --- | ----- | ---- | ------ | +| Parts | | | | 99,999 | +| Body Labor | | 2 | 150 | 300 | +| Paint Labor | | 1.5 | 150 | 225 | +| Mechanical Labor | | - | - | - | +Supplies +| | Paint Supplies | | | 60 | +| ------------- | ------------------------ | --- | ------ | --------- | +| | Body Supplies | | | 30 | +| Other Charges | | | | 15 | +| Subtotal | | | | 100,629 | +| Sales Tax | | | 10.20% | 10264.158 | +| GRAND TOTAL | | | | 211,522 | +| Note | Minor rear bumper repair | | | | + +This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found +after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be +present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to +models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for +any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is +deemed not related to an insurance claim, vehicle owner will be reponsible for charges. \ No newline at end of file diff --git a/packages/markitdown/tests/test_files/expected_outputs/SPARSE-2024-INV-1234_borderless_table.md b/packages/markitdown/tests/test_files/expected_outputs/SPARSE-2024-INV-1234_borderless_table.md new file mode 100644 index 000000000..797496452 --- /dev/null +++ b/packages/markitdown/tests/test_files/expected_outputs/SPARSE-2024-INV-1234_borderless_table.md @@ -0,0 +1,44 @@ +INVENTORY RECONCILIATION REPORT +Report ID: SPARSE-2024-INV-1234 +Warehouse: Distribution Center East +Report Date: 2024-11-15 +Prepared By: Sarah Martinez +| Product Code | Location | Expected | Actual | Variance | Status | +| ------------ | -------- | -------- | ------ | -------- | -------- | +| SKU-8847 | A-12 | 450 | | | | +| | B-07 | | 289 | -23 | | +| SKU-9201 | | 780 | 778 | | OK | +| | C-15 | | | +15 | | +| SKU-4563 | D-22 | | 156 | | CRITICAL | +| | | 180 | | -24 | | +| SKU-7728 | A-08 | 920 | | | | +| | | | 935 | +15 | OK | +Variance Analysis: +Summary Statistics: +Total Variance Cost: $4,287.50 +Critical Items: 1 +Overall Accuracy: 97.2% +Detailed Analysis by Category: +The inventory reconciliation reveals several key findings. The primary variance driver is SKU-4563, +which shows a -24 unit discrepancy requiring immediate investigation. Location B-07 handling of +SKU-8847 also demonstrates significant variance. Cross-location verification protocols should be + +reviewed to prevent future discrepancies. The overall accuracy rate of 97.2% meets our target +threshold, but critical items require expedited resolution to maintain operational efficiency. +Extended Inventory Review: +| Product Code | Category | Unit Cost | Total Value | Last Audit | Notes | +| ------------ | ----------- | --------- | ----------- | ---------- | ---------- | +| SKU-8847 | Electronics | $45.00 | $13,005.00 | 2024-10-15 | | +| SKU-9201 | Hardware | $32.50 | $25,285.00 | 2024-10-22 | Verified | +| SKU-4563 | Software | $120.00 | $18,720.00 | | Critical | +| SKU-7728 | Accessories | $15.75 | $14,726.25 | 2024-11-01 | | +| SKU-3345 | Electronics | $67.00 | $22,445.00 | 2024-10-18 | | +| SKU-5512 | Hardware | $89.00 | $31,150.00 | | Pending | +| SKU-6678 | Software | $200.00 | $42,000.00 | 2024-10-25 | High Value | +| SKU-7789 | Accessories | $8.50 | $5,950.00 | 2024-11-05 | | +| SKU-2234 | Electronics | $125.00 | $35,000.00 | | | +| SKU-1123 | Hardware | $55.00 | $27,500.00 | 2024-10-30 | Verified | +Recommendations: +1. Immediate review of SKU-4563 handling procedures. 2. Implement additional verification for critical +items. 3. Schedule follow-up audit for high-value products (SKU-6678, SKU-2234). +Approval: \ No newline at end of file diff --git a/packages/markitdown/tests/test_files/expected_outputs/movie-theater-booking-2024.md b/packages/markitdown/tests/test_files/expected_outputs/movie-theater-booking-2024.md new file mode 100644 index 000000000..371cee776 --- /dev/null +++ b/packages/markitdown/tests/test_files/expected_outputs/movie-theater-booking-2024.md @@ -0,0 +1,62 @@ +BOOKING ORDER +Print Date 12/15/2024 14:30:22 +Page 1 of 1 +STARLIGHT CINEMAS +Orders +| Order / Rev: | 2024-12-5678 | | | Cinema: | | Downtown Multiplex | +| ------------ | -------------- | --- | --- | ---------------- | --- | ------------------ | +| Alt Order #: | SC-WINTER-2024 | | | Primary Contact: | | Sarah Johnson | +Product Desc: Holiday Movie Marathon Package Location: NYC-01 +| Estimate: | EST-456 | | | Region: | | NORTHEAST | +| -------------------- | ----------------------- | --- | --- | ------- | --- | --------- | +| Booking Dates: | 12/20/2024 - 12/31/2024 | | | | | | +| Original Date / Rev: | 12/01/24 / 12/10/24 | | | | | | +| Order Type: | Premium Package | | | | | | +Booking Agency +| Name: | Premier Entertainment Group | | | | | | +| ---------------- | --------------------------- | --- | --- | -------------- | --- | --------- | +| | | | | Billing Type: | | Net 30 | +| Contact: | Michael Chen | | | | | | +| | | | | Payment Terms: | | Corporate | +| Billing Contact: | accounting@premierent.com | | | | | | +| | | | | Commission: | | 10% | +555 Broadway Suite 1200 +New York, NY 10012 +Customer +| Name: | Universal Studios Distribution | | | | | | +| -------------- | ------------------------------ | --- | --- | --- | --- | --- | +| Category: | Film Distributor | | | | | | +| Contact Email: | bookings@universalstudios.com | | | | | | +| Customer ID: | CUST-98765 | | | | | | +| Revenue Code: | FILM-PREMIUM | | | | | | +Booking Summary +| Start Date | End Date | # Shows | Gross Amount | Net Amount | | | +| ---------- | -------- | ------- | ------------ | ---------- | --- | --- | +| 12/20/24 | 12/31/24 | 48 | $12,500.00 | $11,250.00 | | | +Totals +| Month | # Shows | Gross Amount | | Net Amount | | Occupancy | +| ------------- | ------- | ------------ | --- | ---------- | --- | --------- | +| December 2024 | 48 | $12,500.00 | | $11,250.00 | | 85% | +| Totals | 48 | $12,500.00 | | $11,250.00 | | 85% | +Account Representatives +Representative Territory Region Start Date / End Date Commission % +| Sarah Johnson | NYC Metro | NORTHEAST | 12/20/24 - 12/31/24 | | 100% | | +| ------------- | --------- | --------- | ------------------- | --- | ---- | --- | +Show Schedule Details +Ln Screen Start End Movie Title Format Showtime Days Shows Rate Type Total +1 SCR-1 12/20/24 12/25/24 Holiday Spectacular IMAX 3D 7:00 PM Daily 12 $250 PM $3,000 +(Runtime: 142 min); Holiday Season Premium +2 SCR-2 12/20/24 12/31/24 Winter Wonderland Standard 4:30 PM Daily 24 $150 MT $3,600 +(Runtime: 98 min); Matinee Special +3 SCR-1 12/26/24 12/31/24 New Year Mystery 4DX 9:30 PM Daily 12 $300 PM $3,600 +(Runtime: 116 min); Premium Experience +Show Details +| Show Screen | Date Range | Title | Showtime | Days Type | Rate | Revenue | +| ----------- | ---------- | ----- | -------- | --------- | ---- | ------- | +1 SCR-1 12/20-12/25 Holiday Spectacular 7:00 PM Daily PM $250 $3,000 +This booking order is subject to cinema availability and standard terms. +2 SCR-2 12/20-12/31 Winter Wonderland 4:30 PM Daily MT $150 $3,600 +All showtimes are approximate and subject to change. +3 SCR-1 12/26-12/31 New Year Mystery 9:30 PM Daily PM $300 $3,600 +| Total Revenue: | | | | | | $12,500.00 | +| -------------- | --- | --- | --- | --- | --- | ---------- | \ No newline at end of file diff --git a/packages/markitdown/tests/test_files/expected_outputs/test.md b/packages/markitdown/tests/test_files/expected_outputs/test.md new file mode 100644 index 000000000..2d9c90c07 --- /dev/null +++ b/packages/markitdown/tests/test_files/expected_outputs/test.md @@ -0,0 +1,65 @@ +1 + +Introduction + +Large language models (LLMs) are becoming a crucial building block in developing powerful agents +that utilize LLMs for reasoning, tool usage, and adapting to new observations (Yao et al., 2022; Xi +et al., 2023; Wang et al., 2023b) in many real-world tasks. Given the expanding tasks that could +benefit from LLMs and the growing task complexity, an intuitive approach to scale up the power of +agents is to use multiple agents that cooperate. Prior work suggests that multiple agents can help +encourage divergent thinking (Liang et al., 2023), improve factuality and reasoning (Du et al., 2023), +and provide validation (Wu et al., 2023). In light of the intuition and early evidence of promise, it is +intriguing to ask the following question: how can we facilitate the development of LLM applications +that could span a broad spectrum of domains and complexities based on the multi-agent approach? + +Our insight is to use multi-agent conversations to achieve it. There are at least three reasons con- +firming its general feasibility and utility thanks to recent advances in LLMs: First, because chat- +optimized LLMs (e.g., GPT-4) show the ability to incorporate feedback, LLM agents can cooperate +through conversations with each other or human(s), e.g., a dialog where agents provide and seek rea- +soning, observations, critiques, and validation. Second, because a single LLM can exhibit a broad +range of capabilities (especially when configured with the correct prompt and inference settings), +conversations between differently configured agents can help combine these broad LLM capabilities +in a modular and complementary manner. Third, LLMs have demonstrated ability to solve complex +tasks when the tasks are broken into simpler subtasks. Multi-agent conversations can enable this +partitioning and integration in an intuitive manner. How can we leverage the above insights and +support different applications with the common requirement of coordinating multiple agents, poten- +tially backed by LLMs, humans, or tools exhibiting different capacities? We desire a multi-agent +conversation framework with generic abstraction and effective implementation that has the flexibil- +ity to satisfy different application needs. Achieving this requires addressing two critical questions: +(1) How can we design individual agents that are capable, reusable, customizable, and effective in +multi-agent collaboration? (2) How can we develop a straightforward, unified interface that can +accommodate a wide range of agent conversation patterns? In practice, applications of varying +complexities may need distinct sets of agents with specific capabilities, and may require different +conversation patterns, such as single- or multi-turn dialogs, different human involvement modes, and +static vs. dynamic conversation. Moreover, developers may prefer the flexibility to program agent +interactions in natural language or code. Failing to adequately address these two questions would +limit the framework’s scope of applicability and generality. +While there is contemporaneous exploration of multi-agent approaches,3 we present AutoGen, a +generalized multi-agent conversation framework (Figure 1), based on the following new concepts. +1 Customizable and conversable agents. AutoGen uses a generic design of agents that can lever- +age LLMs, human inputs, tools, or a combination of them. The result is that developers can +easily and quickly create agents with different roles (e.g., agents to write code, execute code, +wire in human feedback, validate outputs, etc.) by selecting and configuring a subset of built-in +capabilities. The agent’s backend can also be readily extended to allow more custom behaviors. +To make these agents suitable for multi-agent conversation, every agent is made conversable – +they can receive, react, and respond to messages. When configured properly, an agent can hold +multiple turns of conversations with other agents autonomously or solicit human inputs at cer- +tain rounds, enabling human agency and automation. The conversable agent design leverages the +strong capability of the most advanced LLMs in taking feedback and making progress via chat +and also allows combining capabilities of LLMs in a modular fashion. (Section 2.1) + +2 Conversation programming. A fundamental insight of AutoGen is to simplify and unify com- +plex LLM application workflows as multi-agent conversations. So AutoGen adopts a program- +ming paradigm centered around these inter-agent conversations. We refer to this paradigm as +conversation programming, which streamlines the development of intricate applications via two +primary steps: (1) defining a set of conversable agents with specific capabilities and roles (as +described above); (2) programming the interaction behavior between agents via conversation- +centric computation and control. Both steps can be achieved via a fusion of natural and pro- +gramming languages to build applications with a wide range of conversation patterns and agent +behaviors. AutoGen provides ready-to-use implementations and also allows easy extension and +experimentation for both steps. (Section 2.2) + +3We refer to Appendix A for a detailed discussion. + +2 + diff --git a/packages/markitdown/tests/test_pdf_tables.py b/packages/markitdown/tests/test_pdf_tables.py index 2f8d96d01..75ac6d381 100644 --- a/packages/markitdown/tests/test_pdf_tables.py +++ b/packages/markitdown/tests/test_pdf_tables.py @@ -650,71 +650,6 @@ def test_scanned_pdf_handling(self, markitdown): result.text_content.strip() == "" ), f"Scanned PDF should have empty extraction, got: '{result.text_content[:100]}...'" - def test_broadcast_order_pdf_extraction(self, markitdown): - """Test extraction of broadcast order PDF with many columns. - - Expected output: Pipe-separated format with order details, agencies, - advertisers, and line items in structured tables. - """ - pdf_path = os.path.join( - TEST_FILES_DIR, "bf57756e-868b-4a9e-294a-81494d3ab2f1.pdf" - ) - - if not os.path.exists(pdf_path): - pytest.skip(f"Test file not found: {pdf_path}") - - result = markitdown.convert(pdf_path) - text_content = result.text_content - - # Validate pipe-separated table format - assert "|" in text_content, "Broadcast order should contain pipe separators" - - # Validate key order information - expected_strings = [ - "ORDER", - "1940495", # Order number - "MIKE BLOOMBERG 2020 INC", # Product description - "02/17/20 - 02/21/20", # Flight dates - "WOC12391815", # Alt order number - ] - validate_strings(result, expected_strings) - - # Validate agency information - agency_strings = [ - "Assembly / POL", # Agency name - "Heather Goldsmith", # Buying contact - "WTLV-TV", # Station - "Jim Quinn", # Primary AE - ] - validate_strings(result, agency_strings) - - # Validate advertiser information - advertiser_strings = [ - "POL/ Michael Bloomberg", - "A18-49", # Demographic - "PL-Presidential", # Product codes - ] - validate_strings(result, advertiser_strings) - - # Validate bill plan totals - billing_strings = [ - "$2,750.00", # Gross amount - "$2,337.50", # Net amount - "February 2020", # Month - ] - validate_strings(result, billing_strings) - - # Validate line item details - line_item_strings = [ - "WTLV", # Channel - "Local News @ 6p M-F", - "Lincoln Rhyme", - "$400.00", # Rate - "$800.00", # Rate - "$1,200.00", # Amount - ] - validate_strings(result, line_item_strings) - def test_movie_theater_booking_pdf_extraction(self, markitdown): """Test extraction of movie theater booking PDF with complex tables. @@ -785,6 +720,265 @@ def test_movie_theater_booking_pdf_extraction(self, markitdown): validate_strings(result, show_strings) +class TestPdfFullOutputComparison: + """Test that PDF extraction produces expected complete outputs.""" + + @pytest.fixture + def markitdown(self): + """Create MarkItDown instance.""" + return MarkItDown() + + def test_movie_theater_full_output(self, markitdown): + """Test complete output for movie theater booking PDF.""" + pdf_path = os.path.join(TEST_FILES_DIR, "movie-theater-booking-2024.pdf") + expected_path = os.path.join( + TEST_FILES_DIR, "expected_outputs", "movie-theater-booking-2024.md" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + if not os.path.exists(expected_path): + pytest.skip(f"Expected output not found: {expected_path}") + + result = markitdown.convert(pdf_path) + actual_output = result.text_content + + with open(expected_path, "r", encoding="utf-8") as f: + expected_output = f.read() + + # Compare outputs + actual_lines = [line.rstrip() for line in actual_output.split("\n")] + expected_lines = [line.rstrip() for line in expected_output.split("\n")] + + # Check line count + assert abs(len(actual_lines) - len(expected_lines)) <= 2, ( + f"Line count mismatch: actual={len(actual_lines)}, " + f"expected={len(expected_lines)}" + ) + + # Check structural elements + assert actual_output.count("|") > 80, "Should have many pipe separators" + assert actual_output.count("---") > 8, "Should have table separators" + + # Validate critical sections + for section in [ + "BOOKING ORDER", + "STARLIGHT CINEMAS", + "2024-12-5678", + "Holiday Spectacular", + "$12,500.00", + ]: + assert section in actual_output, f"Missing section: {section}" + + # Check table structure + table_rows = [line for line in actual_lines if line.startswith("|")] + assert len(table_rows) > 15, f"Should have >15 table rows, got {len(table_rows)}" + + def test_sparse_borderless_table_full_output(self, markitdown): + """Test complete output for SPARSE borderless table PDF.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + expected_path = os.path.join( + TEST_FILES_DIR, + "expected_outputs", + "SPARSE-2024-INV-1234_borderless_table.md", + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + if not os.path.exists(expected_path): + pytest.skip(f"Expected output not found: {expected_path}") + + result = markitdown.convert(pdf_path) + actual_output = result.text_content + + with open(expected_path, "r", encoding="utf-8") as f: + expected_output = f.read() + + # Compare outputs + actual_lines = [line.rstrip() for line in actual_output.split("\n")] + expected_lines = [line.rstrip() for line in expected_output.split("\n")] + + # Check line count is close + assert abs(len(actual_lines) - len(expected_lines)) <= 2, ( + f"Line count mismatch: actual={len(actual_lines)}, " + f"expected={len(expected_lines)}" + ) + + # Check structural elements + assert actual_output.count("|") > 50, "Should have many pipe separators" + + # Validate critical sections + for section in [ + "INVENTORY RECONCILIATION REPORT", + "SPARSE-2024-INV-1234", + "SKU-8847", + "SKU-9201", + "Variance Analysis", + ]: + assert section in actual_output, f"Missing section: {section}" + + def test_repair_multipage_full_output(self, markitdown): + """Test complete output for REPAIR multipage invoice PDF.""" + pdf_path = os.path.join(TEST_FILES_DIR, "REPAIR-2022-INV-001_multipage.pdf") + expected_path = os.path.join( + TEST_FILES_DIR, "expected_outputs", "REPAIR-2022-INV-001_multipage.md" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + if not os.path.exists(expected_path): + pytest.skip(f"Expected output not found: {expected_path}") + + result = markitdown.convert(pdf_path) + actual_output = result.text_content + + with open(expected_path, "r", encoding="utf-8") as f: + expected_output = f.read() + + # Compare outputs + actual_lines = [line.rstrip() for line in actual_output.split("\n")] + expected_lines = [line.rstrip() for line in expected_output.split("\n")] + + # Check line count is close + assert abs(len(actual_lines) - len(expected_lines)) <= 2, ( + f"Line count mismatch: actual={len(actual_lines)}, " + f"expected={len(expected_lines)}" + ) + + # Check structural elements + assert actual_output.count("|") > 40, "Should have many pipe separators" + + # Validate critical sections + for section in [ + "ZAVA AUTO REPAIR", + "Gabriel Diaz", + "Jeep", + "Grand Cherokee", + "GRAND TOTAL", + ]: + assert section in actual_output, f"Missing section: {section}" + + def test_receipt_full_output(self, markitdown): + """Test complete output for RECEIPT retail purchase PDF.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "RECEIPT-2024-TXN-98765_retail_purchase.pdf" + ) + expected_path = os.path.join( + TEST_FILES_DIR, + "expected_outputs", + "RECEIPT-2024-TXN-98765_retail_purchase.md", + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + if not os.path.exists(expected_path): + pytest.skip(f"Expected output not found: {expected_path}") + + result = markitdown.convert(pdf_path) + actual_output = result.text_content + + with open(expected_path, "r", encoding="utf-8") as f: + expected_output = f.read() + + # Compare outputs + actual_lines = [line.rstrip() for line in actual_output.split("\n")] + expected_lines = [line.rstrip() for line in expected_output.split("\n")] + + # Check line count is close + assert abs(len(actual_lines) - len(expected_lines)) <= 2, ( + f"Line count mismatch: actual={len(actual_lines)}, " + f"expected={len(expected_lines)}" + ) + + # Validate critical sections + for section in [ + "TECHMART ELECTRONICS", + "TXN-98765-2024", + "Sarah Mitchell", + "$821.14", + "RETURN POLICY", + ]: + assert section in actual_output, f"Missing section: {section}" + + def test_academic_paper_full_output(self, markitdown): + """Test complete output for academic paper PDF.""" + pdf_path = os.path.join(TEST_FILES_DIR, "test.pdf") + expected_path = os.path.join(TEST_FILES_DIR, "expected_outputs", "test.md") + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + if not os.path.exists(expected_path): + pytest.skip(f"Expected output not found: {expected_path}") + + result = markitdown.convert(pdf_path) + actual_output = result.text_content + + with open(expected_path, "r", encoding="utf-8") as f: + expected_output = f.read() + + # Compare outputs + actual_lines = [line.rstrip() for line in actual_output.split("\n")] + expected_lines = [line.rstrip() for line in expected_output.split("\n")] + + # Check line count is close + assert abs(len(actual_lines) - len(expected_lines)) <= 2, ( + f"Line count mismatch: actual={len(actual_lines)}, " + f"expected={len(expected_lines)}" + ) + + # Academic paper should not have pipe separators + assert ( + actual_output.count("|") == 0 + ), "Academic paper should not have pipe separators" + + # Validate critical sections + for section in [ + "Introduction", + "Large language models", + "agents", + "multi-agent", + ]: + assert section in actual_output, f"Missing section: {section}" + + def test_medical_scan_full_output(self, markitdown): + """Test complete output for medical report scan PDF (empty, no text layer).""" + pdf_path = os.path.join( + TEST_FILES_DIR, "MEDRPT-2024-PAT-3847_medical_report_scan.pdf" + ) + expected_path = os.path.join( + TEST_FILES_DIR, + "expected_outputs", + "MEDRPT-2024-PAT-3847_medical_report_scan.md", + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + if not os.path.exists(expected_path): + pytest.skip(f"Expected output not found: {expected_path}") + + result = markitdown.convert(pdf_path) + actual_output = result.text_content + + with open(expected_path, "r", encoding="utf-8") as f: + expected_output = f.read() + + # Both should be empty (scanned PDF with no text layer) + assert ( + actual_output.strip() == "" + ), "Scanned PDF should produce empty output" + assert ( + expected_output.strip() == "" + ), "Expected output should be empty for scanned PDF" + + class TestPdfTableMarkdownFormat: """Test that extracted tables have proper markdown formatting.""" From bd20acd7c646a19c4987dce712d2fced5c2f47c5 Mon Sep 17 00:00:00 2001 From: Viktor Lesyk Date: Tue, 10 Feb 2026 12:54:36 +0100 Subject: [PATCH 4/5] fix: correct formatting and improve assertions in PDF table tests --- packages/markitdown/src/markitdown/__about__.py | 2 +- packages/markitdown/tests/test_pdf_tables.py | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py index d499fa78b..ff0280657 100644 --- a/packages/markitdown/src/markitdown/__about__.py +++ b/packages/markitdown/src/markitdown/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.1.6b1" \ No newline at end of file +__version__ = "0.1.6b1" diff --git a/packages/markitdown/tests/test_pdf_tables.py b/packages/markitdown/tests/test_pdf_tables.py index 75ac6d381..d26de0cb9 100644 --- a/packages/markitdown/tests/test_pdf_tables.py +++ b/packages/markitdown/tests/test_pdf_tables.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 -m pytest """Tests for PDF table extraction functionality.""" + import os import re import pytest @@ -656,9 +657,7 @@ def test_movie_theater_booking_pdf_extraction(self, markitdown): Expected output: Pipe-separated format with booking details, agency info, customer details, and show schedules in structured tables. """ - pdf_path = os.path.join( - TEST_FILES_DIR, "movie-theater-booking-2024.pdf" - ) + pdf_path = os.path.join(TEST_FILES_DIR, "movie-theater-booking-2024.pdf") if not os.path.exists(pdf_path): pytest.skip(f"Test file not found: {pdf_path}") @@ -773,7 +772,9 @@ def test_movie_theater_full_output(self, markitdown): # Check table structure table_rows = [line for line in actual_lines if line.startswith("|")] - assert len(table_rows) > 15, f"Should have >15 table rows, got {len(table_rows)}" + assert ( + len(table_rows) > 15 + ), f"Should have >15 table rows, got {len(table_rows)}" def test_sparse_borderless_table_full_output(self, markitdown): """Test complete output for SPARSE borderless table PDF.""" @@ -971,9 +972,7 @@ def test_medical_scan_full_output(self, markitdown): expected_output = f.read() # Both should be empty (scanned PDF with no text layer) - assert ( - actual_output.strip() == "" - ), "Scanned PDF should produce empty output" + assert actual_output.strip() == "", "Scanned PDF should produce empty output" assert ( expected_output.strip() == "" ), "Expected output should be empty for scanned PDF" From 51869c5155a1fe6761553254fcc8c52b077884af Mon Sep 17 00:00:00 2001 From: Viktor Lesyk Date: Thu, 12 Feb 2026 09:53:45 +0100 Subject: [PATCH 5/5] chore: revert version to 0.1.5b2 in __about__.py --- packages/markitdown/src/markitdown/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py index ff0280657..e49b8c4d6 100644 --- a/packages/markitdown/src/markitdown/__about__.py +++ b/packages/markitdown/src/markitdown/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.1.6b1" +__version__ = "0.1.5b2"