From b6ed1af912de6a8ab37df83f63e55020e12e31df Mon Sep 17 00:00:00 2001
From: Spencer Ogden <spencer@spencerogden.com>
Date: Thu, 26 Mar 2026 11:15:27 -0400
Subject: [PATCH] Trim trailing empty rows/columns from Excel sheets before
 rendering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Excel files (especially .xls) often pad to fixed dimensions (256 columns,
65536 rows) even when only a few cells contain data. When every empty cell
is rendered through to_html() → Markdown, this causes extreme memory usage
and enormous output. A real-world 57 KB .xls file produced 95 MB of
Markdown (1,700x expansion) and consumed 13+ GB of RAM.

This trims trailing all-NaN rows and columns before calling to_html().
Only trailing empties are removed, preserving intentional blank rows or
columns used as visual separators within the data area.

Applied to both XlsxConverter and XlsConverter.
---
 .../markitdown/converters/_xlsx_converter.py  | 39 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index 4186ec773..4a643757e 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -33,6 +33,39 @@
 ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
 
 
+def _trim_trailing_empty(df: "pd.DataFrame") -> "pd.DataFrame":
+    """Trim trailing empty rows and columns from a sheet's DataFrame.
+
+    Excel files often pad to fixed dimensions (e.g. 256 columns in .xls)
+    even when only a few cells contain data; rendering every empty cell
+    to HTML/Markdown can turn a 57 KB .xls into 95 MB of Markdown.
+
+    A row is empty when all of its cells are NaN.  A column is empty when
+    all of its cells are NaN *and* its label looks like the "Unnamed: N"
+    placeholder pandas gives blank header cells, so a labelled header is
+    kept even if nothing is filled in below it.  Only *trailing* empties
+    are removed; blank separator rows/columns inside the data survive.
+    Returns a positional slice of ``df`` (``df`` itself if already empty).
+    """
+    if df.empty:
+        return df
+
+    filled = df.notna().to_numpy()  # True where a cell holds a value
+    labelled = [not str(c).startswith("Unnamed:") for c in df.columns]
+
+    # Width to keep: up to the last column holding a value or a real label
+    # (argmax on the reversed mask finds the first True from the right).
+    keep_cols = filled.any(axis=0) | labelled
+    n_cols = len(keep_cols) - keep_cols[::-1].argmax() if keep_cols.any() else 0
+
+    # Height to keep: up to the last row holding a value.
+    keep_rows = filled.any(axis=1)
+    n_rows = len(keep_rows) - keep_rows[::-1].argmax() if keep_rows.any() else 0
+
+    # A sheet with headers but no values keeps its header row (0 data rows).
+    return df.iloc[:n_rows, :n_cols]
+
+
 class XlsxConverter(DocumentConverter):
     """
     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
@@ -84,7 +117,8 @@ def convert(
         md_content = ""
         for s in sheets:
             md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
+            df = _trim_trailing_empty(sheets[s])
+            html_content = df.to_html(index=False)
             md_content += (
                 self._html_converter.convert_string(
                     html_content, **kwargs
@@ -146,7 +180,8 @@ def convert(
         md_content = ""
         for s in sheets:
             md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
+            df = _trim_trailing_empty(sheets[s])
+            html_content = df.to_html(index=False)
             md_content += (
                 self._html_converter.convert_string(
                     html_content, **kwargs