From b6ed1af912de6a8ab37df83f63e55020e12e31df Mon Sep 17 00:00:00 2001
From: Spencer Ogden
Date: Thu, 26 Mar 2026 11:15:27 -0400
Subject: [PATCH] Trim trailing empty rows/columns from Excel sheets before
 rendering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Excel files (especially .xls) often pad to fixed dimensions (256 columns,
65536 rows) even when only a few cells contain data. When every empty
cell is rendered through to_html() → Markdown, this causes extreme memory
usage and enormous output. A real-world 57 KB .xls file produced 95 MB of
Markdown (1,700x expansion) and consumed 13+ GB of RAM.

This trims trailing all-NaN rows and columns before calling to_html().
Only trailing empties are removed, preserving intentional blank rows or
columns used as visual separators within the data area. A column with a
real text header is never treated as empty, so headings are kept even
when the cells below them are blank.

Applied to both XlsxConverter and XlsConverter.
---
 .../markitdown/converters/_xlsx_converter.py  | 51 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index 4186ec773..4a643757e 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -33,6 +33,51 @@
 ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
 
 
+def _trim_trailing_empty(df: "pd.DataFrame") -> "pd.DataFrame":
+    """Trim trailing empty rows and columns from a DataFrame.
+
+    Excel files often pad to fixed dimensions (e.g. 256 columns in .xls)
+    even when only a few columns contain data. When every empty cell is
+    rendered to HTML/Markdown, this padding can cause extreme memory usage
+    and enormous output (a 57 KB .xls producing 95 MB of Markdown).
+
+    Only *trailing* empties are removed so that intentional blank rows or
+    columns used as visual separators within the data are preserved.
+
+    A row is empty when all of its cells are NaN. A column is empty when
+    all of its cells are NaN and its label is an ``Unnamed: N`` placeholder
+    (what pandas assigns to a blank header cell); a column with a real text
+    header is kept even without values, so text headings survive trimming.
+
+    Args:
+        df: A single worksheet loaded into a DataFrame.
+
+    Returns:
+        The leading rows and columns of ``df`` up to and including the last
+        non-empty row and column. ``df`` itself is not modified.
+    """
+    if df.shape[1] == 0:
+        return df
+
+    has_value = df.notna()
+
+    # Rows: everything after the last row holding a value is padding.
+    used_rows = has_value.any(axis=1).to_numpy(dtype=bool).nonzero()[0]
+    n_rows = int(used_rows[-1]) + 1 if used_rows.size else 0
+
+    # Columns: a text header counts as content even when the cells below it
+    # are blank; non-string labels fall back to the value-only check.
+    has_header = [
+        isinstance(label, str) and not label.startswith("Unnamed: ")
+        for label in df.columns
+    ]
+    col_in_use = has_value.any(axis=0).to_numpy(dtype=bool) | has_header
+    used_cols = col_in_use.nonzero()[0]
+    n_cols = int(used_cols[-1]) + 1 if used_cols.size else 0
+
+    return df.iloc[:n_rows, :n_cols]
+
+
 class XlsxConverter(DocumentConverter):
     """
     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
@@ -84,7 +129,8 @@ def convert(
         md_content = ""
         for s in sheets:
             md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
+            df = _trim_trailing_empty(sheets[s])
+            html_content = df.to_html(index=False)
             md_content += (
                 self._html_converter.convert_string(
                     html_content, **kwargs
@@ -146,7 +192,8 @@ def convert(
         md_content = ""
         for s in sheets:
             md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
+            df = _trim_trailing_empty(sheets[s])
+            html_content = df.to_html(index=False)
             md_content += (
                 self._html_converter.convert_string(
                     html_content, **kwargs