Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 37 additions & 2 deletions packages/markitdown/src/markitdown/converters/_xlsx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,39 @@
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]


def _trim_trailing_empty(df: "pd.DataFrame") -> "pd.DataFrame":
"""Trim trailing all-NaN rows and columns from a DataFrame.

Excel files often pad to fixed dimensions (e.g. 256 columns in .xls)
even when only a few columns contain data. When every empty cell is
rendered to HTML/Markdown, this padding can cause extreme memory usage
and enormous output (a 57 KB .xls producing 95 MB of Markdown).

Only *trailing* empties are removed so that intentional blank rows or
columns used as visual separators within the data are preserved.
"""
if df.empty:
return df

# Trim trailing all-NaN columns
has_data = df.notna().any() # bool per column
if not has_data.any():
return df.iloc[:0, :0]
last_col = has_data.values[::-1].argmax() # first True from the right
if last_col > 0:
df = df.iloc[:, : len(has_data) - last_col]

# Trim trailing all-NaN rows
has_data = df.notna().any(axis=1) # bool per row
if not has_data.any():
return df.iloc[:0, :0]
last_row = has_data.values[::-1].argmax() # first True from the bottom
if last_row > 0:
df = df.iloc[: len(has_data) - last_row]

return df


class XlsxConverter(DocumentConverter):
"""
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
Expand Down Expand Up @@ -84,7 +117,8 @@ def convert(
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
df = _trim_trailing_empty(sheets[s])
html_content = df.to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
Expand Down Expand Up @@ -146,7 +180,8 @@ def convert(
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
df = _trim_trailing_empty(sheets[s])
html_content = df.to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
Expand Down