Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,7 @@ cython_debug/
src/.DS_Store
.DS_Store
.cursorrules


create_test_file.py
test_currency.xlsx
56 changes: 50 additions & 6 deletions packages/markitdown/src/markitdown/converters/_xlsx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
_xlsx_dependency_exc_info = None
try:
import pandas as pd
import openpyxl # noqa: F401
import openpyxl
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()

Expand All @@ -33,9 +33,44 @@
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]


def _apply_cell_formats(wb, sheets):
    """
    Replace currency-formatted numbers in ``sheets`` with display strings.

    pandas yields only the raw numeric value of a cell, so the currency
    symbol carried by the cell's number format is lost. This walks every cell
    of each worksheet in ``wb`` and, for numeric cells whose number format
    contains a known currency symbol, overwrites the matching DataFrame entry
    in place with a string such as ``"$1,234.50"``.

    Args:
        wb: Workbook-like object. ``wb[name]`` must return a worksheet that
            provides ``iter_rows()`` yielding cells exposing ``value``,
            ``number_format`` and 1-based ``row`` / ``column`` attributes.
        sheets: Mapping of sheet name to DataFrame. The DataFrames are
            mutated in place; any column that receives a formatted value is
            first converted to ``object`` dtype.

    Returns:
        None. All changes are made to the DataFrames inside ``sheets``.

    Notes:
        * Worksheet row 1 is treated as the header row, so worksheet row
          ``r`` maps to DataFrame row ``r - 2``.
          NOTE(review): this only holds when the sheet was read with the
          first row as header and no rows skipped -- confirm with the caller.
        * Cells that fall outside the DataFrame (header row, or rows/columns
          the DataFrame does not have) are skipped instead of raising.
        * Formats containing ``.0`` are rendered with two decimals; all other
          currency formats are rendered as whole numbers, rounded rather
          than truncated.
    """
    currency_symbols = ("$", "€", "¥", "£")

    for sheet_name, df in sheets.items():
        ws = wb[sheet_name]
        n_rows, n_cols = df.shape
        # Columns already switched to object dtype; the conversion copies the
        # whole column, so do it once per column rather than once per cell.
        object_columns = set()

        for row in ws.iter_rows():
            for cell in row:
                value = cell.value
                # bool is a subclass of int but is never a currency amount.
                if isinstance(value, bool) or not isinstance(value, (int, float)):
                    continue

                fmt = cell.number_format or ""
                # First symbol (in declaration order) present in the format.
                symbol = next((s for s in currency_symbols if s in fmt), None)
                if symbol is None:
                    continue

                row_idx = cell.row - 2  # row 1 is the header, row 2 is index 0
                col_idx = cell.column - 1  # worksheet columns are 1-based
                if not (0 <= row_idx < n_rows and 0 <= col_idx < n_cols):
                    continue

                if col_idx not in object_columns:
                    # A numeric column cannot hold strings; widen it first.
                    col_name = df.columns[col_idx]
                    df[col_name] = df[col_name].astype(object)
                    object_columns.add(col_idx)

                if ".0" in fmt:
                    formatted = f"{symbol}{value:,.2f}"
                else:
                    # Round to the nearest whole number, as a zero-decimal
                    # format displays it, instead of truncating toward zero.
                    formatted = f"{symbol}{round(value):,}"
                df.iloc[row_idx, col_idx] = formatted

class XlsxConverter(DocumentConverter):
"""
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
Currency and other number formats are preserved using openpyxl.
"""

def __init__(self):
Expand All @@ -46,7 +81,7 @@ def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
Expand All @@ -64,7 +99,7 @@ def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> DocumentConverterResult:
# Check the dependencies
if _xlsx_dependency_exc_info is not None:
Expand All @@ -80,7 +115,16 @@ def convert(
_xlsx_dependency_exc_info[2]
)

# Read format info with openpyxl first
wb = openpyxl.load_workbook(file_stream, data_only=True)

# Seek back to start before pandas reads the same stream
file_stream.seek(0)
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")

# Apply currency and number formatting
_apply_cell_formats(wb, sheets)

md_content = ""
for s in sheets:
md_content += f"## {s}\n"
Expand Down Expand Up @@ -108,7 +152,7 @@ def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
Expand All @@ -126,7 +170,7 @@ def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> DocumentConverterResult:
# Load the dependencies
if _xls_dependency_exc_info is not None:
Expand Down Expand Up @@ -154,4 +198,4 @@ def convert(
+ "\n\n"
)

return DocumentConverterResult(markdown=md_content.strip())
return DocumentConverterResult(markdown=md_content.strip())