From 7239c2c20c9f55f3b2524a2af7ce9292acefe8c5 Mon Sep 17 00:00:00 2001
From: zhaorong566
Date: Sat, 21 Mar 2026 16:24:34 +0800
Subject: [PATCH 1/2] fix: preserve currency number format when converting xlsx
 to markdown

- Load workbook with openpyxl to detect number_format on each cell
- Convert DataFrame column to object type before assigning formatted string
- Supports USD, EUR, JPY, GBP currency symbols
- Take the decimal places from the number format and round instead of
  truncating
- Skip booleans, locale-only "[$-409]" tags and cells outside the DataFrame
- Restore the stream position with tell()/seek() before pandas reads it

Fixes #53
---
 .../markitdown/converters/_xlsx_converter.py  | 71 ++++++++++++++++++-
 1 file changed, 70 insertions(+), 1 deletion(-)

diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index 4186ec773..45206c7cf 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -10,7 +10,7 @@
 _xlsx_dependency_exc_info = None
 try:
     import pandas as pd
-    import openpyxl  # noqa: F401
+    import openpyxl
 except ImportError:
     _xlsx_dependency_exc_info = sys.exc_info()
 
@@ -33,9 +33,69 @@
 ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
 
 
+_CURRENCY_SYMBOLS = ("$", "€", "¥", "£")
+
+
+def _format_currency(value, number_format):
+    """
+    Render a numeric cell value as currency text based on its Excel number format.
+
+    Returns None when number_format contains no supported currency symbol.
+    """
+    # "[$-409]" is a locale tag that carries no currency symbol; drop its "$"
+    number_format = number_format.replace("[$-", "[-")
+    symbol = next((s for s in _CURRENCY_SYMBOLS if s in number_format), None)
+    if symbol is None:
+        return None
+    # The first ";"-separated section applies to positive numbers; the zeros
+    # after its decimal point give the number of displayed decimal places.
+    _, _, fraction = number_format.split(";")[0].partition(".")
+    decimals = len(fraction) - len(fraction.lstrip("0"))
+    # Round instead of truncating, and keep the minus sign before the symbol
+    sign = "-" if value < 0 else ""
+    return f"{sign}{symbol}{abs(value):,.{decimals}f}"
+
+
+def _apply_cell_formats(wb, sheets):
+    """
+    Use openpyxl to detect currency-formatted cells and override
+    the raw numeric values that pandas extracted.
+
+    wb is the openpyxl workbook and sheets the {sheet name: DataFrame} mapping
+    that pd.read_excel(..., sheet_name=None) built from the same file. The
+    DataFrames are modified in place.
+    """
+    for sheet_name, df in sheets.items():
+        ws = wb[sheet_name]
+        object_cols = set()  # Column positions already cast to object dtype
+
+        # pandas used row 1 as the header (read_excel default), so data starts at row 2
+        for row in ws.iter_rows(min_row=2):
+            for cell in row:
+                value = cell.value
+                # bool is an int subclass but never a currency amount
+                if isinstance(value, bool) or not isinstance(value, (int, float)):
+                    continue
+                formatted = _format_currency(value, cell.number_format or "")
+                if formatted is None:
+                    continue
+
+                row_idx = cell.row - 2  # Convert to DataFrame row index
+                col_idx = cell.column - 1  # Convert to DataFrame column index
+                if row_idx >= len(df) or col_idx >= len(df.columns):
+                    continue
+                if col_idx not in object_cols:
+                    # Cast once per column so it can hold formatted strings
+                    col_name = df.columns[col_idx]
+                    df[col_name] = df[col_name].astype(object)
+                    object_cols.add(col_idx)
+                df.iloc[row_idx, col_idx] = formatted
+
+
 class XlsxConverter(DocumentConverter):
     """
     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
+    Currency number formats are preserved using openpyxl.
     """
 
     def __init__(self):
@@ -80,7 +140,16 @@ def convert(
                 _xlsx_dependency_exc_info[2]
             )
 
+        # Read format info with openpyxl first, then rewind so pandas can
+        # read the same stream from where it started
+        start = file_stream.tell()
+        wb = openpyxl.load_workbook(file_stream, data_only=True)
+        file_stream.seek(start)
         sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
+
+        # Replace raw numbers with their currency-formatted text
+        _apply_cell_formats(wb, sheets)
+
         md_content = ""
         for s in sheets:
             md_content += f"## {s}\n"

From dd66732bcf27d328a182dfa703e91d8789a44620 Mon Sep 17 00:00:00 2001
From: zhaorong566
Date: Sat, 21 Mar 2026 16:34:29 +0800
Subject: [PATCH 2/2] chore: ignore test files

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index 15613ea8a..9409e2d16 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,3 +166,7 @@ cython_debug/
 src/.DS_Store
 .DS_Store
 .cursorrules
+
+
+create_test_file.py
+test_currency.xlsx