From 5be5bee201dab0dc23bd1f8b39bc6edc3c07be86 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:30:35 +0000 Subject: [PATCH 01/24] feat(file-based): Add Calamine-first with Openpyxl fallback for Excel parser Implements a fallback mechanism for Excel file parsing to handle edge cases where Calamine fails (e.g., invalid date values like year 20225). The parser now tries Calamine first for performance, then falls back to Openpyxl if Calamine encounters an error. Changes: - Modified open_and_parse_file() to implement try-catch with fallback logic - Added logger parameter to log when fallback is triggered - Added openpyxl as optional dependency in pyproject.toml - Added openpyxl to file-based extras list This resolves crashes in Google Drive source when processing large numbers of Excel files with malformed data, allowing syncs to complete successfully instead of failing entirely. Fixes: airbytehq/oncall#10097 Co-Authored-By: unknown <> --- .../file_based/file_types/excel_parser.py | 34 ++++++++++++++----- pyproject.toml | 3 +- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index f99ca0180..d3e1be488 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -64,7 +64,7 @@ async def infer_schema( fields: Dict[str, str] = {} with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: - df = self.open_and_parse_file(fp) + df = self.open_and_parse_file(fp, logger, file.uri) for column, df_type in df.dtypes.items(): # Choose the broadest data type if the column's data type differs in dataframes prev_frame_column_type = fields.get(column) # type: ignore [call-overload] @@ -92,7 +92,7 @@ def parse_records( discovered_schema: Optional[Mapping[str, SchemaType]] = None, ) -> Iterable[Dict[str, Any]]: """ - Parses records from an Excel file based on the provided configuration. + Parses records from an Excel file with fallback error handling. Args: config (FileBasedStreamConfig): Configuration for the file-based stream. @@ -111,7 +111,7 @@ def parse_records( try: # Open and parse the file using the stream reader with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: - df = self.open_and_parse_file(fp) + df = self.open_and_parse_file(fp, logger, file.uri) # Yield records as dictionaries # DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson # DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior @@ -181,14 +181,30 @@ def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None: raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) @staticmethod - def open_and_parse_file(fp: Union[IOBase, str, Path]) -> pd.DataFrame: + def open_and_parse_file( + fp: Union[IOBase, str, Path], + logger: Optional[logging.Logger] = None, + file_uri: Optional[str] = None, + ) -> pd.DataFrame: """ - Opens and parses the Excel file. - - Args: - fp: File pointer to the Excel file. + Opens and parses the Excel file with Calamine-first and Openpyxl fallback. Returns: pd.DataFrame: Parsed data from the Excel file. """ - return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return] + try: + return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return] + except Exception as calamine_exc: + if logger: + logger.warning( + "Calamine parsing failed for %s, falling back to openpyxl: %s", + file_uri or "file", + str(calamine_exc), + ) + + try: + fp.seek(0) # type: ignore [union-attr] + except (AttributeError, OSError): + pass + + return pd.ExcelFile(fp, engine="openpyxl").parse() # type: ignore [arg-type, call-overload, no-any-return] diff --git a/pyproject.toml b/pyproject.toml index 70d6c8136..636f28d5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ pdf2image = { version = "1.16.3", optional = true } pyarrow = { version = "^19.0.0", optional = true } pytesseract = { version = "0.3.10", optional = true } # Used indirectly by unstructured library python-calamine = { version = "0.2.3", optional = true } # TODO: Remove if unused +openpyxl = { version = "^3.1.0", optional = true } python-snappy = { version = "0.7.3", optional = true } # TODO: remove if unused tiktoken = { version = "0.8.0", optional = true } nltk = { version = "3.9.1", optional = true } @@ -120,7 +121,7 @@ deptry = "^0.23.0" dagger-io = "0.19.0" [tool.poetry.extras] -file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "python-snappy"] +file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "openpyxl", "python-snappy"] vector-db-based = ["langchain_community", "langchain_core", "langchain_text_splitters", "openai", "cohere", "tiktoken"] sql = ["sqlalchemy"] dev = ["pytest"] From 65e8adcb7c447514ca1292eceff79a7843f3a734 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:36:17 +0000 Subject: [PATCH 02/24] chore: Update poetry.lock after adding openpyxl dependency Co-Authored-By: unknown <> --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index f746538b0..d316437a1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1277,7 +1277,7 @@ description = "An implementation of lxml.xmlfile for the standard library" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"vector-db-based\"" +markers = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")" files = [ {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"}, {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, @@ -3460,7 +3460,7 @@ description = "A Python library to read/write Excel 2010 xlsx/xlsm files" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"vector-db-based\"" +markers = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")" files = [ {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"}, {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"}, @@ -7003,7 +7003,7 @@ cffi = ["cffi (>=1.17,<2.0)", "cffi (>=2.0.0b)"] [extras] dev = ["pytest"] -file-based = ["avro", "fastavro", "markdown", "pdf2image", "pdfminer.six", "pyarrow", "pytesseract", "python-calamine", "python-snappy", "unstructured", "unstructured.pytesseract"] +file-based = ["avro", "fastavro", "markdown", "openpyxl", "pdf2image", "pdfminer.six", "pyarrow", "pytesseract", "python-calamine", "python-snappy", "unstructured", "unstructured.pytesseract"] manifest-server = ["ddtrace", "fastapi", "uvicorn"] sql = ["sqlalchemy"] vector-db-based = ["cohere", "langchain_community", "langchain_core", "langchain_text_splitters", "openai", "tiktoken"] @@ -7011,4 +7011,4 @@ vector-db-based = ["cohere", "langchain_community", "langchain_core", "langchain [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "c873e640db32d6ca60417764d97eafaa122eb155f004686b21fef4b055b58846" +content-hash = "eb792c4be3d1543876f6bff3a15f2fbc7d235ab7b49279b85089837210000e7e" From a7664ebbf88369a21e88fc2a8a28e9082a0c6649 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:38:48 +0000 Subject: [PATCH 03/24] chore: Add openpyxl to Deptry DEP002 ignore list openpyxl is loaded dynamically by pandas via engine='openpyxl' parameter, so Deptry cannot detect its usage. Adding to ignore list alongside python-calamine which has the same pattern. Co-Authored-By: unknown <> --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 636f28d5b..d0061c31a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -253,6 +253,7 @@ DEP002 = [ "cohere", "markdown", "openai", + "openpyxl", "pdf2image", "pdfminer.six", "pytesseract", From 34fe8922ce710bd158158f4cdeea3a3dcf280305 Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Fri, 14 Nov 2025 13:15:14 -0600 Subject: [PATCH 04/24] fix backoff parser, add better logging, and unit tests --- .../file_based/file_types/excel_parser.py | 72 +++++++++++++++---- airbyte_cdk/sources/file_based/remote_file.py | 5 ++ .../file_types/test_excel_parser.py | 37 ++++++++++ 3 files changed, 101 insertions(+), 13 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index d3e1be488..321a4dc7e 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -3,6 +3,7 @@ # import logging +import warnings from io import IOBase from pathlib import Path from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union @@ -64,7 +65,7 @@ async def infer_schema( fields: Dict[str, str] = {} with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: - df = self.open_and_parse_file(fp, logger, file.uri) + df = self.open_and_parse_file(fp, logger, file) for column, df_type in df.dtypes.items(): # Choose the broadest data type if the column's data type differs in dataframes prev_frame_column_type = fields.get(column) # type: ignore [call-overload] @@ -111,7 +112,7 @@ def parse_records( try: # Open and parse the file using the stream reader with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: - df = self.open_and_parse_file(fp, logger, file.uri) + df = self.open_and_parse_file(fp, logger, file) # Yield records as dictionaries # DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson # DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior @@ -184,7 +185,7 @@ def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None: def open_and_parse_file( fp: Union[IOBase, str, Path], logger: Optional[logging.Logger] = None, - file_uri: Optional[str] = None, + file_info: Optional[Union[str, RemoteFile]] = None, ) -> pd.DataFrame: """ Opens and parses the Excel file with Calamine-first and Openpyxl fallback. @@ -192,19 +193,64 @@ def open_and_parse_file( Returns: pd.DataFrame: Parsed data from the Excel file. """ + file_label = "file" + file_url = None + if isinstance(file_info, RemoteFile): + file_label = file_info.file_uri_for_logging + file_url = getattr(file_info, "url", None) + elif isinstance(file_info, str): + file_label = file_info + calamine_exc: Optional[BaseException] = None try: - return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return] - except Exception as calamine_exc: + with pd.ExcelFile(fp, engine="calamine") as excel_file: # type: ignore [arg-type, call-overload] + return excel_file.parse() # type: ignore [no-any-return] + except BaseException as exc: # noqa: BLE001 + if isinstance(exc, (KeyboardInterrupt, SystemExit)): + raise + calamine_exc = exc if logger: logger.warning( - "Calamine parsing failed for %s, falling back to openpyxl: %s", - file_uri or "file", - str(calamine_exc), + ExcelParser._format_message_with_link( + f"Calamine parsing failed for {file_label}, falling back to openpyxl: {exc}", + file_url, + ) ) - try: - fp.seek(0) # type: ignore [union-attr] - except (AttributeError, OSError): - pass + # Fallback to openpyxl + try: + fp.seek(0) # type: ignore [union-attr] + except (AttributeError, OSError): + pass - return pd.ExcelFile(fp, engine="openpyxl").parse() # type: ignore [arg-type, call-overload, no-any-return] + try: + with warnings.catch_warnings(record=True) as warning_records: + warnings.simplefilter("always") + with pd.ExcelFile(fp, engine="openpyxl") as excel_file: # type: ignore [arg-type, call-overload] + df = excel_file.parse() # type: ignore [no-any-return] + if logger: + for warning in warning_records: + logger.warning( + ExcelParser._format_message_with_link( + f"Openpyxl warning for {file_label}: {warning.message}", + file_url, + ) + ) + return df + except BaseException as openpyxl_exc: # noqa: BLE001 + if isinstance(openpyxl_exc, (KeyboardInterrupt, SystemExit)): + raise + # If both engines fail, raise the original calamine exception + if logger: + logger.error( + ExcelParser._format_message_with_link( + f"Both Calamine and Openpyxl parsing failed for {file_label}. Calamine error: {calamine_exc}, Openpyxl error: {openpyxl_exc}", + file_url, + ) + ) + raise calamine_exc if calamine_exc else openpyxl_exc + + @staticmethod + def _format_message_with_link(message: str, file_url: Optional[str]) -> str: + if file_url: + return f"{message} (view: {file_url})" + return message diff --git a/airbyte_cdk/sources/file_based/remote_file.py b/airbyte_cdk/sources/file_based/remote_file.py index b2bfb23f5..4f2ab59c6 100644 --- a/airbyte_cdk/sources/file_based/remote_file.py +++ b/airbyte_cdk/sources/file_based/remote_file.py @@ -17,6 +17,11 @@ class RemoteFile(BaseModel): last_modified: datetime mime_type: Optional[str] = None + @property + def file_uri_for_logging(self) -> str: + """Returns a user-friendly identifier for logging.""" + return self.uri + class UploadableRemoteFile(RemoteFile, ABC): """ diff --git a/unit_tests/sources/file_based/file_types/test_excel_parser.py b/unit_tests/sources/file_based/file_types/test_excel_parser.py index f744e9e8a..286793b73 100644 --- a/unit_tests/sources/file_based/file_types/test_excel_parser.py +++ b/unit_tests/sources/file_based/file_types/test_excel_parser.py @@ -4,6 +4,7 @@ import datetime +import warnings from io import BytesIO from unittest.mock import MagicMock, Mock, mock_open, patch @@ -136,3 +137,39 @@ def test_file_read_error(mock_stream_reader, mock_logger, file_config, remote_fi list( parser.parse_records(file_config, remote_file, mock_stream_reader, mock_logger) ) + + +class FakePanic(BaseException): + """Simulates the PyO3 PanicException which does not inherit from Exception.""" + + +def test_open_and_parse_file_falls_back_to_openpyxl(mock_logger): + parser = ExcelParser() + fp = BytesIO(b"test") + + fallback_df = pd.DataFrame({"a": [1]}) + + calamine_ctx = MagicMock() + calamine_excel_file = MagicMock() + calamine_ctx.__enter__.return_value = calamine_excel_file + calamine_excel_file.parse.side_effect = FakePanic("calamine panic") + + openpyxl_ctx = MagicMock() + openpyxl_excel_file = MagicMock() + openpyxl_ctx.__enter__.return_value = openpyxl_excel_file + + def openpyxl_parse_side_effect(): + warnings.warn("Cell A146 has invalid date", UserWarning) + return fallback_df + + openpyxl_excel_file.parse.side_effect = openpyxl_parse_side_effect + + with patch("airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile") as mock_excel: + mock_excel.side_effect = [calamine_ctx, openpyxl_ctx] + + result = parser.open_and_parse_file(fp, mock_logger, "file.xlsx") + + pd.testing.assert_frame_equal(result, fallback_df) + assert mock_logger.warning.call_count == 2 + assert "Openpyxl warning" in mock_logger.warning.call_args_list[1].args[0] + mock_logger.error.assert_not_called() From adfe576153634e1ba153146b90917bbe9f165513 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 14 Nov 2025 19:22:24 +0000 Subject: [PATCH 05/24] refactor: Improve exception handling in Excel parser - Change BaseException to Exception in both except blocks - Remove manual KeyboardInterrupt/SystemExit re-raise (now propagates naturally) - Add explanatory comment for empty except block when seeking file pointer Addresses code quality bot feedback on PR airbytehq/airbyte-python-cdk#850 Co-Authored-By: unknown <> --- .../sources/file_based/file_types/excel_parser.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index 321a4dc7e..dd4bda41e 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -200,13 +200,11 @@ def open_and_parse_file( file_url = getattr(file_info, "url", None) elif isinstance(file_info, str): file_label = file_info - calamine_exc: Optional[BaseException] = None + calamine_exc: Optional[Exception] = None try: with pd.ExcelFile(fp, engine="calamine") as excel_file: # type: ignore [arg-type, call-overload] return excel_file.parse() # type: ignore [no-any-return] - except BaseException as exc: # noqa: BLE001 - if isinstance(exc, (KeyboardInterrupt, SystemExit)): - raise + except Exception as exc: calamine_exc = exc if logger: logger.warning( @@ -220,6 +218,7 @@ def open_and_parse_file( try: fp.seek(0) # type: ignore [union-attr] except (AttributeError, OSError): + # Some file-like objects may not be seekable; attempt openpyxl parsing anyway pass try: @@ -236,9 +235,7 @@ def open_and_parse_file( ) ) return df - except BaseException as openpyxl_exc: # noqa: BLE001 - if isinstance(openpyxl_exc, (KeyboardInterrupt, SystemExit)): - raise + except Exception as openpyxl_exc: # If both engines fail, raise the original calamine exception if logger: logger.error( From 88084addb6bce16c8e91caee4e6eef1079cf123e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 14 Nov 2025 19:27:28 +0000 Subject: [PATCH 06/24] fix: Add sheet_name=0 to ExcelFile.parse() calls to satisfy MyPy type stubs - Explicitly specify sheet_name=0 in both Calamine and Openpyxl parse calls - Behavior unchanged: pandas defaults to first sheet (index 0) when no sheet_name provided - Resolves MyPy call-overload and no-any-return errors - ExcelFormat has no sheet selection parameter, so defaulting to first sheet is correct Co-Authored-By: unknown <> --- airbyte_cdk/sources/file_based/file_types/excel_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index dd4bda41e..fbb79fad7 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -203,7 +203,7 @@ def open_and_parse_file( calamine_exc: Optional[Exception] = None try: with pd.ExcelFile(fp, engine="calamine") as excel_file: # type: ignore [arg-type, call-overload] - return excel_file.parse() # type: ignore [no-any-return] + return excel_file.parse(sheet_name=0) # type: ignore [no-any-return] except Exception as exc: calamine_exc = exc if logger: @@ -225,7 +225,7 @@ def open_and_parse_file( with warnings.catch_warnings(record=True) as warning_records: warnings.simplefilter("always") with pd.ExcelFile(fp, engine="openpyxl") as excel_file: # type: ignore [arg-type, call-overload] - df = excel_file.parse() # type: ignore [no-any-return] + df = excel_file.parse(sheet_name=0) # type: ignore [no-any-return] if logger: for warning in warning_records: logger.warning( From 546bd4635c2f3a23a8feb75f485ca1accf8c965a Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 14 Nov 2025 19:40:56 +0000 Subject: [PATCH 07/24] fix: Implement two-tier exception handling for Calamine panics and update test - Add two-tier exception handling: catch Exception first, then BaseException - PyO3 PanicException from Calamine inherits from BaseException, not Exception - Keep targeted BLE001 suppression with explanatory comment - Re-raise KeyboardInterrupt/SystemExit in BaseException handler - Update calamine_exc type to Optional[BaseException] for MyPy - Update test mocks to accept sheet_name parameter - Verified: test passes and MyPy succeeds locally This preserves the functional requirement to catch Calamine panics while following Python best practices for normal exception handling. Co-Authored-By: unknown <> --- .../sources/file_based/file_types/excel_parser.py | 14 +++++++++++++- .../file_based/file_types/test_excel_parser.py | 8 ++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index fbb79fad7..0142e3b3e 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -200,7 +200,7 @@ def open_and_parse_file( file_url = getattr(file_info, "url", None) elif isinstance(file_info, str): file_label = file_info - calamine_exc: Optional[Exception] = None + calamine_exc: Optional[BaseException] = None try: with pd.ExcelFile(fp, engine="calamine") as excel_file: # type: ignore [arg-type, call-overload] return excel_file.parse(sheet_name=0) # type: ignore [no-any-return] @@ -213,6 +213,18 @@ def open_and_parse_file( file_url, ) ) + except BaseException as exc: # noqa: BLE001 + # PyO3 PanicException from Calamine inherits from BaseException, not Exception + if isinstance(exc, (KeyboardInterrupt, SystemExit)): + raise + calamine_exc = exc + if logger: + logger.warning( + ExcelParser._format_message_with_link( + f"Calamine parsing failed for {file_label}, falling back to openpyxl: {exc}", + file_url, + ) + ) # Fallback to openpyxl try: diff --git a/unit_tests/sources/file_based/file_types/test_excel_parser.py b/unit_tests/sources/file_based/file_types/test_excel_parser.py index 286793b73..04049bfa3 100644 --- a/unit_tests/sources/file_based/file_types/test_excel_parser.py +++ b/unit_tests/sources/file_based/file_types/test_excel_parser.py @@ -152,13 +152,17 @@ def test_open_and_parse_file_falls_back_to_openpyxl(mock_logger): calamine_ctx = MagicMock() calamine_excel_file = MagicMock() calamine_ctx.__enter__.return_value = calamine_excel_file - calamine_excel_file.parse.side_effect = FakePanic("calamine panic") + + def calamine_parse_side_effect(sheet_name=None): + raise FakePanic("calamine panic") + + calamine_excel_file.parse.side_effect = calamine_parse_side_effect openpyxl_ctx = MagicMock() openpyxl_excel_file = MagicMock() openpyxl_ctx.__enter__.return_value = openpyxl_excel_file - def openpyxl_parse_side_effect(): + def openpyxl_parse_side_effect(sheet_name=None): warnings.warn("Cell A146 has invalid date", UserWarning) return fallback_df From 67fa6978fd430b12af59cdbdc638880c3492f1c4 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 14 Nov 2025 19:42:46 +0000 Subject: [PATCH 08/24] style: Fix Ruff formatting - remove trailing whitespace in test file Co-Authored-By: unknown <> --- unit_tests/sources/file_based/file_types/test_excel_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unit_tests/sources/file_based/file_types/test_excel_parser.py b/unit_tests/sources/file_based/file_types/test_excel_parser.py index 04049bfa3..5d881e4ff 100644 --- a/unit_tests/sources/file_based/file_types/test_excel_parser.py +++ b/unit_tests/sources/file_based/file_types/test_excel_parser.py @@ -152,10 +152,10 @@ def test_open_and_parse_file_falls_back_to_openpyxl(mock_logger): calamine_ctx = MagicMock() calamine_excel_file = MagicMock() calamine_ctx.__enter__.return_value = calamine_excel_file - + def calamine_parse_side_effect(sheet_name=None): raise FakePanic("calamine panic") - + calamine_excel_file.parse.side_effect = calamine_parse_side_effect openpyxl_ctx = MagicMock() From e431f9d910d6f9037849f88ce204b868a2ace73c Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 16:08:18 +0000 Subject: [PATCH 09/24] refactor: Remove duplicate file_uri_for_logging property from UploadableRemoteFile The file_uri_for_logging property is now inherited from the parent RemoteFile class, so the duplicate implementation in UploadableRemoteFile is no longer needed. Addresses PR comment from @darynaishchenko Co-Authored-By: unknown <> --- airbyte_cdk/sources/file_based/remote_file.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/airbyte_cdk/sources/file_based/remote_file.py b/airbyte_cdk/sources/file_based/remote_file.py index 4f2ab59c6..1e3eec724 100644 --- a/airbyte_cdk/sources/file_based/remote_file.py +++ b/airbyte_cdk/sources/file_based/remote_file.py @@ -53,10 +53,3 @@ def source_file_relative_path(self) -> str: Returns the relative path of the source file. """ return self.uri - - @property - def file_uri_for_logging(self) -> str: - """ - Returns the URI for the file being logged. - """ - return self.uri From a82a2fa4dcbfd7d4758987af440ee7336826cc4c Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 16:40:41 +0000 Subject: [PATCH 10/24] feat: Add ExcelCalamineParsingError exception for Excel parser fallback Adds a new custom exception ExcelCalamineParsingError that inherits from BaseFileBasedSourceError. This exception is raised when the Calamine engine fails to parse an Excel file, triggering the fallback to Openpyxl. Addresses PR comment from @darynaishchenko Co-Authored-By: unknown <> --- airbyte_cdk/sources/file_based/exceptions.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/airbyte_cdk/sources/file_based/exceptions.py b/airbyte_cdk/sources/file_based/exceptions.py index c75f3257f..cd727463a 100644 --- a/airbyte_cdk/sources/file_based/exceptions.py +++ b/airbyte_cdk/sources/file_based/exceptions.py @@ -92,6 +92,12 @@ class RecordParseError(BaseFileBasedSourceError): pass +class ExcelCalamineParsingError(BaseFileBasedSourceError): + """Raised when Calamine engine fails to parse an Excel file.""" + + pass + + class SchemaInferenceError(BaseFileBasedSourceError): pass From 6a38d553a6d7dd03280429431d7db95189cbfb6a Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 16:41:04 +0000 Subject: [PATCH 11/24] refactor: Separate Excel parsing logic into three focused methods Refactors open_and_parse_file() to improve code organization and maintainability by separating concerns into three methods: 1. _open_and_parse_file_with_calamine(): Handles Calamine parsing, catches all exceptions (including PyO3 PanicException which inherits from BaseException), logs warning, and raises ExcelCalamineParsingError on failure. 2. _open_and_parse_file_with_openpyxl(): Handles Openpyxl parsing with warning capture and logging. 3. open_and_parse_file(): Orchestrates the fallback logic - tries Calamine first, falls back to Openpyxl on ExcelCalamineParsingError, handles file pointer seeking between attempts. Additional changes: - Removed Optional from logger and file_info parameters (always provided) - Changed file_info type from Union[str, RemoteFile] to RemoteFile (always RemoteFile) - Simplified file_url logic to use only file_info.file_uri_for_logging - Updated test to match new method signatures and mock structure - Uses exception chaining (raise ... from exc) for better error traceability Addresses PR comments from @darynaishchenko (Comments 14, 15, 17) Co-Authored-By: unknown <> --- .../file_based/file_types/excel_parser.py | 138 +++++++++--------- .../file_types/test_excel_parser.py | 14 +- 2 files changed, 77 insertions(+), 75 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index 0142e3b3e..d6395d320 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -18,6 +18,7 @@ ) from airbyte_cdk.sources.file_based.exceptions import ( ConfigValidationError, + ExcelCalamineParsingError, FileBasedSourceError, RecordParseError, ) @@ -182,84 +183,89 @@ def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None: raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) @staticmethod - def open_and_parse_file( + def _open_and_parse_file_with_calamine( fp: Union[IOBase, str, Path], - logger: Optional[logging.Logger] = None, - file_info: Optional[Union[str, RemoteFile]] = None, + logger: logging.Logger, + file_info: RemoteFile, ) -> pd.DataFrame: - """ - Opens and parses the Excel file with Calamine-first and Openpyxl fallback. + """Opens and parses Excel file using Calamine engine. + + Args: + fp: File pointer to the Excel file. + logger: Logger for logging information and errors. + file_info: Remote file information for logging context. Returns: pd.DataFrame: Parsed data from the Excel file. + + Raises: + ExcelCalamineParsingError: If Calamine fails to parse the file. """ - file_label = "file" - file_url = None - if isinstance(file_info, RemoteFile): - file_label = file_info.file_uri_for_logging - file_url = getattr(file_info, "url", None) - elif isinstance(file_info, str): - file_label = file_info - calamine_exc: Optional[BaseException] = None try: - with pd.ExcelFile(fp, engine="calamine") as excel_file: # type: ignore [arg-type, call-overload] - return excel_file.parse(sheet_name=0) # type: ignore [no-any-return] - except Exception as exc: - calamine_exc = exc - if logger: - logger.warning( - ExcelParser._format_message_with_link( - f"Calamine parsing failed for {file_label}, falling back to openpyxl: {exc}", - file_url, - ) - ) + return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return] except BaseException as exc: # noqa: BLE001 # PyO3 PanicException from Calamine inherits from BaseException, not Exception if isinstance(exc, (KeyboardInterrupt, SystemExit)): raise - calamine_exc = exc - if logger: - logger.warning( - ExcelParser._format_message_with_link( - f"Calamine parsing failed for {file_label}, falling back to openpyxl: {exc}", - file_url, - ) - ) + logger.warning( + f"Calamine parsing failed for {file_info.file_uri_for_logging}, falling back to openpyxl: {exc}" + ) + raise ExcelCalamineParsingError( + f"Calamine engine failed to parse {file_info.file_uri_for_logging}", + filename=file_info.uri, + ) from exc - # Fallback to openpyxl - try: - fp.seek(0) # type: ignore [union-attr] - except (AttributeError, OSError): - # Some file-like objects may not be seekable; attempt openpyxl parsing anyway - pass + @staticmethod + def _open_and_parse_file_with_openpyxl( + fp: Union[IOBase, str, Path], + logger: logging.Logger, + file_info: RemoteFile, + ) -> pd.DataFrame: + """Opens and parses Excel file using Openpyxl engine. - try: - with warnings.catch_warnings(record=True) as warning_records: - warnings.simplefilter("always") - with pd.ExcelFile(fp, engine="openpyxl") as excel_file: # type: ignore [arg-type, call-overload] - df = excel_file.parse(sheet_name=0) # type: ignore [no-any-return] - if logger: - for warning in warning_records: - logger.warning( - ExcelParser._format_message_with_link( - f"Openpyxl warning for {file_label}: {warning.message}", - file_url, - ) - ) - return df - except Exception as openpyxl_exc: - # If both engines fail, raise the original calamine exception - if logger: - logger.error( - ExcelParser._format_message_with_link( - f"Both Calamine and Openpyxl parsing failed for {file_label}. Calamine error: {calamine_exc}, Openpyxl error: {openpyxl_exc}", - file_url, - ) - ) - raise calamine_exc if calamine_exc else openpyxl_exc + Args: + fp: File pointer to the Excel file. + logger: Logger for logging information and errors. + file_info: Remote file information for logging context. + + Returns: + pd.DataFrame: Parsed data from the Excel file. + """ + with warnings.catch_warnings(record=True) as warning_records: + warnings.simplefilter("always") + df = pd.ExcelFile(fp, engine="openpyxl").parse() # type: ignore [arg-type, call-overload] + + for warning in warning_records: + logger.warning( + f"Openpyxl warning for {file_info.file_uri_for_logging}: {warning.message}" + ) + + return df # type: ignore [no-any-return] @staticmethod - def _format_message_with_link(message: str, file_url: Optional[str]) -> str: - if file_url: - return f"{message} (view: {file_url})" - return message + def open_and_parse_file( + fp: Union[IOBase, str, Path], + logger: logging.Logger, + file_info: RemoteFile, + ) -> pd.DataFrame: + """Opens and parses the Excel file with Calamine-first and Openpyxl fallback. + + Args: + fp: File pointer to the Excel file. + logger: Logger for logging information and errors. + file_info: Remote file information for logging context. + + Returns: + pd.DataFrame: Parsed data from the Excel file. + """ + try: + return ExcelParser._open_and_parse_file_with_calamine(fp, logger, file_info) + except ExcelCalamineParsingError: + # Fallback to openpyxl + try: + fp.seek(0) # type: ignore [union-attr] + except (AttributeError, OSError): + # Some file-like objects may not be seekable; attempt openpyxl parsing anyway + pass + + return ExcelParser._open_and_parse_file_with_openpyxl(fp, logger, file_info) diff --git a/unit_tests/sources/file_based/file_types/test_excel_parser.py b/unit_tests/sources/file_based/file_types/test_excel_parser.py index 5d881e4ff..c8415ebc2 100644 --- a/unit_tests/sources/file_based/file_types/test_excel_parser.py +++ b/unit_tests/sources/file_based/file_types/test_excel_parser.py @@ -146,34 +146,30 @@ class FakePanic(BaseException): def test_open_and_parse_file_falls_back_to_openpyxl(mock_logger): parser = ExcelParser() fp = BytesIO(b"test") + remote_file = RemoteFile(uri="s3://mybucket/test.xlsx", last_modified=datetime.datetime.now()) fallback_df = pd.DataFrame({"a": [1]}) - calamine_ctx = MagicMock() calamine_excel_file = MagicMock() - calamine_ctx.__enter__.return_value = calamine_excel_file - def calamine_parse_side_effect(sheet_name=None): + def calamine_parse_side_effect(): raise FakePanic("calamine panic") calamine_excel_file.parse.side_effect = calamine_parse_side_effect - openpyxl_ctx = MagicMock() openpyxl_excel_file = MagicMock() - openpyxl_ctx.__enter__.return_value = openpyxl_excel_file - def openpyxl_parse_side_effect(sheet_name=None): + def openpyxl_parse_side_effect(): warnings.warn("Cell A146 has invalid date", UserWarning) return fallback_df openpyxl_excel_file.parse.side_effect = openpyxl_parse_side_effect with patch("airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile") as mock_excel: - mock_excel.side_effect = [calamine_ctx, openpyxl_ctx] + mock_excel.side_effect = [calamine_excel_file, openpyxl_excel_file] - result = parser.open_and_parse_file(fp, mock_logger, "file.xlsx") + result = parser.open_and_parse_file(fp, mock_logger, remote_file) pd.testing.assert_frame_equal(result, fallback_df) assert mock_logger.warning.call_count == 2 assert "Openpyxl warning" in mock_logger.warning.call_args_list[1].args[0] - mock_logger.error.assert_not_called() From fef9ac2e0db8a5d18acefa3db5dff034d51fd1b7 Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Thu, 20 Nov 2025 17:23:59 -0600 Subject: [PATCH 12/24] Add check for CALAMINE_PANIC_EXCEPTIONS --- .../file_based/file_types/excel_parser.py | 19 ++++++++++++++----- .../file_types/test_excel_parser.py | 19 ++++++++++++++++++- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index d6395d320..721c78e36 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -6,7 +6,7 @@ import warnings from io import IOBase from pathlib import Path -from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union +from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Type, Union, cast import orjson import pandas as pd @@ -30,6 +30,17 @@ from airbyte_cdk.sources.file_based.remote_file import RemoteFile from airbyte_cdk.sources.file_based.schema_helpers import SchemaType +try: # pragma: no cover - evaluated at import time + from pyo3_runtime import PanicException as _PyO3PanicException # type: ignore[import-not-found] +except ImportError: # pragma: no cover - optional dependency is not installed + _PyO3PanicException = None # type: ignore[assignment] + +CALAMINE_PANIC_EXCEPTIONS: Tuple[Type[BaseException], ...] = ( + (cast(Type[BaseException], _PyO3PanicException),) + if _PyO3PanicException is not None + else () +) + class ExcelParser(FileTypeParser): ENCODING = None @@ -201,12 +212,10 @@ def _open_and_parse_file_with_calamine( Raises: ExcelCalamineParsingError: If Calamine fails to parse the file. """ + handled_exceptions: Tuple[Type[BaseException], ...] = CALAMINE_PANIC_EXCEPTIONS + (Exception,) try: return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return] - except BaseException as exc: # noqa: BLE001 - # PyO3 PanicException from Calamine inherits from BaseException, not Exception - if isinstance(exc, (KeyboardInterrupt, SystemExit)): - raise + except handled_exceptions as exc: # type: ignore[misc] logger.warning( f"Calamine parsing failed for {file_info.file_uri_for_logging}, falling back to openpyxl: {exc}" ) diff --git a/unit_tests/sources/file_based/file_types/test_excel_parser.py b/unit_tests/sources/file_based/file_types/test_excel_parser.py index c8415ebc2..81b331975 100644 --- a/unit_tests/sources/file_based/file_types/test_excel_parser.py +++ b/unit_tests/sources/file_based/file_types/test_excel_parser.py @@ -165,7 +165,10 @@ def openpyxl_parse_side_effect(): openpyxl_excel_file.parse.side_effect = openpyxl_parse_side_effect - with patch("airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile") as mock_excel: + with patch( + "airbyte_cdk.sources.file_based.file_types.excel_parser.CALAMINE_PANIC_EXCEPTIONS", + (FakePanic,), + ), patch("airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile") as mock_excel: mock_excel.side_effect = [calamine_excel_file, openpyxl_excel_file] result = parser.open_and_parse_file(fp, mock_logger, remote_file) @@ -173,3 +176,17 @@ def openpyxl_parse_side_effect(): pd.testing.assert_frame_equal(result, fallback_df) assert mock_logger.warning.call_count == 2 assert "Openpyxl warning" in mock_logger.warning.call_args_list[1].args[0] + + +def test_open_and_parse_file_does_not_swallow_keyboard_interrupt(mock_logger): + parser = ExcelParser() + fp = BytesIO(b"test") + remote_file = RemoteFile(uri="s3://mybucket/test.xlsx", last_modified=datetime.datetime.now()) + + with patch( + "airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile" + ) as mock_excel: + mock_excel.return_value.parse.side_effect = KeyboardInterrupt() + + with pytest.raises(KeyboardInterrupt): + parser.open_and_parse_file(fp, mock_logger, remote_file) From fd1939ebc9b58dbeaf750ff90c62c08a468adab6 Mon Sep 17 00:00:00 2001 From: darynaishchenko Date: Fri, 21 Nov 2025 12:39:40 +0200 Subject: [PATCH 13/24] update error handling --- .../file_based/file_types/excel_parser.py | 33 ++++++++----------- .../file_types/test_excel_parser.py | 15 ++++----- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index 721c78e36..30da7573e 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -30,17 +30,6 @@ from airbyte_cdk.sources.file_based.remote_file import RemoteFile from airbyte_cdk.sources.file_based.schema_helpers import SchemaType -try: # pragma: no cover - evaluated at import time - from pyo3_runtime import PanicException as _PyO3PanicException # type: ignore[import-not-found] -except ImportError: # pragma: no cover - optional dependency is not installed - _PyO3PanicException = None # type: ignore[assignment] - -CALAMINE_PANIC_EXCEPTIONS: Tuple[Type[BaseException], ...] = ( - (cast(Type[BaseException], _PyO3PanicException),) - if _PyO3PanicException is not None - else () -) - class ExcelParser(FileTypeParser): ENCODING = None @@ -212,17 +201,21 @@ def _open_and_parse_file_with_calamine( Raises: ExcelCalamineParsingError: If Calamine fails to parse the file. """ - handled_exceptions: Tuple[Type[BaseException], ...] = CALAMINE_PANIC_EXCEPTIONS + (Exception,) try: return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return] - except handled_exceptions as exc: # type: ignore[misc] - logger.warning( - f"Calamine parsing failed for {file_info.file_uri_for_logging}, falling back to openpyxl: {exc}" - ) - raise ExcelCalamineParsingError( - f"Calamine engine failed to parse {file_info.file_uri_for_logging}", - filename=file_info.uri, - ) from exc + except BaseException as exc: + # Calamine engine raises PanicException(child of BaseException) if Calamine fails to parse the file. + # Checking if ValueError in exception arg to know if it was actually an error during parsing due to invalid values in cells. + # Otherwise, raise an exception. + if "ValueError" in str(exc): + logger.warning( + f"Calamine parsing failed for {file_info.file_uri_for_logging}, falling back to openpyxl: {exc}" + ) + raise ExcelCalamineParsingError( + f"Calamine engine failed to parse {file_info.file_uri_for_logging}", + filename=file_info.uri, + ) from exc + raise exc @staticmethod def _open_and_parse_file_with_openpyxl( diff --git a/unit_tests/sources/file_based/file_types/test_excel_parser.py b/unit_tests/sources/file_based/file_types/test_excel_parser.py index 81b331975..7db1aa2d2 100644 --- a/unit_tests/sources/file_based/file_types/test_excel_parser.py +++ b/unit_tests/sources/file_based/file_types/test_excel_parser.py @@ -153,7 +153,9 @@ def test_open_and_parse_file_falls_back_to_openpyxl(mock_logger): calamine_excel_file = MagicMock() def calamine_parse_side_effect(): - raise FakePanic("calamine panic") + raise FakePanic( + "failed to construct date: PyErr { type: , value: ValueError('year 20225 is out of range'), traceback: None }" + ) calamine_excel_file.parse.side_effect = calamine_parse_side_effect @@ -165,10 +167,9 @@ def openpyxl_parse_side_effect(): openpyxl_excel_file.parse.side_effect = openpyxl_parse_side_effect - with patch( - "airbyte_cdk.sources.file_based.file_types.excel_parser.CALAMINE_PANIC_EXCEPTIONS", - (FakePanic,), - ), patch("airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile") as mock_excel: + with ( + patch("airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile") as mock_excel, + ): mock_excel.side_effect = [calamine_excel_file, openpyxl_excel_file] result = parser.open_and_parse_file(fp, mock_logger, remote_file) @@ -183,9 +184,7 @@ def test_open_and_parse_file_does_not_swallow_keyboard_interrupt(mock_logger): fp = BytesIO(b"test") remote_file = RemoteFile(uri="s3://mybucket/test.xlsx", last_modified=datetime.datetime.now()) - with patch( - "airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile" - ) as mock_excel: + with patch("airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile") as mock_excel: mock_excel.return_value.parse.side_effect = KeyboardInterrupt() with pytest.raises(KeyboardInterrupt): From d2f691a1bad07eb1fab355398a801c79337b0e62 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 21 Nov 2025 20:22:06 +0000 Subject: [PATCH 14/24] refactor: Convert static methods to instance methods and update test Address reviewer feedback from @darynaishchenko: - Comment 27: Remove @staticmethod decorators from all three parsing methods - Update method calls to use self instead of ExcelParser class name - Comment 29: Change test from KeyboardInterrupt to SystemExit - Add explanatory docstring to test about why it's needed All changes verified locally: - MyPy passes - Unit tests pass (4 passed, 1 skipped) - Ruff format and lint pass Co-Authored-By: unknown <> --- .../sources/file_based/file_types/excel_parser.py | 10 +++++----- .../file_based/file_types/test_excel_parser.py | 12 +++++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index 30da7573e..d38b6247d 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -182,8 +182,8 @@ def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None: logger.info(f"Expected ExcelFormat, got {excel_format}") raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) - @staticmethod def _open_and_parse_file_with_calamine( + self, fp: Union[IOBase, str, Path], logger: logging.Logger, file_info: RemoteFile, @@ -217,8 +217,8 @@ def _open_and_parse_file_with_calamine( ) from exc raise exc - @staticmethod def _open_and_parse_file_with_openpyxl( + self, fp: Union[IOBase, str, Path], logger: logging.Logger, file_info: RemoteFile, @@ -244,8 +244,8 @@ def _open_and_parse_file_with_openpyxl( return df # type: ignore [no-any-return] - @staticmethod def open_and_parse_file( + self, fp: Union[IOBase, str, Path], logger: logging.Logger, file_info: RemoteFile, @@ -261,7 +261,7 @@ def open_and_parse_file( pd.DataFrame: Parsed data from the Excel file. """ try: - return ExcelParser._open_and_parse_file_with_calamine(fp, logger, file_info) + return self._open_and_parse_file_with_calamine(fp, logger, file_info) except ExcelCalamineParsingError: # Fallback to openpyxl try: @@ -270,4 +270,4 @@ def open_and_parse_file( # Some file-like objects may not be seekable; attempt openpyxl parsing anyway pass - return ExcelParser._open_and_parse_file_with_openpyxl(fp, logger, file_info) + return self._open_and_parse_file_with_openpyxl(fp, logger, file_info) diff --git a/unit_tests/sources/file_based/file_types/test_excel_parser.py b/unit_tests/sources/file_based/file_types/test_excel_parser.py index 7db1aa2d2..a767dd41e 100644 --- a/unit_tests/sources/file_based/file_types/test_excel_parser.py +++ b/unit_tests/sources/file_based/file_types/test_excel_parser.py @@ -179,13 +179,19 @@ def openpyxl_parse_side_effect(): assert "Openpyxl warning" in mock_logger.warning.call_args_list[1].args[0] -def test_open_and_parse_file_does_not_swallow_keyboard_interrupt(mock_logger): +def test_open_and_parse_file_does_not_swallow_system_exit(mock_logger): + """Test that SystemExit is not caught by the BaseException handler. + + This test ensures that critical system-level exceptions like SystemExit and KeyboardInterrupt + are not accidentally caught and suppressed by our BaseException handler in the Calamine parsing + method. These exceptions should always propagate up to allow proper program termination. + """ parser = ExcelParser() fp = BytesIO(b"test") remote_file = RemoteFile(uri="s3://mybucket/test.xlsx", last_modified=datetime.datetime.now()) with patch("airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile") as mock_excel: - mock_excel.return_value.parse.side_effect = KeyboardInterrupt() + mock_excel.return_value.parse.side_effect = SystemExit() - with pytest.raises(KeyboardInterrupt): + with pytest.raises(SystemExit): parser.open_and_parse_file(fp, mock_logger, remote_file) From 63d24a6e26d1ac8c0f056fb4947cecaf17e4f36c Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 10:35:54 +0000 Subject: [PATCH 15/24] refactor: Move seek logic into _open_and_parse_file_with_openpyxl Address reviewer feedback from @darynaishchenko (Comment 2555661755): - Move fp.seek(0) try/except block into _open_and_parse_file_with_openpyxl - Add info-level logging for seek failures instead of silent pass - Remove duplicate seek logic from open_and_parse_file orchestration method - Add hasattr check to avoid AttributeError on non-file-like objects - Simplify orchestration method to focus purely on flow control This centralizes fallback-specific concerns within the openpyxl path and makes the behavior easier to test and reason about. All local checks pass: - Unit tests pass (4 passed, 1 skipped) - MyPy type checking passes - Ruff format and lint pass Co-Authored-By: unknown <> --- .../file_based/file_types/excel_parser.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index d38b6247d..9ee8f362d 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -233,6 +233,16 @@ def _open_and_parse_file_with_openpyxl( Returns: pd.DataFrame: Parsed data from the Excel file. """ + # Some file-like objects are not seekable. + if hasattr(fp, "seek"): + try: + fp.seek(0) # type: ignore [union-attr] + except (AttributeError, OSError) as exc: + logger.info( + f"Could not rewind stream for {file_info.file_uri_for_logging}; " + f"proceeding with openpyxl from current position: {exc}" + ) + with warnings.catch_warnings(record=True) as warning_records: warnings.simplefilter("always") df = pd.ExcelFile(fp, engine="openpyxl").parse() # type: ignore [arg-type, call-overload] @@ -263,11 +273,4 @@ def open_and_parse_file( try: return self._open_and_parse_file_with_calamine(fp, logger, file_info) except ExcelCalamineParsingError: - # Fallback to openpyxl - try: - fp.seek(0) # type: ignore [union-attr] - except (AttributeError, OSError): - # Some file-like objects may not be seekable; attempt openpyxl parsing anyway - pass - return self._open_and_parse_file_with_openpyxl(fp, logger, file_info) From 44f7df1193b1764d4b6a9b2cf321622dc0137041 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 15:54:25 +0000 Subject: [PATCH 16/24] test: Add parametrized test for non-seekable files in openpyxl fallback Address reviewer feedback from @agarctfi (Comment 2556819331): - Add parametrized test covering both AttributeError and OSError cases - Test verifies info log is emitted when seek fails on non-seekable files - Test confirms parsing proceeds from current position when rewind fails - Uses FakeFP class with seek method that raises the desired exception Test coverage: - test_openpyxl_logs_info_when_seek_fails[attribute-error] - test_openpyxl_logs_info_when_seek_fails[os-error] All local checks pass: - Unit tests pass (6 passed, 1 skipped) - Ruff format and lint pass Co-Authored-By: unknown <> --- .../file_types/test_excel_parser.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/unit_tests/sources/file_based/file_types/test_excel_parser.py b/unit_tests/sources/file_based/file_types/test_excel_parser.py index a767dd41e..ce557d649 100644 --- a/unit_tests/sources/file_based/file_types/test_excel_parser.py +++ b/unit_tests/sources/file_based/file_types/test_excel_parser.py @@ -195,3 +195,47 @@ def test_open_and_parse_file_does_not_swallow_system_exit(mock_logger): with pytest.raises(SystemExit): parser.open_and_parse_file(fp, mock_logger, remote_file) + + +@pytest.mark.parametrize( + "exc_cls", + [ + pytest.param(AttributeError, id="attribute-error"), + pytest.param(OSError, id="os-error"), + ], +) +def test_openpyxl_logs_info_when_seek_fails(mock_logger, remote_file, exc_cls): + """Test that openpyxl logs info when seek fails on non-seekable files. + + This test ensures that when falling back to openpyxl, if the file pointer + cannot be rewound (seek fails with AttributeError or OSError), an info-level + log is emitted and parsing proceeds from the current position. + """ + parser = ExcelParser() + fallback_df = pd.DataFrame({"a": [1]}) + + class FakeFP: + """Fake file-like object with a seek method that raises an exception.""" + + def __init__(self, exc): + self._exc = exc + + def seek(self, *args, **kwargs): + raise self._exc("not seekable") + + fp = FakeFP(exc_cls) + + openpyxl_excel_file = MagicMock() + openpyxl_excel_file.parse.return_value = fallback_df + + with patch("airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile") as mock_excel: + mock_excel.return_value = openpyxl_excel_file + + result = parser._open_and_parse_file_with_openpyxl(fp, mock_logger, remote_file) + + pd.testing.assert_frame_equal(result, fallback_df) + mock_logger.info.assert_called_once() + msg = mock_logger.info.call_args[0][0] + assert "Could not rewind stream" in msg + assert remote_file.file_uri_for_logging in msg + mock_excel.assert_called_once_with(fp, engine="openpyxl") From 49f3e196e229ab56ca1f743e14b93170f858f7da Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 17:19:37 +0000 Subject: [PATCH 17/24] refactor: Narrow exception handling to OSError only in seek logic Address reviewer feedback from @darynaishchenko (Comment 2557109671): - Remove AttributeError from except clause since hasattr(fp, 'seek') guard already handles the case where seek attribute doesn't exist - After hasattr check passes, only OSError (including io.UnsupportedOperation) can be raised by seek() on non-seekable streams - Update parametrized test to only test OSError case, removing AttributeError case This makes the exception handling more precise and eliminates redundant catching. All local checks pass: - Unit tests pass (5 passed, 1 skipped) - Ruff format and lint pass Co-Authored-By: unknown <> --- airbyte_cdk/sources/file_based/file_types/excel_parser.py | 2 +- .../sources/file_based/file_types/test_excel_parser.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index 9ee8f362d..22c501660 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -237,7 +237,7 @@ def _open_and_parse_file_with_openpyxl( if hasattr(fp, "seek"): try: fp.seek(0) # type: ignore [union-attr] - except (AttributeError, OSError) as exc: + except OSError as exc: logger.info( f"Could not rewind stream for {file_info.file_uri_for_logging}; " f"proceeding with openpyxl from current position: {exc}" diff --git a/unit_tests/sources/file_based/file_types/test_excel_parser.py b/unit_tests/sources/file_based/file_types/test_excel_parser.py index ce557d649..18850e9b0 100644 --- a/unit_tests/sources/file_based/file_types/test_excel_parser.py +++ b/unit_tests/sources/file_based/file_types/test_excel_parser.py @@ -200,7 +200,6 @@ def test_open_and_parse_file_does_not_swallow_system_exit(mock_logger): @pytest.mark.parametrize( "exc_cls", [ - pytest.param(AttributeError, id="attribute-error"), pytest.param(OSError, id="os-error"), ], ) @@ -208,8 +207,8 @@ def test_openpyxl_logs_info_when_seek_fails(mock_logger, remote_file, exc_cls): """Test that openpyxl logs info when seek fails on non-seekable files. This test ensures that when falling back to openpyxl, if the file pointer - cannot be rewound (seek fails with AttributeError or OSError), an info-level - log is emitted and parsing proceeds from the current position. + cannot be rewound (seek fails with OSError), an info-level log is emitted + and parsing proceeds from the current position. """ parser = ExcelParser() fallback_df = pd.DataFrame({"a": [1]}) From fffe0275e8738bba9ae110154eba5aea8ad60495 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 17:22:28 +0000 Subject: [PATCH 18/24] refactor: Rename file_info parameter to file in Excel parsing methods Address reviewer feedback from @darynaishchenko (Comment 2557118668): - Rename file_info parameter to file in all three methods: - _open_and_parse_file_with_calamine - _open_and_parse_file_with_openpyxl - open_and_parse_file - Update all references to use the shorter, clearer name - Since the type is RemoteFile, the shorter name 'file' is more appropriate This makes the code more concise while maintaining clarity. All local checks pass: - Unit tests pass (5 passed, 1 skipped) - Ruff format and lint pass Co-Authored-By: unknown <> --- .../file_based/file_types/excel_parser.py | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index 22c501660..2013eed99 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -186,14 +186,14 @@ def _open_and_parse_file_with_calamine( self, fp: Union[IOBase, str, Path], logger: logging.Logger, - file_info: RemoteFile, + file: RemoteFile, ) -> pd.DataFrame: """Opens and parses Excel file using Calamine engine. Args: fp: File pointer to the Excel file. logger: Logger for logging information and errors. - file_info: Remote file information for logging context. + file: Remote file information for logging context. Returns: pd.DataFrame: Parsed data from the Excel file. @@ -209,11 +209,11 @@ def _open_and_parse_file_with_calamine( # Otherwise, raise an exception. if "ValueError" in str(exc): logger.warning( - f"Calamine parsing failed for {file_info.file_uri_for_logging}, falling back to openpyxl: {exc}" + f"Calamine parsing failed for {file.file_uri_for_logging}, falling back to openpyxl: {exc}" ) raise ExcelCalamineParsingError( - f"Calamine engine failed to parse {file_info.file_uri_for_logging}", - filename=file_info.uri, + f"Calamine engine failed to parse {file.file_uri_for_logging}", + filename=file.uri, ) from exc raise exc @@ -221,14 +221,14 @@ def _open_and_parse_file_with_openpyxl( self, fp: Union[IOBase, str, Path], logger: logging.Logger, - file_info: RemoteFile, + file: RemoteFile, ) -> pd.DataFrame: """Opens and parses Excel file using Openpyxl engine. Args: fp: File pointer to the Excel file. logger: Logger for logging information and errors. - file_info: Remote file information for logging context. + file: Remote file information for logging context. Returns: pd.DataFrame: Parsed data from the Excel file. @@ -239,7 +239,7 @@ def _open_and_parse_file_with_openpyxl( fp.seek(0) # type: ignore [union-attr] except OSError as exc: logger.info( - f"Could not rewind stream for {file_info.file_uri_for_logging}; " + f"Could not rewind stream for {file.file_uri_for_logging}; " f"proceeding with openpyxl from current position: {exc}" ) @@ -248,9 +248,7 @@ def _open_and_parse_file_with_openpyxl( df = pd.ExcelFile(fp, engine="openpyxl").parse() # type: ignore [arg-type, call-overload] for warning in warning_records: - logger.warning( - f"Openpyxl warning for {file_info.file_uri_for_logging}: {warning.message}" - ) + logger.warning(f"Openpyxl warning for {file.file_uri_for_logging}: {warning.message}") return df # type: ignore [no-any-return] @@ -258,19 +256,19 @@ def open_and_parse_file( self, fp: Union[IOBase, str, Path], logger: logging.Logger, - file_info: RemoteFile, + file: RemoteFile, ) -> pd.DataFrame: """Opens and parses the Excel file with Calamine-first and Openpyxl fallback. Args: fp: File pointer to the Excel file. logger: Logger for logging information and errors. - file_info: Remote file information for logging context. + file: Remote file information for logging context. Returns: pd.DataFrame: Parsed data from the Excel file. """ try: - return self._open_and_parse_file_with_calamine(fp, logger, file_info) + return self._open_and_parse_file_with_calamine(fp, logger, file) except ExcelCalamineParsingError: - return self._open_and_parse_file_with_openpyxl(fp, logger, file_info) + return self._open_and_parse_file_with_openpyxl(fp, logger, file) From 0831b048a10b0329e00a92b947dbbb3689c96242 Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Mon, 24 Nov 2025 11:52:19 -0600 Subject: [PATCH 19/24] Fix properties --- airbyte_cdk/sources/file_based/remote_file.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/airbyte_cdk/sources/file_based/remote_file.py b/airbyte_cdk/sources/file_based/remote_file.py index 1e3eec724..4e9fa0dde 100644 --- a/airbyte_cdk/sources/file_based/remote_file.py +++ b/airbyte_cdk/sources/file_based/remote_file.py @@ -22,6 +22,11 @@ def file_uri_for_logging(self) -> str: """Returns a user-friendly identifier for logging.""" return self.uri + @property + def source_uri(self) -> str: + """Returns the canonical source URI.""" + return self.uri + class UploadableRemoteFile(RemoteFile, ABC): """ From 463be27f2202954cf393cc2b49d073b6ce70b848 Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Tue, 9 Dec 2025 10:56:56 -0600 Subject: [PATCH 20/24] Revert "Fix properties" This reverts commit 0831b048a10b0329e00a92b947dbbb3689c96242. --- airbyte_cdk/sources/file_based/remote_file.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/airbyte_cdk/sources/file_based/remote_file.py b/airbyte_cdk/sources/file_based/remote_file.py index 4e9fa0dde..1e3eec724 100644 --- a/airbyte_cdk/sources/file_based/remote_file.py +++ b/airbyte_cdk/sources/file_based/remote_file.py @@ -22,11 +22,6 @@ def file_uri_for_logging(self) -> str: """Returns a user-friendly identifier for logging.""" return self.uri - @property - def source_uri(self) -> str: - """Returns the canonical source URI.""" - return self.uri - class UploadableRemoteFile(RemoteFile, ABC): """ From 95fc5e3662bf6cf8eaff3116a7165a61fe621acd Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Tue, 9 Dec 2025 13:36:35 -0600 Subject: [PATCH 21/24] readd source_uri --- airbyte_cdk/sources/file_based/remote_file.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/airbyte_cdk/sources/file_based/remote_file.py b/airbyte_cdk/sources/file_based/remote_file.py index 1e3eec724..9495ca04d 100644 --- a/airbyte_cdk/sources/file_based/remote_file.py +++ b/airbyte_cdk/sources/file_based/remote_file.py @@ -53,3 +53,9 @@ def source_file_relative_path(self) -> str: Returns the relative path of the source file. """ return self.uri + @property + def source_uri(self) -> str: + """ + Returns the Source URI for the file being logged. + """ + return self.uri From 38a1a1c77e6bf1423a4f6bfa550009f469cd62d2 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Tue, 9 Dec 2025 19:41:38 +0000 Subject: [PATCH 22/24] Auto-fix lint and format issues --- airbyte_cdk/sources/file_based/remote_file.py | 1 + 1 file changed, 1 insertion(+) diff --git a/airbyte_cdk/sources/file_based/remote_file.py b/airbyte_cdk/sources/file_based/remote_file.py index 9495ca04d..8d30a5333 100644 --- a/airbyte_cdk/sources/file_based/remote_file.py +++ b/airbyte_cdk/sources/file_based/remote_file.py @@ -53,6 +53,7 @@ def source_file_relative_path(self) -> str: Returns the relative path of the source file. """ return self.uri + @property def source_uri(self) -> str: """ From 3277b70d50e013f4d6ee28c1d9748d2c84067f20 Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Tue, 9 Dec 2025 13:57:43 -0600 Subject: [PATCH 23/24] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- airbyte_cdk/sources/file_based/file_types/excel_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index 2013eed99..f5618c4fe 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -6,7 +6,7 @@ import warnings from io import IOBase from pathlib import Path -from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Type, Union, cast +from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union, cast import orjson import pandas as pd From 7d73cc6c3e33c00045d7a5b9464349c6bb2f851b Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Tue, 9 Dec 2025 14:09:37 -0600 Subject: [PATCH 24/24] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- airbyte_cdk/sources/file_based/file_types/excel_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index f5618c4fe..93896f14f 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -6,7 +6,7 @@ import warnings from io import IOBase from pathlib import Path -from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union, cast +from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union import orjson import pandas as pd