Skip to content

Commit 43c2949

Browse files
authored
Merge pull request #166 from alliander-opensource/feature/lazy-excel-loading
Lazy excel loading
2 parents 7e498b3 + 5589888 commit 43c2949

File tree

4 files changed

+64
-37
lines changed

4 files changed

+64
-37
lines changed

src/power_grid_model_io/data_stores/excel_file_store.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from power_grid_model_io.data_stores.base_data_store import BaseDataStore
1515
from power_grid_model_io.data_types import TabularData
16+
from power_grid_model_io.data_types.tabular_data import LazyDataFrame
1617

1718

1819
class ExcelFileStore(BaseDataStore[TabularData]):
@@ -24,7 +25,7 @@ class ExcelFileStore(BaseDataStore[TabularData]):
2425
same values) or renamed.
2526
"""
2627

27-
__slots__ = ("_file_paths", "_header_rows")
28+
__slots__ = ("_file_paths", "_excel_files", "_header_rows")
2829

2930
_unnamed_pattern: re.Pattern = re.compile(r"Unnamed: \d+_level_\d+")
3031

@@ -34,6 +35,7 @@ def __init__(self, file_path: Optional[Path] = None, **extra_paths: Path):
3435
# Create a dictionary of all supplied file paths:
3536
# {"": file_path, extra_name[0]: extra_path[0], extra_name[1]: extra_path[1], ...}
3637
self._file_paths: Dict[str, Path] = {}
38+
self._excel_files: Dict[str, pd.ExcelFile] = {}
3739
if file_path is not None:
3840
self._file_paths[""] = file_path
3941
for name, path in extra_paths.items():
@@ -62,21 +64,37 @@ def load(self) -> TabularData:
6264
have no prefix, while the tables of all the extra files will be prefixed with the name of the key word argument
6365
as supplied in the constructor.
6466
"""
65-
data: Dict[str, pd.DataFrame] = {}
67+
data: Dict[str, LazyDataFrame] = {}
6668
for name, path in self._file_paths.items():
67-
with path.open(mode="rb") as file_pointer:
68-
spreadsheet = pd.read_excel(io=file_pointer, sheet_name=None, header=self._header_rows)
69-
for sheet_name, sheet_data in spreadsheet.items():
70-
sheet_data = self._remove_unnamed_column_placeholders(data=sheet_data)
71-
sheet_data = self._handle_duplicate_columns(data=sheet_data, sheet_name=sheet_name)
69+
self._excel_files[name] = pd.ExcelFile(path)
70+
for sheet_name in self._excel_files[name].sheet_names:
71+
loader = self._load_sheet_wrapper(name, sheet_name)
7272
if name:
7373
sheet_name = f"{name}.{sheet_name}"
7474
if sheet_name in data:
7575
raise ValueError(f"Duplicate sheet name '{sheet_name}'")
76-
data[sheet_name] = sheet_data
77-
76+
data[sheet_name] = loader
7877
return TabularData(**data)
7978

79+
def _load_sheet_wrapper(self, name: str, sheet_name: str) -> "LazyDataFrame":
    """
    Create a deferred loader for a single Excel sheet.

    Nothing is read from the Excel file here; the sheet is only parsed when the returned callable is invoked.
    The callable itself does not cache its result, so every invocation parses the sheet again.

    Args:
        name: the key of the Excel file in self._excel_files (empty string for the main file)
        sheet_name: the name of the sheet within that file (without the file name prefix)

    Returns: A zero-argument callable that parses the specified sheet using self._header_rows as header, passes
        the result through _remove_unnamed_column_placeholders() and _handle_duplicate_columns(), and returns
        the resulting Pandas DataFrame.
    """

    def wrapper():
        # The Excel file is looked up at call time (not when the wrapper is created), so it has to be present
        # in self._excel_files by the time the loader is invoked.
        sheet_data = self._excel_files[name].parse(sheet_name, header=self._header_rows)
        sheet_data = self._remove_unnamed_column_placeholders(data=sheet_data)
        sheet_data = self._handle_duplicate_columns(data=sheet_data, sheet_name=sheet_name)
        return sheet_data

    return wrapper
97+
8098
def save(self, data: TabularData) -> None:
8199
"""
82100
Store tabular data as one or more Excel files.

tests/unit/data_stores/test_excel_file_store.py

Lines changed: 19 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from power_grid_model_io.data_stores.excel_file_store import ExcelFileStore
1616
from power_grid_model_io.data_types.tabular_data import TabularData
1717

18-
from ...utils import assert_log_exists
18+
from ...utils import MockExcelFile, assert_log_exists
1919

2020
PandasExcelData = Dict[str, pd.DataFrame]
2121

@@ -103,55 +103,57 @@ def test_files__read_only():
103103

104104
@patch("power_grid_model_io.data_stores.excel_file_store.ExcelFileStore._handle_duplicate_columns")
105105
@patch("power_grid_model_io.data_stores.excel_file_store.ExcelFileStore._remove_unnamed_column_placeholders")
106-
@patch("power_grid_model_io.data_stores.excel_file_store.Path.open", mock_open())
107-
@patch("power_grid_model_io.data_stores.excel_file_store.pd.read_excel")
106+
@patch("power_grid_model_io.data_stores.excel_file_store.pd.ExcelFile")
108107
def test_load(
109-
mock_read_excel: MagicMock,
108+
mock_excel_file: MagicMock,
110109
mock_remove_unnamed_column_placeholders: MagicMock,
111110
mock_handle_duplicate_columns: MagicMock,
112111
objects_excel: PandasExcelData,
113112
):
114-
# Arrange
115113
fs = ExcelFileStore(file_path=Path("input_data.xlsx"))
116-
mock_read_excel.return_value = objects_excel
114+
mock_excel_file.return_value = MockExcelFile(objects_excel)
117115
mock_remove_unnamed_column_placeholders.side_effect = noop
118116
mock_handle_duplicate_columns.side_effect = noop
119117

120118
# Act
121119
data = fs.load()
122120

123121
# Assert
124-
mock_read_excel.assert_called_once()
122+
mock_excel_file.assert_called_once()
123+
pd.testing.assert_frame_equal(data["Nodes"], objects_excel["Nodes"])
124+
pd.testing.assert_frame_equal(data["Lines"], objects_excel["Lines"])
125125
assert mock_remove_unnamed_column_placeholders.call_args_list[0] == call(data=objects_excel["Nodes"])
126126
assert mock_remove_unnamed_column_placeholders.call_args_list[1] == call(data=objects_excel["Lines"])
127127
assert mock_handle_duplicate_columns.call_args_list[0] == call(data=objects_excel["Nodes"], sheet_name="Nodes")
128128
assert mock_handle_duplicate_columns.call_args_list[1] == call(data=objects_excel["Lines"], sheet_name="Lines")
129-
pd.testing.assert_frame_equal(data["Nodes"], objects_excel["Nodes"])
130-
pd.testing.assert_frame_equal(data["Lines"], objects_excel["Lines"])
131129

132130

133131
@patch("power_grid_model_io.data_stores.excel_file_store.ExcelFileStore._handle_duplicate_columns")
134132
@patch("power_grid_model_io.data_stores.excel_file_store.ExcelFileStore._remove_unnamed_column_placeholders")
135-
@patch("power_grid_model_io.data_stores.excel_file_store.Path.open", mock_open())
136-
@patch("power_grid_model_io.data_stores.excel_file_store.pd.read_excel")
133+
@patch("power_grid_model_io.data_stores.excel_file_store.pd.ExcelFile")
137134
def test_load__extra(
138-
mock_read_excel: MagicMock,
135+
mock_excel_file: MagicMock,
139136
mock_remove_unnamed_column_placeholders: MagicMock,
140137
mock_handle_duplicate_columns: MagicMock,
141138
objects_excel: PandasExcelData,
142139
specs_excel: PandasExcelData,
143140
):
141+
144142
# Arrange
145143
fs = ExcelFileStore(Path("input_data.xlsx"), foo=Path("foo_types.xlsx"))
146-
mock_read_excel.side_effect = (objects_excel, specs_excel)
144+
mock_excel_file.side_effect = (MockExcelFile(objects_excel), MockExcelFile(specs_excel))
147145
mock_remove_unnamed_column_placeholders.side_effect = noop
148146
mock_handle_duplicate_columns.side_effect = noop
149147

150148
# Act
151149
data = fs.load()
152150

153151
# Assert
154-
assert mock_read_excel.call_count == 2
152+
assert mock_excel_file.call_count == 2
153+
pd.testing.assert_frame_equal(data["Nodes"], objects_excel["Nodes"])
154+
pd.testing.assert_frame_equal(data["Lines"], objects_excel["Lines"])
155+
pd.testing.assert_frame_equal(data["foo.Colors"], specs_excel["Colors"])
156+
pd.testing.assert_frame_equal(data["foo.Lines"], specs_excel["Lines"])
155157
assert mock_remove_unnamed_column_placeholders.call_args_list[0] == call(data=objects_excel["Nodes"])
156158
assert mock_remove_unnamed_column_placeholders.call_args_list[1] == call(data=objects_excel["Lines"])
157159
assert mock_remove_unnamed_column_placeholders.call_args_list[2] == call(data=specs_excel["Colors"])
@@ -160,26 +162,21 @@ def test_load__extra(
160162
assert mock_handle_duplicate_columns.call_args_list[1] == call(data=objects_excel["Lines"], sheet_name="Lines")
161163
assert mock_handle_duplicate_columns.call_args_list[2] == call(data=specs_excel["Colors"], sheet_name="Colors")
162164
assert mock_handle_duplicate_columns.call_args_list[3] == call(data=specs_excel["Lines"], sheet_name="Lines")
163-
pd.testing.assert_frame_equal(data["Nodes"], objects_excel["Nodes"])
164-
pd.testing.assert_frame_equal(data["Lines"], objects_excel["Lines"])
165-
pd.testing.assert_frame_equal(data["foo.Colors"], specs_excel["Colors"])
166-
pd.testing.assert_frame_equal(data["foo.Lines"], specs_excel["Lines"])
167165

168166

169167
@patch("power_grid_model_io.data_stores.excel_file_store.ExcelFileStore._handle_duplicate_columns")
170168
@patch("power_grid_model_io.data_stores.excel_file_store.ExcelFileStore._remove_unnamed_column_placeholders")
171-
@patch("power_grid_model_io.data_stores.excel_file_store.Path.open", mock_open())
172-
@patch("power_grid_model_io.data_stores.excel_file_store.pd.read_excel")
169+
@patch("power_grid_model_io.data_stores.excel_file_store.pd.ExcelFile")
173170
def test_load__extra__duplicate_sheet_name(
174-
mock_read_excel: MagicMock,
171+
mock_excel_file: MagicMock,
175172
mock_remove_unnamed_column_placeholders: MagicMock,
176173
mock_handle_duplicate_columns: MagicMock,
177174
):
178175
# Arrange
179176
foo_data = {"bar.Nodes": pd.DataFrame()}
180177
bar_data = {"Nodes": pd.DataFrame()}
181178
fs = ExcelFileStore(Path("foo.xlsx"), bar=Path("bar.xlsx"))
182-
mock_read_excel.side_effect = (foo_data, bar_data)
179+
mock_excel_file.side_effect = (MockExcelFile(foo_data), MockExcelFile(bar_data))
183180
mock_remove_unnamed_column_placeholders.side_effect = noop
184181
mock_handle_duplicate_columns.side_effect = noop
185182

tests/unit/data_stores/test_vision_excel_file_store.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,16 @@
77
from power_grid_model_io.data_stores.vision_excel_file_store import VisionExcelFileStore
88

99

10-
@patch("power_grid_model_io.data_stores.excel_file_store.pd.read_excel")
10+
@patch("power_grid_model_io.data_stores.excel_file_store.pd.ExcelFile")
1111
@patch("power_grid_model_io.data_stores.excel_file_store.Path.open", mock_open())
12-
def test_header_rows(read_excel_mock: MagicMock):
12+
def test_header_rows(mock_excel_file: MagicMock):
1313
# Arrange
1414
store = VisionExcelFileStore(file_path=Path("dummy.xlsx"))
15-
read_excel_mock.return_value = {}
15+
mock_excel_file.return_value.sheet_names = ["foo"]
1616

1717
# Act
18-
store.load()
18+
data = store.load()
19+
data["foo"]
1920

2021
# Assert
21-
read_excel_mock.assert_called_once()
22-
assert read_excel_mock.call_args_list[0].kwargs["header"] == [0, 1]
22+
mock_excel_file.return_value.parse.assert_called_once_with("foo", header=[0, 1])

tests/utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,3 +256,15 @@ def __len__(self):
256256

257257
def __getitem__(self, item: str):
258258
return MockVal(pd.Series(name=item, dtype=np.float64))
259+
260+
261+
class MockExcelFile:
    """
    Minimal in-memory stand-in for pd.ExcelFile, for use in unit tests.

    The "workbook" is a plain dictionary that maps sheet names to DataFrames; only the `sheet_names` property and
    the `parse()` method are provided.
    """

    def __init__(self, data: Dict[str, pd.DataFrame]):
        # The mapping is stored by reference; no copy of the dict or its frames is made.
        self.data = data

    @property
    def sheet_names(self) -> List[str]:
        """The keys of the backing dictionary, as a new list."""
        return [*self.data.keys()]

    def parse(self, sheet_name: str, **_kwargs) -> pd.DataFrame:
        """Return the stored DataFrame for `sheet_name`; all other keyword arguments are accepted and ignored."""
        frame = self.data[sheet_name]
        return frame

0 commit comments

Comments
 (0)