Merge pull request #344 from PowerGridModel/feature/optional-extra

Jerry-Jinfeng-Guo · web-flow · commit 02e4a612caee · 2025-12-22T08:46:51.000Z
Logic to handle optional_extra in Vision Excel converter input
diff --git a/docs/converters/vision_converter.md b/docs/converters/vision_converter.md
@@ -89,6 +89,57 @@ fields of interest.
 
 Example usage can be found in the example notebook as well as in the test cases.
 
+## Optional extra columns
+
+When working with Vision Excel exports, some metadata columns (like `GUID` or `StationID`) may not always be present,
+especially in partial exports.
+The `optional_extra` feature allows you to specify columns that should be included in `extra_info` if present,
+but won't cause conversion failure if missing.
+
+**Syntax:**
+
+```yaml
+grid:
+  Transformers:
+    transformer:
+      id:
+        auto_id:
+          key: Number
+      # ... other fields ...
+      extra:
+        - ID            # Required - fails if missing
+        - Name          # Required - fails if missing
+        - optional_extra:
+            - GUID      # Optional - skipped if missing
+            - StationID # Optional - skipped if missing
+```
+
+**Behavior:**
+
+- Required columns (listed directly under `extra`) raise a `KeyError` if missing
+- Optional columns (nested under `optional_extra`) are skipped if not found (only a debug message is logged)
+- If some optional columns are present and others missing, only the present ones are included in `extra_info`
+- This feature is particularly useful for handling different Vision export configurations or versions
+
+**Duplicate handling:**
+When a column appears in both the regular `extra` list and within `optional_extra`,
+the regular `extra` entry takes precedence and duplicates are automatically eliminated from `optional_extra`:
+
+```yaml
+extra:
+  - ID              # Regular column - always processed
+  - Name            # Regular column - always processed  
+  - optional_extra:
+      - ID          # Duplicate - automatically removed
+      - GUID        # Unique optional - processed if present
+      - StationID   # Unique optional - processed if present
+```
+
+In this example, `ID` will only be processed once (from the regular `extra` list),
+while `GUID` and `StationID` are processed as optional columns.
+This prevents duplicate data in the resulting `extra_info`
+and ensures consistent behavior regardless of column ordering.
+
 ## Common/Known issues related to Vision
 
 So far, we know of the following issue related to Vision-exported spreadsheets.
diff --git a/src/power_grid_model_io/converters/tabular_converter.py b/src/power_grid_model_io/converters/tabular_converter.py
@@ -337,11 +337,14 @@ def _handle_extra_info(  # pylint: disable = too-many-arguments,too-many-positio
         if extra_info is None:
             return
 
+        # Normalize col_def to handle deduplication when optional_extra contains columns also in regular extra
+        normalized_col_def = self._normalize_extra_col_def(col_def)
+
         extra = self._parse_col_def(
             data=data,
             table=table,
             table_mask=table_mask,
-            col_def=col_def,
+            col_def=normalized_col_def,
             extra_info=None,
         ).to_dict(orient="records")
         for i, xtr in zip(uuids, extra):
@@ -356,6 +359,57 @@ def _handle_extra_info(  # pylint: disable = too-many-arguments,too-many-positio
                 else:
                     extra_info[i] = xtr
 
+    def _normalize_extra_col_def(self, col_def: Any) -> Any:
+        """
+        Normalize extra column definition to eliminate duplicates between regular columns and optional_extra.
+        Regular columns take precedence over optional_extra columns.
+        Additionally, repeated entries within a single optional_extra section are removed (first occurrence kept).
+
+        Args:
+            col_def: Extra info column definition that may contain optional_extra sections (non-lists pass through)
+
+        Returns:
+            Normalized column definition with duplicates removed; optional_extra sections left empty are dropped
+        """
+        if not isinstance(col_def, list):
+            return col_def
+
+        # Collect the names of all regular (non-optional_extra) string columns
+        regular_columns = set()
+
+        for item in col_def:
+            if isinstance(item, dict) and len(item) == 1 and "optional_extra" in item:
+                # This is an optional_extra section - it is handled in the second pass below
+                pass
+            else:
+                # This is a regular entry; only plain string column names are tracked
+                if isinstance(item, str):
+                    regular_columns.add(item)
+
+        # Second pass: rebuild the list in its original order, filtering each optional_extra section
+        final_list = []
+        for item in col_def:
+            if isinstance(item, dict) and len(item) == 1 and "optional_extra" in item:
+                optional_cols = item["optional_extra"]
+                if isinstance(optional_cols, list):
+                    # Filter out columns that are already in regular columns
+                    filtered_optional_cols = []
+                    for col in optional_cols:
+                        if isinstance(col, str) and col in regular_columns:
+                            continue
+                        if col not in filtered_optional_cols:
+                            filtered_optional_cols.append(col)
+                    # Only include the optional_extra section if it has remaining columns
+                    if filtered_optional_cols:
+                        final_list.append({"optional_extra": filtered_optional_cols})
+                else:
+                    # Keep non-list optional_extra as-is; _parse_col_def rejects it with a TypeError
+                    final_list.append(item)
+            else:
+                final_list.append(item)
+
+        return final_list
+
     @staticmethod
     def _merge_pgm_data(data: Dict[ComponentType, List[np.ndarray]]) -> Dict[ComponentType, np.ndarray]:
         """During the conversion, multiple numpy arrays can be produced for the same type of component. These arrays
@@ -394,6 +448,8 @@ def _parse_col_def(  # pylint: disable = too-many-arguments,too-many-positional-
         col_def: Any,
         table_mask: Optional[np.ndarray],
         extra_info: Optional[ExtraInfo],
+        *,
+        allow_missing: bool = False,
     ) -> pd.DataFrame:
         """Interpret the column definition and extract/convert/create the data as a pandas DataFrame.
 
@@ -402,15 +458,27 @@ def _parse_col_def(  # pylint: disable = too-many-arguments,too-many-positional-
           table: str:
           col_def: Any:
           extra_info: Optional[ExtraInfo]:
+          allow_missing: bool: If True, missing columns will return empty DataFrame instead of raising KeyError
 
         Returns:
 
         """
         if isinstance(col_def, (int, float)):
             return self._parse_col_def_const(data=data, table=table, col_def=col_def, table_mask=table_mask)
         if isinstance(col_def, str):
-            return self._parse_col_def_column_name(data=data, table=table, col_def=col_def, table_mask=table_mask)
+            return self._parse_col_def_column_name(
+                data=data, table=table, col_def=col_def, table_mask=table_mask, allow_missing=allow_missing
+            )
         if isinstance(col_def, dict):
+            # Check if this is an optional_extra wrapper
+            if len(col_def) == 1 and "optional_extra" in col_def:
+                # Extract the list of optional columns and parse as composite with allow_missing=True
+                optional_cols = col_def["optional_extra"]
+                if not isinstance(optional_cols, list):
+                    raise TypeError(f"optional_extra value must be a list, got {type(optional_cols).__name__}")
+                return self._parse_col_def_composite(
+                    data=data, table=table, col_def=optional_cols, table_mask=table_mask, allow_missing=True
+                )
             return self._parse_col_def_filter(
                 data=data,
                 table=table,
@@ -419,7 +487,9 @@ def _parse_col_def(  # pylint: disable = too-many-arguments,too-many-positional-
                 extra_info=extra_info,
             )
         if isinstance(col_def, list):
-            return self._parse_col_def_composite(data=data, table=table, col_def=col_def, table_mask=table_mask)
+            return self._parse_col_def_composite(
+                data=data, table=table, col_def=col_def, table_mask=table_mask, allow_missing=allow_missing
+            )
         raise TypeError(f"Invalid column definition: {col_def}")
 
     @staticmethod
@@ -452,6 +522,7 @@ def _parse_col_def_column_name(
         table: str,
         col_def: str,
         table_mask: Optional[np.ndarray] = None,
+        allow_missing: bool = False,
     ) -> pd.DataFrame:
         """Extract a column from the data. If the column doesn't exist, check if the col_def is a special float value,
         like 'inf'. If that's the case, create a single column pandas DataFrame containing the const value.
@@ -460,6 +531,7 @@ def _parse_col_def_column_name(
           data: TabularData:
           table: str:
           col_def: str:
+          allow_missing: bool: If True, return empty DataFrame when column is missing instead of raising KeyError
 
         Returns:
 
@@ -480,18 +552,23 @@ def _parse_col_def_column_name(
                 col_data = self._apply_multiplier(table=table, column=col_name, data=col_data)
                 return pd.DataFrame(col_data)
 
-        def _get_float(value: str) -> Optional[float]:
-            try:
-                return float(value)
-            except ValueError:
-                return None
-
-        # Maybe it is not a column name, but a float value like 'inf', let's try to convert the string to a float
-        if (const_value := _get_float(col_def)) is not None:
-            return self._parse_col_def_const(data=data, table=table, col_def=const_value, table_mask=table_mask)
+        try:  # Maybe it is not a column name, but a float value like 'inf', let's try to convert the string to a float
+            const_value = float(col_def)
+        except ValueError as e:
+            if allow_missing:
+                # Return empty DataFrame with correct number of rows when column is optional and missing
+                self._log.debug(
+                    "Optional column not found",
+                    table=table,
+                    columns=" or ".join(f"'{col_name}'" for col_name in columns),
+                )
+                index = table_data.index if isinstance(table_data, pd.DataFrame) else pd.RangeIndex(len(table_data))
+                return pd.DataFrame(index=index)
+            # pylint: disable=raise-missing-from
+            columns_str = " and ".join(f"'{col_name}'" for col_name in columns)
+            raise KeyError(f"Could not find column {columns_str} on table '{table}'") from e
 
-        columns_str = " and ".join(f"'{col_name}'" for col_name in columns)
-        raise KeyError(f"Could not find column {columns_str} on table '{table}'")
+        return self._parse_col_def_const(data=data, table=table, col_def=const_value, table_mask=table_mask)
 
     def _apply_multiplier(self, table: str, column: str, data: pd.Series) -> pd.Series:
         if self._multipliers is None:
@@ -780,13 +857,15 @@ def _parse_col_def_composite(
         table: str,
         col_def: list,
         table_mask: Optional[np.ndarray],
+        allow_missing: bool = False,
     ) -> pd.DataFrame:
         """Select multiple columns (each is created from a column definition) and return them as a new DataFrame.
 
         Args:
           data: TabularData:
           table: str:
           col_def: list:
+          allow_missing: bool: If True, skip missing columns instead of raising errors
 
         Returns:
 
@@ -799,10 +878,20 @@ def _parse_col_def_composite(
                 col_def=sub_def,
                 table_mask=table_mask,
                 extra_info=None,
+                allow_missing=allow_missing,
             )
             for sub_def in col_def
         ]
-        return pd.concat(columns, axis=1)
+        # Filter out DataFrames with no columns (from missing optional columns)
+        non_empty_columns = [col for col in columns if len(col.columns) > 0]
+        if not non_empty_columns:
+            # If all columns are missing, return an empty DataFrame with the correct number of rows
+            table_data = data[table]
+            if table_mask is not None:
+                table_data = table_data[table_mask]
+            index = table_data.index if isinstance(table_data, pd.DataFrame) else pd.RangeIndex(len(table_data))
+            return pd.DataFrame(index=index)
+        return pd.concat(non_empty_columns, axis=1)
 
     def _get_id(self, table: str, key: Mapping[str, int], name: Optional[str]) -> int:
         """
diff --git a/tests/data/config/test_optional_extra_mapping.yaml b/tests/data/config/test_optional_extra_mapping.yaml
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: Contributors to the Power Grid Model project <powergridmodel@lfenergy.org>
+#
+# SPDX-License-Identifier: MPL-2.0
+---
+# Test mapping file for optional_extra feature
+grid:
+  nodes:
+    node:
+      id:
+        auto_id:
+          key: node_id
+      u_rated: voltage
+      extra:
+        - ID
+        - Name
+        - optional_extra:
+            - GUID
+            - StationID
+
+units:
+  V:
+    kV: 1000.0
+
+substitutions: {}
diff --git a/tests/data/vision/vision_optional_extra_full.xlsx b/tests/data/vision/vision_optional_extra_full.xlsx
diff --git a/tests/data/vision/vision_optional_extra_full.xlsx.license b/tests/data/vision/vision_optional_extra_full.xlsx.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: Contributors to the Power Grid Model project <powergridmodel@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/tests/data/vision/vision_optional_extra_mapping.yaml b/tests/data/vision/vision_optional_extra_mapping.yaml
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Contributors to the Power Grid Model project <powergridmodel@lfenergy.org>
+#
+# SPDX-License-Identifier: MPL-2.0
+---
+# Test mapping file for optional_extra feature with Vision Excel format
+id_reference:
+  nodes_table: Nodes
+  number: Number
+  node_number: Node.Number
+  sub_number: Subnumber
+
+grid:
+  Nodes:
+    node:
+      id:
+        auto_id:
+          key: Number
+      u_rated: Unom
+      extra:
+        - ID
+        - Name
+        - optional_extra:
+            - GUID
+            - StationID
+
+units:
+  V:
+    kV: 1000.0
+
+substitutions: {}
diff --git a/tests/data/vision/vision_optional_extra_mapping.yaml.license b/tests/data/vision/vision_optional_extra_mapping.yaml.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: Contributors to the Power Grid Model project <powergridmodel@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/tests/data/vision/vision_optional_extra_minimal.xlsx b/tests/data/vision/vision_optional_extra_minimal.xlsx
diff --git a/tests/data/vision/vision_optional_extra_minimal.xlsx.license b/tests/data/vision/vision_optional_extra_minimal.xlsx.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: Contributors to the Power Grid Model project <powergridmodel@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/tests/data/vision/vision_optional_extra_ordering_invariance.yaml b/tests/data/vision/vision_optional_extra_ordering_invariance.yaml
@@ -0,0 +1,26 @@
+id_reference:
+  nodes_table: Nodes
+  number: Number
+  node_number: Node.Number
+  sub_number: Subnumber
+
+grid:
+  Nodes:
+    node:
+      id:
+        auto_id:
+          key: Number
+      u_rated: Unom
+      extra:
+        - optional_extra:
+            - GUID
+            - StationID
+        - ID
+        - Name
+        - GUID
+
+units:
+  V:
+    kV: 1000.0
+
+substitutions: {}
diff --git a/tests/data/vision/vision_optional_extra_ordering_invariance.yaml.license b/tests/data/vision/vision_optional_extra_ordering_invariance.yaml.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: Contributors to the Power Grid Model project <powergridmodel@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/tests/data/vision/vision_optional_extra_partial.xlsx b/tests/data/vision/vision_optional_extra_partial.xlsx
diff --git a/tests/data/vision/vision_optional_extra_partial.xlsx.license b/tests/data/vision/vision_optional_extra_partial.xlsx.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: Contributors to the Power Grid Model project <powergridmodel@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/tests/unit/converters/test_tabular_converter.py b/tests/unit/converters/test_tabular_converter.py
diff --git a/tests/unit/converters/test_vision_excel_converter.py b/tests/unit/converters/test_vision_excel_converter.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+SPDX-FileCopyrightText: Contributors to the Power Grid Model project <powergridmodel@lfenergy.org>`
	`2`	`+`
	`3`	`+SPDX-License-Identifier: MPL-2.0`