@@ -337,11 +337,14 @@ def _handle_extra_info( # pylint: disable = too-many-arguments,too-many-positio
337337 if extra_info is None :
338338 return
339339
340+ # Normalize col_def to handle deduplication when optional_extra contains columns also in regular extra
341+ normalized_col_def = self ._normalize_extra_col_def (col_def )
342+
340343 extra = self ._parse_col_def (
341344 data = data ,
342345 table = table ,
343346 table_mask = table_mask ,
344- col_def = col_def ,
347+ col_def = normalized_col_def ,
345348 extra_info = None ,
346349 ).to_dict (orient = "records" )
347350 for i , xtr in zip (uuids , extra ):
@@ -356,6 +359,57 @@ def _handle_extra_info( # pylint: disable = too-many-arguments,too-many-positio
356359 else :
357360 extra_info [i ] = xtr
358361
def _normalize_extra_col_def(self, col_def: Any) -> Any:
    """
    Normalize an extra-info column definition so that no column is requested twice.

    A column definition list may mix regular entries with single-key
    ``{"optional_extra": [...]}`` wrapper dicts. This method:

    * drops optional columns whose name also appears as a regular string entry
      (regular columns take precedence);
    * drops optional columns that were already listed earlier, either in the same
      ``optional_extra`` section or in a previous one (first occurrence wins);
    * drops an ``optional_extra`` section entirely when nothing is left in it.

    Regular entries, and ``optional_extra`` wrappers whose value is not a list, are
    passed through unchanged and in their original order.

    Args:
        col_def: Column definition for extra info that may contain optional_extra sections

    Returns:
        ``col_def`` itself when it is not a list, otherwise a new list with the
        duplicates removed from the optional_extra sections
    """
    if not isinstance(col_def, list):
        return col_def

    def _is_optional_section(item: Any) -> bool:
        # An optional_extra wrapper is a dict whose only key is "optional_extra"
        return isinstance(item, dict) and len(item) == 1 and "optional_extra" in item

    # Only plain string entries count as regular column names; a string can never be
    # an optional_extra wrapper, so no further filtering is needed here.
    regular_columns = {item for item in col_def if isinstance(item, str)}

    # Optional columns kept so far, across *all* optional_extra sections. A list (not a
    # set) is used on purpose: optional entries may be unhashable (e.g. nested dicts).
    seen_optional: list = []
    normalized: list = []

    for item in col_def:
        if not _is_optional_section(item):
            normalized.append(item)
            continue

        optional_cols = item["optional_extra"]
        if not isinstance(optional_cols, list):
            # Keep malformed optional_extra as-is; the parser reports the type error later
            normalized.append(item)
            continue

        kept = []
        for col in optional_cols:
            if isinstance(col, str) and col in regular_columns:
                continue  # regular column takes precedence
            if col in seen_optional:
                continue  # already requested by this or an earlier optional section
            seen_optional.append(col)
            kept.append(col)

        # Only include the optional_extra section if it has remaining columns
        if kept:
            normalized.append({"optional_extra": kept})

    return normalized
412+
359413 @staticmethod
360414 def _merge_pgm_data (data : Dict [ComponentType , List [np .ndarray ]]) -> Dict [ComponentType , np .ndarray ]:
361415 """During the conversion, multiple numpy arrays can be produced for the same type of component. These arrays
@@ -394,6 +448,8 @@ def _parse_col_def( # pylint: disable = too-many-arguments,too-many-positional-
394448 col_def : Any ,
395449 table_mask : Optional [np .ndarray ],
396450 extra_info : Optional [ExtraInfo ],
451+ * ,
452+ allow_missing : bool = False ,
397453 ) -> pd .DataFrame :
398454 """Interpret the column definition and extract/convert/create the data as a pandas DataFrame.
399455
@@ -402,15 +458,27 @@ def _parse_col_def( # pylint: disable = too-many-arguments,too-many-positional-
402458 table: str:
403459 col_def: Any:
404460 extra_info: Optional[ExtraInfo]:
461+ allow_missing: bool: If True, missing columns will return empty DataFrame instead of raising KeyError
405462
406463 Returns:
407464
408465 """
409466 if isinstance (col_def , (int , float )):
410467 return self ._parse_col_def_const (data = data , table = table , col_def = col_def , table_mask = table_mask )
411468 if isinstance (col_def , str ):
412- return self ._parse_col_def_column_name (data = data , table = table , col_def = col_def , table_mask = table_mask )
469+ return self ._parse_col_def_column_name (
470+ data = data , table = table , col_def = col_def , table_mask = table_mask , allow_missing = allow_missing
471+ )
413472 if isinstance (col_def , dict ):
473+ # Check if this is an optional_extra wrapper
474+ if len (col_def ) == 1 and "optional_extra" in col_def :
475+ # Extract the list of optional columns and parse as composite with allow_missing=True
476+ optional_cols = col_def ["optional_extra" ]
477+ if not isinstance (optional_cols , list ):
478+ raise TypeError (f"optional_extra value must be a list, got { type (optional_cols ).__name__ } " )
479+ return self ._parse_col_def_composite (
480+ data = data , table = table , col_def = optional_cols , table_mask = table_mask , allow_missing = True
481+ )
414482 return self ._parse_col_def_filter (
415483 data = data ,
416484 table = table ,
@@ -419,7 +487,9 @@ def _parse_col_def( # pylint: disable = too-many-arguments,too-many-positional-
419487 extra_info = extra_info ,
420488 )
421489 if isinstance (col_def , list ):
422- return self ._parse_col_def_composite (data = data , table = table , col_def = col_def , table_mask = table_mask )
490+ return self ._parse_col_def_composite (
491+ data = data , table = table , col_def = col_def , table_mask = table_mask , allow_missing = allow_missing
492+ )
423493 raise TypeError (f"Invalid column definition: { col_def } " )
424494
425495 @staticmethod
@@ -452,6 +522,7 @@ def _parse_col_def_column_name(
452522 table : str ,
453523 col_def : str ,
454524 table_mask : Optional [np .ndarray ] = None ,
525+ allow_missing : bool = False ,
455526 ) -> pd .DataFrame :
456527 """Extract a column from the data. If the column doesn't exist, check if the col_def is a special float value,
457528 like 'inf'. If that's the case, create a single column pandas DataFrame containing the const value.
@@ -460,6 +531,7 @@ def _parse_col_def_column_name(
460531 data: TabularData:
461532 table: str:
462533 col_def: str:
534+ allow_missing: bool: If True, return empty DataFrame when column is missing instead of raising KeyError
463535
464536 Returns:
465537
@@ -480,18 +552,23 @@ def _parse_col_def_column_name(
480552 col_data = self ._apply_multiplier (table = table , column = col_name , data = col_data )
481553 return pd .DataFrame (col_data )
482554
483- def _get_float (value : str ) -> Optional [float ]:
484- try :
485- return float (value )
486- except ValueError :
487- return None
488-
489- # Maybe it is not a column name, but a float value like 'inf', let's try to convert the string to a float
490- if (const_value := _get_float (col_def )) is not None :
491- return self ._parse_col_def_const (data = data , table = table , col_def = const_value , table_mask = table_mask )
555+ try : # Maybe it is not a column name, but a float value like 'inf', let's try to convert the string to a float
556+ const_value = float (col_def )
557+ except ValueError as e :
558+ if allow_missing :
559+ # Return empty DataFrame with correct number of rows when column is optional and missing
560+ self ._log .debug (
561+ "Optional column not found" ,
562+ table = table ,
563+ columns = " or " .join (f"'{ col_name } '" for col_name in columns ),
564+ )
565+ index = table_data .index if isinstance (table_data , pd .DataFrame ) else pd .RangeIndex (len (table_data ))
566+ return pd .DataFrame (index = index )
567+ # pylint: disable=raise-missing-from
568+ columns_str = " and " .join (f"'{ col_name } '" for col_name in columns )
569+ raise KeyError (f"Could not find column { columns_str } on table '{ table } '" ) from e
492570
493- columns_str = " and " .join (f"'{ col_name } '" for col_name in columns )
494- raise KeyError (f"Could not find column { columns_str } on table '{ table } '" )
571+ return self ._parse_col_def_const (data = data , table = table , col_def = const_value , table_mask = table_mask )
495572
496573 def _apply_multiplier (self , table : str , column : str , data : pd .Series ) -> pd .Series :
497574 if self ._multipliers is None :
@@ -780,13 +857,15 @@ def _parse_col_def_composite(
780857 table : str ,
781858 col_def : list ,
782859 table_mask : Optional [np .ndarray ],
860+ allow_missing : bool = False ,
783861 ) -> pd .DataFrame :
784862 """Select multiple columns (each is created from a column definition) and return them as a new DataFrame.
785863
786864 Args:
787865 data: TabularData:
788866 table: str:
789867 col_def: list:
868+ allow_missing: bool: If True, skip missing columns instead of raising errors
790869
791870 Returns:
792871
@@ -799,10 +878,20 @@ def _parse_col_def_composite(
799878 col_def = sub_def ,
800879 table_mask = table_mask ,
801880 extra_info = None ,
881+ allow_missing = allow_missing ,
802882 )
803883 for sub_def in col_def
804884 ]
805- return pd .concat (columns , axis = 1 )
885+ # Filter out DataFrames with no columns (from missing optional columns)
886+ non_empty_columns = [col for col in columns if len (col .columns ) > 0 ]
887+ if not non_empty_columns :
888+ # If all columns are missing, return an empty DataFrame with the correct number of rows
889+ table_data = data [table ]
890+ if table_mask is not None :
891+ table_data = table_data [table_mask ]
892+ index = table_data .index if isinstance (table_data , pd .DataFrame ) else pd .RangeIndex (len (table_data ))
893+ return pd .DataFrame (index = index )
894+ return pd .concat (non_empty_columns , axis = 1 )
806895
807896 def _get_id (self , table : str , key : Mapping [str , int ], name : Optional [str ]) -> int :
808897 """
0 commit comments