@@ -775,34 +775,112 @@ def _detect_time_column(self, df) -> str | None:
775775 return c
776776 return None
777777
778- def _rank_dose_candidates (self , df ) -> list [str ]:
779- """Lightweight ranking of dose-like columns (regex + numeric + cardinality)."""
778+ def _rank_dose_candidates (self , df : pd .DataFrame ) -> list [str ]:
779+ """Rank DataFrame columns by likelihood of containing dose/concentration data.
780+
781+ This method implements a lightweight scoring system to identify and rank
782+ columns that are most likely to contain dose, concentration, or drug-related
783+ data. The ranking is based on multiple heuristics including column naming
784+ patterns, data types, value ranges, and statistical properties.
785+
786+ Parameters
787+ ----------
788+ df : pd.DataFrame
789+ Input DataFrame containing columns to be evaluated and ranked.
790+ Must contain at least one column with data.
791+
792+ Returns
793+ -------
794+ list[str]
795+ Column names sorted by descending likelihood of containing dose data.
796+ Columns with higher scores appear first. In case of tied scores,
797+ columns with fewer unique values are ranked higher.
798+
799+ Notes
800+ -----
801+ The scoring algorithm considers the following criteria:
802+
803+ - **Name matching** (+2.0 points): Column names containing, as a whole word,
804+ 'dose', 'conc', 'concentration', 'drug', 'compound', 'stim', 'input',
805+ or 'u<digits>' (case-insensitive). '_' is a word character: 'dose_mg' fails.
806+
807+ - **Numeric data type** (+1.0 points): Columns with integer or float dtype.
808+
809+ - **Reasonable cardinality** (+0.8 points): Columns with 2-30 unique
810+ non-null values, which is typical for dose series.
811+
812+ - **Non-negative values** (+0.3 points): All values are >= 0 after numeric
813+ coercion. Missing or non-numeric entries are counted as 0, so they pass.
814+
815+ - **Monotonic tendency** (+0.2 points): At least 70% of consecutive numeric
816+ differences (in row order) are >= 0, i.e. the values are mostly
817+ non-decreasing. Requires at least 5 values that convert to numeric.
818+
819+ Raises
820+ ------
821+ AttributeError
822+ If df does not have the expected pandas DataFrame interface.
823+
824+ TypeError
825+ If a column label is truthy but not a str (name match is outside the try).
826+
827+ See Also
828+ --------
829+ pandas.DataFrame.nunique : Count unique values in each column
830+ pandas.to_numeric : Convert argument to numeric type
831+ numpy.diff : Calculate discrete differences along array
832+
833+ Warnings
834+ --------
835+ This function uses broad exception handling to ensure robustness when
836+ processing diverse data types. Individual column evaluation errors are
837+ silently ignored to prevent failure on edge cases like mixed data types
838+ or missing values.
839+ """
840+ # Compile pattern for dose-related column names
780841 patt = re .compile (
781842 r"\b(dose|conc|concentration|drug|compound|stim|input|u\d+)\b" ,
782843 re .IGNORECASE ,
783844 )
784- scores = {}
785- # FIXME: https://github.com/PaulJonasJost/PEtab_GUI/issues/159
786- for col in df .columns : # noqa: B007
845+
846+ scores : dict [ str , float ] = {}
847+ for col in df .columns :
787848 s = 0.0
788- if patt .search (col or "" ):
789- s += 2.0
790- try :
791- if df [col ].dtype .kind in "if" :
792- s += 1.0
793- uniq = df [col ].nunique (dropna = True )
794- if 2 <= uniq <= 30 :
795- s += 0.8
796- if np .all (pd .to_numeric (df [col ], errors = "coerce" ).fillna (0 ) >= 0 ):
797- s += 0.3
798- ser = pd .to_numeric (df [col ], errors = "coerce" ).dropna ()
799- if len (ser ) >= 5 :
800- diffs = np .diff (ser .values )
801- if np .mean (diffs >= 0 ) >= 0.7 :
802- s += 0.2
803- except Exception :
804- pass
805- scores [col ] = s
849+
850+ # Score based on column name pattern matching
851+ if patt .search (col or "" ):
852+ s += 2.0
853+
854+ try :
855+ # Score based on data type (numeric preferred)
856+ if df [col ].dtype .kind in "if" : # integer or float
857+ s += 1.0
858+
859+ # Score based on reasonable number of unique values
860+ uniq = df [col ].nunique (dropna = True )
861+ if 2 <= uniq <= 30 : # Reasonable range for dose series?
862+ s += 0.8
863+
864+ # Score based on non-negative values (typical for doses)
865+ if np .all (
866+ pd .to_numeric (df [col ], errors = "coerce" ).fillna (0 ) >= 0
867+ ):
868+ s += 0.3
869+
870+ # Score based on monotonic tendency (dose escalation pattern)
871+ ser = pd .to_numeric (df [col ], errors = "coerce" ).dropna ()
872+ if len (ser ) >= 5 :
873+ diffs = np .diff (ser .values )
874+ if np .mean (diffs >= 0 ) >= 0.7 : # 70% non-decreasing
875+ s += 0.2
876+
877+ except Exception : # noqa: S110
878+ # Silently handle any data processing errors
879+ pass
880+
881+ scores [col ] = s
882+
883+ # Sort by score (descending) then by unique count (ascending) for ties
806884 return [
807885 c
808886 for c , _ in sorted (
@@ -812,7 +890,7 @@ def _rank_dose_candidates(self, df) -> list[str]:
812890 ]
813891
814892 def _resolve_dose_and_time (self , df ) -> tuple [str | None , str | None , str ]:
815- """Open dialog with ranked dose suggestions and time choices (incl. steady state) ."""
893+ """Open dialog with ranked dose suggestions and time choices."""
816894 header_key = str (hash (tuple (df .columns )))
817895 settings = settings_manager .settings
818896 # TODO: rename settings location
0 commit comments