From 2af666fa6897c142b7669b703c5c4a6b3c84e789 Mon Sep 17 00:00:00 2001 From: PaulJonasJost Date: Tue, 23 Sep 2025 16:35:42 +0200 Subject: [PATCH 1/5] Closes #159. Adjusts example for better reflection. --- example/dose_response.csv | 2 +- .../controllers/table_controllers.py | 37 +++++++++---------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/example/dose_response.csv b/example/dose_response.csv index c56d803..f94c80c 100644 --- a/example/dose_response.csv +++ b/example/dose_response.csv @@ -1,4 +1,4 @@ -pApB,obsA,obsB,obsC +dose,obsA,obsB,obsC 3, 1, 2, 3 5, 4, 5, 6 7, 7, 8, 9 diff --git a/src/petab_gui/controllers/table_controllers.py b/src/petab_gui/controllers/table_controllers.py index bf0b69f..db0ad09 100644 --- a/src/petab_gui/controllers/table_controllers.py +++ b/src/petab_gui/controllers/table_controllers.py @@ -776,27 +776,26 @@ def _rank_dose_candidates(self, df) -> list[str]: re.IGNORECASE, ) scores = {} - # FIXME: https://github.com/PaulJonasJost/PEtab_GUI/issues/159 for col in df.columns: # noqa: B007 s = 0.0 - if patt.search(col or ""): - s += 2.0 - try: - if df[col].dtype.kind in "if": - s += 1.0 - uniq = df[col].nunique(dropna=True) - if 2 <= uniq <= 30: - s += 0.8 - if np.all(pd.to_numeric(df[col], errors="coerce").fillna(0) >= 0): - s += 0.3 - ser = pd.to_numeric(df[col], errors="coerce").dropna() - if len(ser) >= 5: - diffs = np.diff(ser.values) - if np.mean(diffs >= 0) >= 0.7: - s += 0.2 - except Exception: - pass - scores[col] = s + if patt.search(col or ""): + s += 2.0 + try: + if df[col].dtype.kind in "if": + s += 1.0 + uniq = df[col].nunique(dropna=True) + if 2 <= uniq <= 30: + s += 0.8 + if np.all(pd.to_numeric(df[col], errors="coerce").fillna(0) >= 0): + s += 0.3 + ser = pd.to_numeric(df[col], errors="coerce").dropna() + if len(ser) >= 5: + diffs = np.diff(ser.values) + if np.mean(diffs >= 0) >= 0.7: + s += 0.2 + except Exception: + pass + scores[col] = s return [ c for c, _ in sorted( From 
c96326d999346e02ca1c069244da043319d176ee Mon Sep 17 00:00:00 2001 From: PaulJonasJost Date: Tue, 23 Sep 2025 16:38:14 +0200 Subject: [PATCH 2/5] Reverted example changes --- example/dose_response.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/dose_response.csv b/example/dose_response.csv index f94c80c..c56d803 100644 --- a/example/dose_response.csv +++ b/example/dose_response.csv @@ -1,4 +1,4 @@ -dose,obsA,obsB,obsC +pApB,obsA,obsB,obsC 3, 1, 2, 3 5, 4, 5, 6 7, 7, 8, 9 From a5475db5518892939a23cb9f0724dc73de4ede13 Mon Sep 17 00:00:00 2001 From: PaulJonasJost Date: Tue, 23 Sep 2025 16:42:09 +0200 Subject: [PATCH 3/5] Ruff adherence --- src/petab_gui/controllers/table_controllers.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/petab_gui/controllers/table_controllers.py b/src/petab_gui/controllers/table_controllers.py index db0ad09..a267778 100644 --- a/src/petab_gui/controllers/table_controllers.py +++ b/src/petab_gui/controllers/table_controllers.py @@ -770,7 +770,7 @@ def _detect_time_column(self, df) -> str | None: return None def _rank_dose_candidates(self, df) -> list[str]: - """Lightweight ranking of dose-like columns (regex + numeric + cardinality).""" + """Lightweight ranking of dose-like columns.""" patt = re.compile( r"\b(dose|conc|concentration|drug|compound|stim|input|u\d+)\b", re.IGNORECASE, @@ -786,7 +786,9 @@ def _rank_dose_candidates(self, df) -> list[str]: uniq = df[col].nunique(dropna=True) if 2 <= uniq <= 30: s += 0.8 - if np.all(pd.to_numeric(df[col], errors="coerce").fillna(0) >= 0): + if np.all( + pd.to_numeric(df[col], errors="coerce").fillna(0) >= 0 + ): s += 0.3 ser = pd.to_numeric(df[col], errors="coerce").dropna() if len(ser) >= 5: @@ -805,7 +807,7 @@ def _rank_dose_candidates(self, df) -> list[str]: ] def _resolve_dose_and_time(self, df) -> tuple[str | None, str | None, str]: - """Open dialog with ranked dose suggestions and time choices (incl. 
steady state).""" + """Open dialog with ranked dose suggestions and time choices.""" header_key = str(hash(tuple(df.columns))) settings = settings_manager.settings # TODO: rename settings location From 4519c8600b9142adaf461808376676e9b4b1af33 Mon Sep 17 00:00:00 2001 From: Paul Jonas Jost <70631928+PaulJonasJost@users.noreply.github.com> Date: Wed, 24 Sep 2025 09:51:40 +0200 Subject: [PATCH 4/5] Update src/petab_gui/controllers/table_controllers.py Co-authored-by: Daniel Weindl --- src/petab_gui/controllers/table_controllers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/petab_gui/controllers/table_controllers.py b/src/petab_gui/controllers/table_controllers.py index a267778..f9e3575 100644 --- a/src/petab_gui/controllers/table_controllers.py +++ b/src/petab_gui/controllers/table_controllers.py @@ -776,7 +776,7 @@ def _rank_dose_candidates(self, df) -> list[str]: re.IGNORECASE, ) scores = {} - for col in df.columns: # noqa: B007 + for col in df.columns: s = 0.0 if patt.search(col or ""): s += 2.0 From 18b1cf8fd53012afc17c95a75fe972b9b544e31c Mon Sep 17 00:00:00 2001 From: PaulJonasJost Date: Wed, 24 Sep 2025 09:58:32 +0200 Subject: [PATCH 5/5] added documentation --- .../controllers/table_controllers.py | 94 +++++++++++++++++-- 1 file changed, 86 insertions(+), 8 deletions(-) diff --git a/src/petab_gui/controllers/table_controllers.py b/src/petab_gui/controllers/table_controllers.py index a267778..3287e9a 100644 --- a/src/petab_gui/controllers/table_controllers.py +++ b/src/petab_gui/controllers/table_controllers.py @@ -769,35 +769,113 @@ def _detect_time_column(self, df) -> str | None: return c return None - def _rank_dose_candidates(self, df) -> list[str]: - """Lightweight ranking of dose-like columns.""" + def _rank_dose_candidates(self, df: pd.DataFrame) -> list[str]: + """Rank DataFrame columns by likelihood of containing dose/concentration data. 
+ + This method implements a lightweight scoring system to identify and rank + columns that are most likely to contain dose, concentration, or drug-related + data. The ranking is based on multiple heuristics including column naming + patterns, data types, value ranges, and statistical properties. + + Parameters + ---------- + df : pd.DataFrame + Input DataFrame containing columns to be evaluated and ranked. + Must contain at least one column with data. + + Returns + ------- + list[str] + Column names sorted by descending likelihood of containing dose data. + Columns with higher scores appear first. In case of tied scores, + columns with fewer unique values are ranked higher. + + Notes + ----- + The scoring algorithm considers the following criteria: + + - **Name matching** (+2.0 points): Column names containing keywords like + 'dose', 'conc', 'concentration', 'drug', 'compound', 'stim', 'input', + or patterns like 'u1', 'u2' ('u' followed by digits; case-insensitive). + + - **Numeric data type** (+1.0 points): Columns with integer or float dtype. + + - **Reasonable cardinality** (+0.8 points): Columns with 2-30 unique + non-null values, which is typical for dose series. + + - **Non-negative values** (+0.3 points): All values are >= 0 when converted + to numeric (dose/concentration values are typically non-negative). + + - **Monotonic tendency** (+0.2 points): At least 70% of consecutive numeric + differences are non-negative (i.e. the values are mostly non-decreasing), + indicating potential dose escalation + patterns. Requires at least 5 non-null numeric values. + + Raises + ------ + AttributeError + If df does not have the expected pandas DataFrame interface. + + ValueError + If df is empty or contains no valid columns for evaluation. 
+ + See Also + -------- + pandas.DataFrame.nunique : Count unique values in each column + pandas.to_numeric : Convert argument to numeric type + numpy.diff : Calculate discrete differences along array + + Warning + ------- + This function uses broad exception handling to ensure robustness when + processing diverse data types. Individual column evaluation errors are + silently ignored to prevent failure on edge cases like mixed data types + or missing values. + """ + # Compile pattern for dose-related column names patt = re.compile( r"\b(dose|conc|concentration|drug|compound|stim|input|u\d+)\b", re.IGNORECASE, ) - scores = {} - for col in df.columns: # noqa: B007 + + scores: dict[str, float] = {} + + for col in df.columns: s = 0.0 + + # Score based on column name pattern matching if patt.search(col or ""): s += 2.0 + try: - if df[col].dtype.kind in "if": + # Score based on data type (numeric preferred) + if df[col].dtype.kind in "if": # integer or float s += 1.0 + + # Score based on reasonable number of unique values uniq = df[col].nunique(dropna=True) - if 2 <= uniq <= 30: + if 2 <= uniq <= 30: # Reasonable range for dose series? s += 0.8 + + # Score based on non-negative values (typical for doses) if np.all( pd.to_numeric(df[col], errors="coerce").fillna(0) >= 0 ): s += 0.3 + + # Score based on monotonic tendency (dose escalation pattern) ser = pd.to_numeric(df[col], errors="coerce").dropna() if len(ser) >= 5: diffs = np.diff(ser.values) - if np.mean(diffs >= 0) >= 0.7: + if np.mean(diffs >= 0) >= 0.7: # 70% non-decreasing s += 0.2 - except Exception: + + except Exception: # noqa: S110 + # Silently handle any data processing errors pass + scores[col] = s + + # Sort by score (descending) then by unique count (ascending) for ties return [ c for c, _ in sorted(