Skip to content

Commit e11b47c

Browse files
Dose-Response Scoring (#179)
* Closes #159. Adjusts example for better reflection.
* Reverted example changes
* Ruff adherence
* Update src/petab_gui/controllers/table_controllers.py

  Co-authored-by: Daniel Weindl <dweindl@users.noreply.github.com>
* Added documentation

---------

Co-authored-by: Daniel Weindl <dweindl@users.noreply.github.com>
1 parent 41d48f4 commit e11b47c

1 file changed

Lines changed: 102 additions & 24 deletions

File tree

src/petab_gui/controllers/table_controllers.py

Lines changed: 102 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -775,34 +775,112 @@ def _detect_time_column(self, df) -> str | None:
775775
return c
776776
return None
777777

778-
def _rank_dose_candidates(self, df) -> list[str]:
779-
"""Lightweight ranking of dose-like columns (regex + numeric + cardinality)."""
778+
def _rank_dose_candidates(self, df: pd.DataFrame) -> list[str]:
779+
"""Rank DataFrame columns by likelihood of containing dose/concentration data.
780+
781+
This method implements a lightweight scoring system to identify and rank
782+
columns that are most likely to contain dose, concentration, or drug-related
783+
data. The ranking is based on multiple heuristics including column naming
784+
patterns, data types, value ranges, and statistical properties.
785+
786+
Parameters
787+
----------
788+
df : pd.DataFrame
789+
Input DataFrame containing columns to be evaluated and ranked.
790+
Must contain at least one column with data.
791+
792+
Returns
793+
-------
794+
list[str]
795+
Column names sorted by descending likelihood of containing dose data.
796+
Columns with higher scores appear first. In case of tied scores,
797+
columns with fewer unique values are ranked higher.
798+
799+
Notes
800+
-----
801+
The scoring algorithm considers the following criteria:
802+
803+
- **Name matching** (+2.0 points): Column names containing keywords like
804+
'dose', 'conc', 'concentration', 'drug', 'compound', 'stim', 'input',
805+
or patterns like 'u<digit>' (case-insensitive).
806+
807+
- **Numeric data type** (+1.0 points): Columns with integer or float dtype.
808+
809+
- **Reasonable cardinality** (+0.8 points): Columns with 2-30 unique
810+
non-null values, which is typical for dose series.
811+
812+
- **Non-negative values** (+0.3 points): All values are >= 0 when converted
813+
to numeric (dose/concentration values are typically non-negative).
814+
815+
- **Monotonic tendency** (+0.2 points): At least 70% of consecutive numeric
816+
differences are non-negative (i.e. values are non-decreasing), indicating potential dose escalation
817+
patterns. Requires at least 5 non-null numeric values.
818+
819+
Raises
820+
------
821+
AttributeError
822+
If df does not have the expected pandas DataFrame interface.
823+
824+
ValueError
825+
If df is empty or contains no valid columns for evaluation.
826+
827+
See Also
828+
--------
829+
pandas.DataFrame.nunique : Count unique values in each column
830+
pandas.to_numeric : Convert argument to numeric type
831+
numpy.diff : Calculate discrete differences along array
832+
833+
Warning
834+
-------
835+
This function uses broad exception handling to ensure robustness when
836+
processing diverse data types. Individual column evaluation errors are
837+
silently ignored to prevent failure on edge cases like mixed data types
838+
or missing values.
839+
"""
840+
# Compile pattern for dose-related column names
780841
patt = re.compile(
781842
r"\b(dose|conc|concentration|drug|compound|stim|input|u\d+)\b",
782843
re.IGNORECASE,
783844
)
784-
scores = {}
785-
# FIXME: https://github.com/PaulJonasJost/PEtab_GUI/issues/159
786-
for col in df.columns: # noqa: B007
845+
846+
scores: dict[str, float] = {}
847+
for col in df.columns:
787848
s = 0.0
788-
if patt.search(col or ""):
789-
s += 2.0
790-
try:
791-
if df[col].dtype.kind in "if":
792-
s += 1.0
793-
uniq = df[col].nunique(dropna=True)
794-
if 2 <= uniq <= 30:
795-
s += 0.8
796-
if np.all(pd.to_numeric(df[col], errors="coerce").fillna(0) >= 0):
797-
s += 0.3
798-
ser = pd.to_numeric(df[col], errors="coerce").dropna()
799-
if len(ser) >= 5:
800-
diffs = np.diff(ser.values)
801-
if np.mean(diffs >= 0) >= 0.7:
802-
s += 0.2
803-
except Exception:
804-
pass
805-
scores[col] = s
849+
850+
# Score based on column name pattern matching
851+
if patt.search(col or ""):
852+
s += 2.0
853+
854+
try:
855+
# Score based on data type (numeric preferred)
856+
if df[col].dtype.kind in "if": # integer or float
857+
s += 1.0
858+
859+
# Score based on reasonable number of unique values
860+
uniq = df[col].nunique(dropna=True)
861+
if 2 <= uniq <= 30: # Reasonable range for dose series?
862+
s += 0.8
863+
864+
# Score based on non-negative values (typical for doses)
865+
if np.all(
866+
pd.to_numeric(df[col], errors="coerce").fillna(0) >= 0
867+
):
868+
s += 0.3
869+
870+
# Score based on monotonic tendency (dose escalation pattern)
871+
ser = pd.to_numeric(df[col], errors="coerce").dropna()
872+
if len(ser) >= 5:
873+
diffs = np.diff(ser.values)
874+
if np.mean(diffs >= 0) >= 0.7: # 70% non-decreasing
875+
s += 0.2
876+
877+
except Exception: # noqa: S110
878+
# Silently handle any data processing errors
879+
pass
880+
881+
scores[col] = s
882+
883+
# Sort by score (descending) then by unique count (ascending) for ties
806884
return [
807885
c
808886
for c, _ in sorted(
@@ -812,7 +890,7 @@ def _rank_dose_candidates(self, df) -> list[str]:
812890
]
813891

814892
def _resolve_dose_and_time(self, df) -> tuple[str | None, str | None, str]:
815-
"""Open dialog with ranked dose suggestions and time choices (incl. steady state)."""
893+
"""Open dialog with ranked dose suggestions and time choices."""
816894
header_key = str(hash(tuple(df.columns)))
817895
settings = settings_manager.settings
818896
# TODO: rename settings location

0 commit comments

Comments
 (0)