|
15 | 15 | from ..core.deps import get_current_user |
16 | 16 | from ..core.rate_limit import limiter |
17 | 17 | from ..models.db_models import User, Dataset, DatasetFile, ProjectDataset, Project, DatasetVersion |
18 | | -from ..models.schemas import DatasetResponse, DatasetUpdate, DatasetFileRef |
| 18 | +from ..models.schemas import DatasetResponse, DatasetUpdate, DatasetFileRef, YFromMetadataRequest |
19 | 19 | from ..services import storage, audit |
20 | 20 |
|
21 | 21 | _log = logging.getLogger(__name__) |
@@ -554,6 +554,236 @@ async def preview_file( |
554 | 554 | } |
555 | 555 |
|
556 | 556 |
|
| 557 | +# --------------------------------------------------------------------------- |
| 558 | +# Metadata column inspection & y-from-metadata |
| 559 | +# --------------------------------------------------------------------------- |
| 560 | + |
| 561 | +def _find_metadata_file(files) -> Optional[Path]: |
| 562 | + """Find the metadata file among dataset files.""" |
| 563 | + for f in files: |
| 564 | + name = f.filename.lower() |
| 565 | + if f.role == "metadata" or "metadata" in name or "meta" in name: |
| 566 | + p = Path(f.disk_path) |
| 567 | + if p.exists(): |
| 568 | + return p |
| 569 | + return None |
| 570 | + |
| 571 | + |
| 572 | +def _parse_metadata_columns(meta_path: Path) -> list[dict]: |
| 573 | + """Parse a metadata TSV and return column descriptors with types and stats.""" |
| 574 | + sample = meta_path.read_text(errors="replace")[:4096] |
| 575 | + delimiter = "\t" if "\t" in sample else "," |
| 576 | + |
| 577 | + all_rows = [] |
| 578 | + with open(meta_path, "r", errors="replace") as f: |
| 579 | + reader = csv.reader(f, delimiter=delimiter) |
| 580 | + for line in reader: |
| 581 | + all_rows.append(line) |
| 582 | + |
| 583 | + if len(all_rows) < 2: |
| 584 | + return [] |
| 585 | + |
| 586 | + header = all_rows[0] |
| 587 | + data_rows = all_rows[1:] |
| 588 | + columns = [] |
| 589 | + |
| 590 | + for col_idx, col_name in enumerate(header): |
| 591 | + if col_idx == 0: |
| 592 | + continue # skip sample ID column |
| 593 | + values = [] |
| 594 | + for row in data_rows: |
| 595 | + if col_idx < len(row) and row[col_idx].strip(): |
| 596 | + values.append(row[col_idx].strip()) |
| 597 | + |
| 598 | + if not values: |
| 599 | + continue |
| 600 | + |
| 601 | + # Try to detect numeric vs categorical |
| 602 | + numeric_vals = [] |
| 603 | + for v in values: |
| 604 | + try: |
| 605 | + numeric_vals.append(float(v)) |
| 606 | + except (ValueError, TypeError): |
| 607 | + pass |
| 608 | + |
| 609 | + if len(numeric_vals) > len(values) * 0.8: |
| 610 | + # Numeric column |
| 611 | + columns.append({ |
| 612 | + "name": col_name, |
| 613 | + "type": "numeric", |
| 614 | + "min": round(min(numeric_vals), 6), |
| 615 | + "max": round(max(numeric_vals), 6), |
| 616 | + "n_values": len(numeric_vals), |
| 617 | + "n_missing": len(data_rows) - len(numeric_vals), |
| 618 | + }) |
| 619 | + else: |
| 620 | + # Categorical column |
| 621 | + unique_vals = sorted(set(values)) |
| 622 | + columns.append({ |
| 623 | + "name": col_name, |
| 624 | + "type": "categorical", |
| 625 | + "values": unique_vals[:50], # cap at 50 unique values |
| 626 | + "n_unique": len(unique_vals), |
| 627 | + "n_values": len(values), |
| 628 | + "n_missing": len(data_rows) - len(values), |
| 629 | + }) |
| 630 | + |
| 631 | + return columns |
| 632 | + |
| 633 | + |
@router.get("/{dataset_id}/metadata-columns")
async def get_metadata_columns(
    dataset_id: str,
    user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Return metadata column names, inferred types, and summary stats.

    Numeric columns are candidate regression targets; categorical columns are
    candidate classification targets.

    Raises 404 when the dataset does not exist (or is not owned by the
    caller) or when it contains no metadata file.
    """
    query = (
        select(Dataset)
        .where(Dataset.id == dataset_id, Dataset.user_id == user.id)
        .options(selectinload(Dataset.files))
    )
    dataset = (await db.execute(query)).scalar_one_or_none()
    if dataset is None:
        raise HTTPException(status_code=404, detail="Dataset not found")

    meta_path = _find_metadata_file(dataset.files)
    if meta_path is None:
        raise HTTPException(
            status_code=404,
            detail="No metadata file found in this dataset. Upload a file with role 'metadata'.",
        )

    return {"columns": _parse_metadata_columns(meta_path)}
| 662 | + |
| 663 | + |
def _sniff_delimiter(path) -> str:
    """Guess the delimiter of *path*: tab if one appears in the first 4 KiB, else comma."""
    # Bounded read — avoids loading a potentially huge matrix file into
    # memory just to detect the separator.
    with open(path, "r", errors="replace") as f:
        sample = f.read(4096)
    return "\t" if "\t" in sample else ","


@router.post("/{dataset_id}/y-from-metadata")
async def generate_y_from_metadata(
    dataset_id: str,
    body: YFromMetadataRequest,
    user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Generate a y file from a metadata column, matching samples with the X file.

    The requested column is aligned against the sample names of the matching
    X file (xtrain for ytrain, otherwise xtest), written as a two-column TSV,
    registered as a new dataset file, recorded in a version snapshot, and the
    dataset is auto-scanned when possible.

    Raises:
        HTTPException 404: dataset or metadata file not found.
        HTTPException 400: missing X file, empty metadata, unknown column,
            or no overlapping samples.
    """
    result = await db.execute(
        select(Dataset)
        .where(Dataset.id == dataset_id, Dataset.user_id == user.id)
        .options(selectinload(Dataset.files))
    )
    dataset = result.scalar_one_or_none()
    if not dataset:
        raise HTTPException(status_code=404, detail="Dataset not found")

    # Find metadata file
    meta_path = _find_metadata_file(dataset.files)
    if not meta_path:
        raise HTTPException(status_code=404, detail="No metadata file found in this dataset.")

    # The y role determines which X file supplies the sample ordering.
    x_role = "xtrain" if body.file_role == "ytrain" else "xtest"
    x_file = next((f for f in dataset.files if f.role == x_role), None)
    if not x_file or not Path(x_file.disk_path).exists():
        raise HTTPException(
            status_code=400,
            detail=f"No {x_role} file found. Upload an X file first.",
        )

    # Read metadata table (TSV or CSV).
    with open(meta_path, "r", errors="replace") as f:
        meta_rows = list(csv.reader(f, delimiter=_sniff_delimiter(meta_path)))

    if len(meta_rows) < 2:
        raise HTTPException(status_code=400, detail="Metadata file is empty or has no data rows.")

    meta_header = meta_rows[0]
    if body.column not in meta_header:
        raise HTTPException(
            status_code=400,
            detail=f"Column '{body.column}' not found in metadata. Available: {meta_header[1:]}",
        )
    col_idx = meta_header.index(body.column)

    # Sample ID (first column) -> value; rows with a blank ID or blank value
    # are skipped. Duplicate IDs keep the last occurrence.
    meta_map = {
        row[0].strip(): row[col_idx].strip()
        for row in meta_rows[1:]
        if len(row) > col_idx and row[0].strip() and row[col_idx].strip()
    }

    # Read only the X header row: with features in rows, sample names are the
    # column headers past the first cell.
    with open(x_file.disk_path, "r", errors="replace") as f:
        x_header = next(csv.reader(f, delimiter=_sniff_delimiter(x_file.disk_path)))
    x_sample_names = [s.strip() for s in x_header[1:]]

    # Match X samples against metadata, preserving X order.
    matched = {s: meta_map[s] for s in x_sample_names if s in meta_map}
    missing = [s for s in x_sample_names if s not in meta_map]

    if not matched:
        raise HTTPException(
            status_code=400,
            detail="No matching samples between X file and metadata.",
        )

    # Write y file as TSV: sample_id\tvalue, in X-file sample order.
    lines = ["sample_id\t" + body.column]
    lines.extend(f"{sample}\t{matched[sample]}" for sample in x_sample_names if sample in matched)
    y_content = "\n".join(lines) + "\n"

    # Register the file first so the flush assigns an id for the storage path,
    # then backfill disk_path once the bytes are persisted.
    filename = f"{body.file_role}_{body.column}.tsv"
    ds_file = DatasetFile(
        dataset_id=dataset.id,
        filename=filename,
        role=body.file_role,
        disk_path="",
    )
    db.add(ds_file)
    await db.flush()

    disk_path = storage.save_user_dataset_file(user.id, ds_file.id, filename, y_content.encode("utf-8"))
    ds_file.disk_path = disk_path

    # Fix: the note previously read "Generate (unknown) from metadata" —
    # record the actual role and source column in the version history.
    await _create_version_snapshot(
        db, dataset_id, user.id,
        note=f"Generate {body.file_role} from metadata column '{body.column}'",
    )

    # Auto-scan if xtrain + ytrain now present
    await _try_auto_scan(db, dataset)

    return {
        "file": DatasetFileRef(id=ds_file.id, filename=ds_file.filename, role=ds_file.role).model_dump(),
        "matched_samples": len(matched),
        "missing_samples": len(missing),
        "total_x_samples": len(x_sample_names),
    }
| 785 | + |
| 786 | + |
557 | 787 | # --------------------------------------------------------------------------- |
558 | 788 | # Project assignment |
559 | 789 | # --------------------------------------------------------------------------- |
|
0 commit comments