ComplexData-MILA
diff --git a/‎app/api/endpoints/analysis_endpoints.py‎
Lines changed: 6 additions & 1 deletion b/‎app/api/endpoints/analysis_endpoints.py‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎app/core/scoring.py‎
Lines changed: 40 additions & 0 deletions b/‎app/core/scoring.py‎
Lines changed: 40 additions & 0 deletions
@@ -19,6 +19,7 @@
 from app.schemas.analysis_schema import AnalysisRead
 from app.core.exceptions import NotFoundException
 from fastapi.responses import StreamingResponse
+from app.core.scoring import get_percentile
 
 router = APIRouter(prefix="/analysis", tags=["analysis"])
 logger = logging.getLogger(__name__)
@@ -41,7 +42,11 @@ async def get_analysis(
         analysis = await analysis_service.get_analysis(
             analysis_id=analysis_id, include_sources=include_sources, include_feedback=include_feedback
         )
-        return AnalysisRead.model_validate(analysis)
+        raw_score = analysis.confidence_score
+        percentile = (get_percentile(raw_score)) / 100.0
+        analysis = AnalysisRead.model_validate(analysis)
+        analysis.confidence_percentile = percentile
+        return analysis
     except NotFoundException as e:
         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(e))
 
 
@@ -0,0 +1,40 @@
+import pandas as pd
+import numpy as np
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
# Reference distribution kept in memory for percentile lookups.
# _REFERENCE_SCORES is replaced wholesale by load_distribution() with a sorted
# 1-D float array; _IS_LOADED records whether a load has ever succeeded.
_REFERENCE_SCORES = np.array([])
_IS_LOADED = False


def load_distribution(csv_path: str) -> None:
    """Load the reference score distribution from a CSV file into memory.

    Reads the ``confidence_score`` column, drops missing values, and stores a
    sorted copy in the module-level ``_REFERENCE_SCORES`` array.  This is a
    best-effort operation: a missing file or any read/parse failure is logged
    and the function returns without raising, leaving whatever distribution
    was previously loaded (if any) untouched.

    Args:
        csv_path: Path to a CSV file containing a ``confidence_score`` column.
    """
    global _REFERENCE_SCORES, _IS_LOADED

    if not os.path.exists(csv_path):
        logger.warning("Distribution file not found at %s. Skipping preload.", csv_path)
        return  # App continues running, just without normalization

    try:
        logger.info("Loading distribution from %s...", csv_path)
        df = pd.read_csv(csv_path, usecols=["confidence_score"])
        # np.sort returns a new sorted array.  Sorting in place is avoided
        # because the array pandas hands back may be a read-only view when
        # Copy-on-Write is active, in which case ndarray.sort() raises.
        scores = np.sort(df["confidence_score"].dropna().to_numpy(dtype=float))
    except Exception as e:
        # Deliberately broad: a bad distribution file must not crash the app.
        logger.error("ERROR: Failed to load distribution: %s", e)
        return

    # Publish the new state only after everything succeeded, so a failed
    # reload can never leave unsorted or partial data behind.
    _REFERENCE_SCORES = scores
    _IS_LOADED = True
    logger.info("Distribution loaded into RAM.")
+
+
def get_percentile(score: float) -> float:
    """Return the percentile rank (0-100) of ``score`` in the reference data.

    The rank is the percentage of reference scores strictly below ``score``.

    Args:
        score: Raw score to rank.  ``None``, NaN, or a value that cannot be
            converted to ``float`` is treated as "no score".

    Returns:
        The percentile as a plain ``float``.  Returns ``0.0`` when no
        reference distribution is loaded, when it is empty, or when ``score``
        is missing/NaN.
    """
    if not _IS_LOADED or len(_REFERENCE_SCORES) == 0:
        return 0.0

    try:
        value = float(score)
    except (TypeError, ValueError):
        # e.g. a nullable score that is None: fall back instead of raising.
        return 0.0
    if np.isnan(value):
        # searchsorted would place NaN after every value and report 100.
        return 0.0

    # Binary search on the sorted array; side="left" counts values < score.
    idx = np.searchsorted(_REFERENCE_SCORES, value, side="left")
    return float((idx / len(_REFERENCE_SCORES)) * 100)