Skip to content

Commit efe0f5c

Browse files
authored
Merge pull request #144 from ComplexData-MILA/IS_139_logits_integration
distribution loading for bijean's experiment
2 parents 15f4803 + 87408bd commit efe0f5c

5 files changed

Lines changed: 4556 additions & 2 deletions

File tree

app/api/endpoints/analysis_endpoints.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from app.schemas.analysis_schema import AnalysisRead
2020
from app.core.exceptions import NotFoundException
2121
from fastapi.responses import StreamingResponse
22+
from app.core.scoring import get_percentile
2223

2324
router = APIRouter(prefix="/analysis", tags=["analysis"])
2425
logger = logging.getLogger(__name__)
@@ -41,7 +42,11 @@ async def get_analysis(
4142
analysis = await analysis_service.get_analysis(
4243
analysis_id=analysis_id, include_sources=include_sources, include_feedback=include_feedback
4344
)
44-
return AnalysisRead.model_validate(analysis)
45+
raw_score = analysis.confidence_score
46+
percentile = (get_percentile(raw_score)) / 100.0
47+
analysis = AnalysisRead.model_validate(analysis)
48+
analysis.confidence_percentile = percentile
49+
return analysis
4550
except NotFoundException as e:
4651
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(e))
4752

app/core/scoring.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import pandas as pd
2+
import numpy as np
3+
import logging
4+
import os
5+
6+
logger = logging.getLogger(__name__)

# Reference distribution of confidence scores, kept in RAM as an ascending
# float array so get_percentile() can binary-search it.
_REFERENCE_SCORES = np.array([])
# True only once load_distribution() has published a sorted array above.
_IS_LOADED = False


def load_distribution(csv_path: str) -> None:
    """Load the reference score distribution from a CSV file into memory.

    Reads the ``confidence_score`` column of ``csv_path``, drops missing
    values, sorts the remainder ascending and publishes the result in the
    module-level state used by ``get_percentile``.

    The function never raises: a missing file is logged as a warning and any
    other failure is logged as an error. In both cases the previously loaded
    distribution (if any) is left untouched, so the app keeps running with
    whatever normalization data it already had.

    Args:
        csv_path: Path to a CSV file containing a ``confidence_score`` column.
    """
    global _REFERENCE_SCORES, _IS_LOADED

    if not os.path.exists(csv_path):
        logger.warning(f"Distribution file not found at {csv_path}. Skipping preload.")
        return  # App continues running, just without normalization

    try:
        logger.info(f"Loading distribution from {csv_path}...")
        df = pd.read_csv(csv_path, usecols=["confidence_score"])
        # np.sort returns a sorted copy. Sorting ``.values`` in place fails
        # when pandas hands back a read-only array (Copy-on-Write), and the
        # float conversion rejects non-numeric columns here rather than at
        # request time.
        scores = np.sort(df["confidence_score"].dropna().to_numpy(dtype=float))
    except Exception as e:
        # Deliberate best-effort: log (with traceback) and keep prior state.
        logger.error(f"ERROR: Failed to load distribution: {e}", exc_info=True)
        return

    # Publish only after everything succeeded, so readers never observe an
    # unsorted or partially prepared array.
    _REFERENCE_SCORES = scores
    _IS_LOADED = True
    logger.info("Distribution loaded into RAM.")


def get_percentile(score: float) -> float:
    """Return the percentile rank of ``score`` within the loaded distribution.

    The rank is the percentage of reference scores strictly below ``score``
    (``side="left"``), in the range 0.0 to 100.0.

    Args:
        score: Raw confidence score to rank. ``None`` and NaN are accepted.

    Returns:
        The percentile as a plain ``float``. Returns 0.0 when no distribution
        is loaded, when the distribution is empty, or when ``score`` is
        ``None`` or NaN (such a score has no meaningful rank).
    """
    if not _IS_LOADED or len(_REFERENCE_SCORES) == 0:
        return 0.0

    # A missing score would make searchsorted raise; a NaN would sort past
    # every value and be reported as the 100th percentile.
    if score is None or np.isnan(score):
        return 0.0

    # Fast binary search
    idx = np.searchsorted(_REFERENCE_SCORES, score, side="left")
    return float(idx / len(_REFERENCE_SCORES) * 100)

0 commit comments

Comments
 (0)