Skip to content

Commit 635b467

Browse files
fix CC with categorical labels
1 parent 9a406f4 commit 635b467

3 files changed

Lines changed: 38 additions & 10 deletions

File tree

mlquantify/adjust_counting/_adjustment.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ def _adjust(self, predictions, train_y_scores, train_y_values):
105105
thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores)
106106
threshold, tpr, fpr = self.get_best_threshold(thresholds, tprs, fprs)
107107

108-
cc_predictions = CC(threshold=threshold).aggregate(predictions, train_y_values)[1]
108+
cc_predictions = CC(threshold=threshold).aggregate(predictions, train_y_values)
109+
cc_predictions = list(cc_predictions.values())[1]
109110

110111
if tpr - fpr == 0:
111112
prevalence = cc_predictions
@@ -609,7 +610,7 @@ def _adjust(self, predictions, train_y_scores, train_y_values):
609610
prevs = []
610611
for thr, tpr, fpr in zip(thresholds, tprs, fprs):
611612
cc_predictions = CC(threshold=thr).aggregate(predictions, train_y_values)
612-
cc_predictions = cc_predictions[1]
613+
cc_predictions = list(cc_predictions.values())[1]
613614

614615
if tpr - fpr == 0:
615616
prevalence = cc_predictions

mlquantify/adjust_counting/_counting.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,14 +76,15 @@ def __init__(self, learner=None, threshold=0.5):
7676
self.threshold = threshold
7777

7878
def aggregate(self, predictions, train_y_values=None):
    """Count crisp predictions per class and return their relative frequencies.

    Parameters
    ----------
    predictions : array-like
        Raw model outputs; converted to crisp labels (using
        ``self.threshold`` and, when given, the training label set).
    train_y_values : array-like, optional
        Training labels whose unique values define the class set. When
        None, the class set is taken from the predictions themselves.

    Returns
    -------
    Validated prevalence estimates, one per class in ``self.classes_``.
    """
    predictions = validate_predictions(self, predictions, self.threshold, train_y_values)

    # Without training labels, fall back to the label set observed in the
    # crisp predictions.
    labels = train_y_values if train_y_values is not None else np.unique(predictions)
    self.classes_ = check_classes_attribute(self, np.unique(labels))

    counts = np.array([np.count_nonzero(predictions == cls) for cls in self.classes_])
    prevalences = counts / len(predictions)

    return validate_prevalences(self, prevalences, self.classes_)
8990

mlquantify/utils/_validation.py

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -96,23 +96,49 @@ def validate_y(quantifier: Any, y: np.ndarray) -> None:
9696

9797
def _get_valid_crisp_predictions(predictions, threshold=0.5):
9898
predictions = np.asarray(predictions)
99-
10099
dimensions = predictions.ndim
101100

101+
if train_y_values is not None:
102+
classes = np.unique(train_y_values)
103+
else:
104+
classes = None
105+
102106
if dimensions > 2:
103-
predictions = np.argmax(predictions, axis=1)
107+
# Assuming the last dimension contains class probabilities
108+
crisp_indices = np.argmax(predictions, axis=-1)
109+
if classes is not None:
110+
predictions = classes[crisp_indices]
111+
else:
112+
predictions = crisp_indices
104113
elif dimensions == 2:
105-
predictions = (predictions[:, 1] >= threshold).astype(int)
114+
# Binary or multi-class probabilities (N, C)
115+
if classes is not None and len(classes) == 2:
116+
# Binary case with explicit classes
117+
predictions = np.where(predictions[:, 1] >= threshold, classes[1], classes[0])
118+
elif classes is not None and len(classes) > 2:
119+
# Multi-class case with explicit classes
120+
crisp_indices = np.argmax(predictions, axis=1)
121+
predictions = classes[crisp_indices]
122+
else:
123+
# Default binary (0 or 1) or multi-class (0 to C-1)
124+
if predictions.shape[1] == 2:
125+
predictions = (predictions[:, 1] >= threshold).astype(int)
126+
else:
127+
predictions = np.argmax(predictions, axis=1)
106128
elif dimensions == 1:
129+
# 1D probabilities (e.g., probability of positive class)
107130
if np.issubdtype(predictions.dtype, np.floating):
108-
predictions = (predictions >= threshold).astype(int)
131+
if classes is not None and len(classes) == 2:
132+
predictions = np.where(predictions >= threshold, classes[1], classes[0])
133+
else:
134+
predictions = (predictions >= threshold).astype(int)
109135
else:
110136
raise ValueError(f"Predictions array has an invalid number of dimensions. Expected 1 or more dimensions, got {predictions.ndim}.")
111137

112138
return predictions
113139

114140

115-
def validate_predictions(quantifier: Any, predictions: np.ndarray, threshold: float = 0.5) -> np.ndarray:
141+
def validate_predictions(quantifier: Any, predictions: np.ndarray, threshold: float = 0.5, train_y_values=None) -> np.ndarray:
116142
"""
117143
Validate predictions using the quantifier's declared output tags.
118144
Raises InputValidationError if inconsistent with tags.
@@ -132,7 +158,7 @@ def validate_predictions(quantifier: Any, predictions: np.ndarray, threshold: fl
132158
f"Soft predictions for {quantifier.__class__.__name__} must be float, got dtype {predictions.dtype}."
133159
)
134160
elif estimator_type == "crisp" and np.issubdtype(predictions.dtype, np.floating):
135-
predictions = _get_valid_crisp_predictions(predictions, threshold)
161+
predictions = _get_valid_crisp_predictions(predictions, train_y_values, threshold)
136162
return predictions
137163

138164

0 commit comments

Comments
 (0)