From be247cd046c089a6d526b69430078fec1578bd4d Mon Sep 17 00:00:00 2001 From: YousefZahran1 Date: Fri, 1 May 2026 00:12:25 +0300 Subject: [PATCH 1/2] feat: add zero_division parameter to F1 metric MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sklearn.metrics.f1_score supports zero_division to control the value returned when a label has no predicted or true samples (UndefinedMetricWarning case). The evaluate F1 metric did not expose this argument, causing a TypeError for callers who tried to pass it — even though sklearn's own warning message tells them to do exactly that. precision and recall already accept zero_division; this brings F1 into parity. Default value is 'warn' to preserve backward compatibility. Adds Example 6 to _KWARGS_DESCRIPTION demonstrating the parameter. Fixes #699 --- metrics/f1/f1.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/metrics/f1/f1.py b/metrics/f1/f1.py index 05b0baad..353dc7b7 100644 --- a/metrics/f1/f1.py +++ b/metrics/f1/f1.py @@ -39,6 +39,11 @@ - 'weighted': Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters `'macro'` to account for label imbalance. This option can result in an F-score that is not between precision and recall. - 'samples': Calculate metrics for each instance, and find their average (only meaningful for multilabel classification). sample_weight (`list` of `float`): Sample weights Defaults to None. + zero_division (`int` or `string`): Sets the value to return when there is a zero division. Defaults to `'warn'`. + + - 0: Returns 0 when there is a zero division. + - 1: Returns 1 when there is a zero division. + - `'warn'`: Raises a warning and then returns 0 when there is a zero division. Returns: f1 (`float` or `array` of `float`): F1 score or list of f1 scores, depending on the value passed to `average`. Minimum possible value is 0. 
Maximum possible value is 1. Higher f1 scores are better. @@ -84,6 +89,13 @@ >>> results = f1_metric.compute(predictions=[[0, 1, 1], [1, 1, 0]], references=[[0, 1, 1], [0, 1, 0]], average="macro") >>> print(round(results['f1'], 2)) 0.67 + + Example 6-A multiclass example with `zero_division` set to `1`: label 3, which has neither predicted nor true samples, scores 1.0 instead of 0.0, while labels 1 and 2 (true samples but no predictions) still score 0.0. + >>> predictions = [0, 0, 0, 0, 0] + >>> references = [0, 1, 0, 1, 2] + >>> results = f1_metric.compute(predictions=predictions, references=references, average=None, labels=[0, 1, 2, 3], zero_division=1) + >>> print([round(res, 2) for res in results['f1']]) + [0.57, 0.0, 0.0, 1.0] """ @@ -123,8 +135,23 @@ def _info(self): reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html"], ) - def _compute(self, predictions, references, labels=None, pos_label=1, average="binary", sample_weight=None): + def _compute( + self, + predictions, + references, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn", + ): score = f1_score( - references, predictions, labels=labels, pos_label=pos_label, average=average, sample_weight=sample_weight + references, + predictions, + labels=labels, + pos_label=pos_label, + average=average, + sample_weight=sample_weight, + zero_division=zero_division, ) return {"f1": score if getattr(score, "size", 1) > 1 else float(score)} From 5811e9a1bcd8379f37db2e120d660483887c7891 Mon Sep 17 00:00:00 2001 From: YousefZahran1 Date: Thu, 7 May 2026 18:31:03 +0300 Subject: [PATCH 2/2] refactor: use **kwargs in _compute to pass all args to sklearn --- metrics/f1/f1.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/metrics/f1/f1.py b/metrics/f1/f1.py index 353dc7b7..34a31f01 100644 --- a/metrics/f1/f1.py +++ b/metrics/f1/f1.py @@ -39,7 +39,7 @@ - 'weighted': Calculate metrics for each label, and find their average weighted by support (the number of true instances for each 
label). This alters `'macro'` to account for label imbalance. This option can result in an F-score that is not between precision and recall. - 'samples': Calculate metrics for each instance, and find their average (only meaningful for multilabel classification). sample_weight (`list` of `float`): Sample weights Defaults to None. - zero_division (`int` or `string`): Sets the value to return when there is a zero division. Defaults to `'warn'`. + zero_division (`int` or `"warn"`, optional): Passed directly to sklearn's `f1_score`. Sets the value returned when a zero division occurs, i.e. when a label has no predicted or true samples. Accepts `0`, `1`, or `"warn"`; if omitted, sklearn's default (`"warn"`) applies. - 0: Returns 0 when there is a zero division. - 1: Returns 1 when there is a zero division. @@ -140,18 +140,12 @@ def _compute( predictions, references, labels=None, - pos_label=1, - average="binary", - sample_weight=None, - zero_division="warn", + **kwargs, ): score = f1_score( references, predictions, labels=labels, - pos_label=pos_label, - average=average, - sample_weight=sample_weight, - zero_division=zero_division, + **kwargs, ) return {"f1": score if getattr(score, "size", 1) > 1 else float(score)}