# Source: NumpyWDL/base_estimator.py (stasi009/NumpyWDL, master branch)
import numpy as np
# from sklearn.metrics import roc_auc_score, log_loss
from metrics import logloss as log_loss, auc as roc_auc_score
from tqdm import tqdm
import logging


class BaseEstimator:
    """Base class that drives epoch-wise training and evaluation.

    Subclasses provide the model by overriding :meth:`train_batch` and
    :meth:`predict`. This class only iterates over the batches handed out by
    the data source, gathers the predicted probabilities and turns them into
    logloss / AUC / accuracy metrics.
    """

    def __init__(self, data_source):
        """
        :param data_source: object exposing ``train_batches_per_epoch()`` and
            ``test_batches_per_epoch()``; each is iterated once per epoch and
            must yield ``(features, labels)`` pairs
        """
        self._data_source = data_source

    def get_metrics(self, scores, labels, prefix):
        """
        Summarise predictions against ground truth.

        :param scores: predicted probabilities, one per example
        :param labels: ground-truth labels, one per example (compared against
            predictions thresholded at 0.5, so presumably 0/1 -- confirm with
            the data source)
        :param prefix: string prepended to every metric name, e.g. 'train'
        :return: dict with keys '<prefix>_logloss', '<prefix>_auc' and
            '<prefix>_accuracy'
        :raises ValueError: if scores and labels differ in number of elements
        """
        # Flatten both sides: a (N, 1) column compared with a (N,) vector would
        # otherwise broadcast to (N, N) and silently yield a bogus accuracy.
        scores = np.asarray(scores).ravel()
        labels = np.asarray(labels).ravel()
        if scores.shape != labels.shape:
            raise ValueError(
                "scores and labels must have the same number of elements, "
                "got {} and {}".format(scores.size, labels.size))

        metrics = {
            '{}_logloss'.format(prefix): log_loss(y_true=labels, y_pred=scores),
            '{}_auc'.format(prefix): roc_auc_score(y_true=labels, y_score=scores),
        }

        # Fraction of examples whose thresholded prediction matches the label.
        # NOTE: with zero examples this is nan, as np.mean of an empty array is.
        pred_labels = (scores > 0.5).astype(int)
        metrics['{}_accuracy'.format(prefix)] = np.mean(pred_labels == labels)

        return metrics

    def train_batch(self, features, labels):
        """
        :param features: dict, field_name ==> dense matrix or SparseInput
        :param labels: [batch_size] ndarray
        :return: [batch_size] ndarray of predicted probabilities in that batch
        """
        raise NotImplementedError()

    def predict(self, features):
        """
        :param features: dict, field_name ==> dense matrix or SparseInput
        :return: [batch_size] ndarray of predicted probabilities in that batch
        """
        raise NotImplementedError()

    @staticmethod
    def _concat(chunks):
        """Join per-batch 1-D arrays into one array (empty if no batches)."""
        # np.concatenate rejects an empty sequence, so handle that explicitly.
        return np.concatenate(chunks) if chunks else np.empty(0)

    def _run_epoch(self, batch_stream, step, prefix):
        """
        Run ``step`` on every batch of ``batch_stream`` and score the epoch.

        :param batch_stream: iterable of (features, labels) batches
        :param step: callable ``(features, labels) -> probabilities``
        :param prefix: metric-name prefix forwarded to :meth:`get_metrics`
        :return: metrics dict for the whole epoch
        """
        score_chunks = []
        label_chunks = []

        for batch_features, batch_labels in tqdm(batch_stream):
            pred_probas = step(batch_features, batch_labels)

            # np.array copies, so the stored chunk is a snapshot even if the
            # model or data source later reuses its buffers. Keeping whole
            # arrays avoids boxing every element into a Python object.
            score_chunks.append(np.array(pred_probas).ravel())
            label_chunks.append(np.array(batch_labels).ravel())

        return self.get_metrics(scores=self._concat(score_chunks),
                                labels=self._concat(label_chunks),
                                prefix=prefix)

    def _train_epoch(self):
        """Train on every training batch once; return 'train_*' metrics."""
        return self._run_epoch(
            batch_stream=self._data_source.train_batches_per_epoch(),
            step=self.train_batch,
            prefix='train')

    def _eval_epoch(self):
        """Predict on every test batch once; return 'test_*' metrics."""
        return self._run_epoch(
            batch_stream=self._data_source.test_batches_per_epoch(),
            # Evaluation never shows the labels to the model.
            step=lambda features, _labels: self.predict(features),
            prefix='test')

    def train(self, n_epochs):
        """
        Alternate one training pass and one evaluation pass per epoch.

        :param n_epochs: number of epochs to run
        :return: list with one dict per epoch, merging that epoch's train and
            test metrics
        """
        metrics_history = []
        for epoch_idx in range(n_epochs):
            # Lazy %-style arguments: the message is only built if emitted.
            logging.info("\n=============== %d-th EPOCH", epoch_idx + 1)

            metrics = {}
            metrics.update(self._train_epoch())
            metrics.update(self._eval_epoch())

            logging.info(metrics)
            metrics_history.append(metrics)

        return metrics_history