Hyperparameter_optimization_FABOLAS-HyperBand/fabolas2.py at main · marcopibbes/Hyperparameter_optimization_FABOLAS-HyperBand · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
from hpo import HPO
import torch
import gpytorch
import botorch
import time
import random
import math
from botorch.models import SingleTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood
from botorch.fit import fit_gpytorch_mll
from botorch.utils.transforms import normalize, unnormalize
from botorch.acquisition import qUpperConfidenceBound
from botorch.optim import optimize_acqf
from botorch.acquisition import AcquisitionFunction


class FabolasAcquisition(AcquisitionFunction):

    def __init__(
        self,
        model_loss: SingleTaskGP,
        model_cost: SingleTaskGP,
    ):
        super().__init__(model=model_loss)
        self.model_loss = model_loss
        self.model_cost = model_cost

    @botorch.utils.transforms.t_batch_mode_transform(expected_q=1)
    def forward(self, X: torch.Tensor) -> torch.Tensor:
        # X ha forma (b, q, d), es. [512, 1, 5]

        # --- Parte della Loss (numeratore) ---
        posterior_loss = self.model_loss.posterior(X)

        # posterior_loss.mean ha forma (b, q, 1), es. [512, 1, 1]
        mean_loss = posterior_loss.mean.squeeze(-1)  # -> shape [512, 1]

        # posterior_loss.variance ha forma (b, q, 1), es. [512, 1, 1]
        sigma_loss = posterior_loss.variance.clamp_min(1e-9).sqrt().squeeze(-1) # -> shape [512, 1]

        lcb = mean_loss - 2.0 * sigma_loss # -> shape [512, 1]
        acq_numerator = -lcb # -> shape [512, 1]

        # --- Parte del Costo (denominatore) ---
        posterior_cost = self.model_cost.posterior(X)

        # posterior_cost.mean ha forma (b, q, 1), es. [512, 1, 1]
        mean_cost = posterior_cost.mean.squeeze(-1) # -> shape [512, 1]

        cost_pred = torch.exp(mean_cost) # -> shape [512, 1]
        total_cost = cost_pred.clamp_min(1e-9) # -> shape [512, 1]

        # Divisione elemento per elemento
        final_acq = acq_numerator / total_cost # -> shape [512, 1]

        # ----- LA CORREZIONE CHIAVE È QUI -----
        # BoTorch si aspetta un output di forma (b,), es. [512]
        # Dato che q=1, dobbiamo rimuovere anche la dimensione 'q'.
        return final_acq.squeeze(-1) # -> Rimuove l'ultima dim, passando da [512, 1] a [512]

# ==============================================================================
# Custom Kernel for Subset Size
# ==============================================================================

class FabolasDatasetKernel(gpytorch.kernels.Kernel):
    """
    Kernel custom per la dimensione del dataset 's' in FABOLAS.

    Implementa un kernel degenere (a rango finito) basato su funzioni di base φ(s).
    Questo è matematicamente equivalente a un kernel lineare applicato allo spazio
    delle feature generate da φ(s).

    k(s1, s2) = φ(s1) @ φ(s2)^T
    """
    is_stationary = False  # Il kernel non dipende solo dalla distanza s1 - s2

    def __init__(self, basis_functions, **kwargs):
        """
        Args:
            basis_functions (callable): Una funzione (es. loss_basis_functions)
                                        che prende un tensore di 's' e restituisce
                                        le feature mappate.
        """
        super().__init__(**kwargs)
        self.basis_functions = basis_functions

    def forward(self, s1: torch.Tensor, s2: torch.Tensor, diag: bool = False, **params) -> torch.Tensor:
        """
        Calcola la matrice di covarianza.

        Args:
            s1 (torch.Tensor): Primo set di input per 's'. Shape: (..., n, 1)
            s2 (torch.Tensor): Secondo set di input per 's'. Shape: (..., m, 1)
            diag (bool): Se True, calcola solo la diagonale della matrice di covarianza.

        Returns:
            torch.Tensor: La matrice di covarianza di shape (..., n, m).
        """
        # GPyTorch passa i dati con una dimensione finale di 1.
        # Le nostre funzioni di base si aspettano un vettore 1D, quindi usiamo .squeeze(-1).
        phi_s1 = self.basis_functions(s1.squeeze(-1))
        phi_s2 = self.basis_functions(s2.squeeze(-1))

        # Calcola il prodotto scalare (dot product) per ottenere la matrice di covarianza.
        # Questa è l'operazione centrale del kernel lineare nello spazio delle feature.
        covar = phi_s1 @ phi_s2.transpose(-1, -2)

        if diag:
            # Se richiesto, restituisce solo la diagonale. Utile per calcolare la varianza.
            return covar.diag()

        return covar


def loss_basis_functions(s: torch.Tensor) -> torch.Tensor:
            """
            Base functions for modeling the loss.
            Assumes a parabolic learning curve.
            φ_f(s) = [1, (1-s)^2]


            Args:
                s (torch.Tensor): A 1D tensor with the relative sizes of the subsets.


            Returns:
                torch.Tensor: A tensor of shape [len(s), 2] with the mapped features.

            """
            # Ensures that 's' is a tensor
            if not isinstance(s, torch.Tensor):
                s = torch.tensor(s)

            # Builds the two features and stacks them column-wise

            ones = torch.ones_like(s)
            parabolic_term = (1 - s).pow(2)
            return torch.stack([ones, parabolic_term], dim=-1)

def cost_basis_functions(s: torch.Tensor) -> torch.Tensor:
            """
            Funzioni di base per modellare il log-costo.
            Assume una crescita lineare del log-costo (cioè una crescita polinomiale/esponenziale del costo).
            φ_c(s) = [1, s]

            Args:
                s (torch.Tensor): Un tensore 1D con le dimensioni relative dei subset.

            Returns:
                torch.Tensor: Un tensore di shape [len(s), 2] con le feature mappate.
            """
            if not isinstance(s, torch.Tensor):
                s = torch.tensor(s)

            ones = torch.ones_like(s)
            linear_term = s
            return torch.stack([ones, linear_term], dim=-1)


class FabolasHPO(HPO):
    def __init__(
        self,
        domains,
        max_resources,
        resource_type = 'epochs',
        global_loop_type = 'time',
        metric_to_monitor = "loss",
        monitor_mode = min,
        resource_per_config = 20,
        n_configs = 400,
        dataset_name = 'mnist',
        s = [],
        time_unit = 1,
        n_runs = 1
    ):
        assert monitor_mode in [min, max], "monitor_mode must be either 'min' or 'max'"
        super().__init__(
            metric_to_monitor=metric_to_monitor,
            monitor_mode=monitor_mode,
            resource_type=resource_type,
            domains=domains,
            dataset_name=dataset_name,
            time_unit = time_unit,
            n_runs = n_runs
        )

        self.global_loop_type = global_loop_type
        self.max_resources = max_resources #iteration/time
        self.resource_per_config = resource_per_config #epochs/time
        self.n_configs = n_configs
        self.s = s if s else [2**i for i in range(-9, 0)]


    def _run_optimization(self):

        ####### INITIALIZE DATA D_0 #######

        # configs are tuples (x, s) where x is the config and s is the relative dataset size
        initial_configs = self._get_random_configs(self.n_configs)
        initial_rsizes = [random.choice(self.s[:(len(self.s)//3)]) for _ in range(self.n_configs)]
        configs = list(zip(initial_configs, initial_rsizes))
        best_loss_so_far = float('inf')

        initial_data=[]
        # Train initial models
        for config, rsize in configs:
            model = self.build_model(config)
            size = int(rsize *  len(self.train))
            z0= time.time()
            self._training_pipeline(model, config, resources=self.resource_per_config, samples=size, save_log=True)
            z = time.time() - z0
            loss = self._evaluate_model(model)["loss"]
            if not math.isfinite(loss):
                print(f"[WARNING] - Training failed for config {config}. Loss is {loss}. Skipping this point.")
                # Non aggiungere questo punto ai dati e passa alla prossima configurazione
                continue
            initial_data.append([config, rsize, loss, z])


        ###### PREPARING DATA FOR GAUSSIAN PROCESS ######
        #print("[INFO] - Preparing data for Gaussian Process model...")
        numeric_params = []
        bounds_list = []
        for name, domain in self.domains.items():
            if len(domain) == 3 and domain[2] in ["continuous", "integer"]:
                numeric_params.append(name)
                bounds_list.append((domain[0], domain[1]))


        train_x_list = []
        train_y_loss_list = []
        train_y_cost_list = []

        for config, rsize, loss, z in initial_data:
            numeric_config_vals = [config[param] for param in numeric_params]
            x_vec = numeric_config_vals + [rsize]
            train_x_list.append(x_vec)
            train_y_loss_list.append(loss)
            train_y_cost_list.append(math.log(z + 1e-9))  # Adding a small constant to avoid log(0)

        train_x= torch.tensor(train_x_list, dtype=torch.double)
        train_y_loss= torch.tensor(train_y_loss_list, dtype=torch.double).view(-1, 1)
        train_y_cost = torch.tensor(train_y_cost_list, dtype=torch.double).view(-1, 1)

        bounds_tensor= torch.tensor(bounds_list + [[min(self.s), max(self.s)]], dtype=torch.double).t()
        train_x_normalized = normalize(train_x, bounds_tensor)

        ####### FITTING THE GAUSSIAN PROCESS MODEL #######
        #start_time = time.time()

        # Nota: usiamo l'operatore ternario per l'if/else in una sola riga

        #condition_checker = lambda it: (time.time() - start_time < self.max_resources) \
        #                               if self.global_loop_type == 'time' \
        #                               else (it < self.max_resources)
        condition_checker = lambda it: (self.run_time < self.max_resources) \
                                       if self.global_loop_type == 'time' \
                                       else (it < self.max_resources)

        iteration = 0

        while condition_checker(iteration):
            #print("[INFO] -  Building Kernels for Gaussian Process model... (loss and cost)")
            num_numeric_params = len(numeric_params)
            param_dims= list(range(num_numeric_params))
            subset_dim=[num_numeric_params]


            # Kernel for parameters (params_kernel), common in both loss and cost kernels
            params_kernel = gpytorch.kernels.ScaleKernel(
                gpytorch.kernels.MaternKernel(nu=2.5, ard_num_dims=len(param_dims), active_dims=param_dims),
            )


            # Kernel for loss (params_kernel * loss_kernel)
            subset1_kernel = gpytorch.kernels.ScaleKernel(
                FabolasDatasetKernel(loss_basis_functions, active_dims=subset_dim),
            )

            loss_kernel = gpytorch.kernels.ProductKernel(
                params_kernel,
                subset1_kernel
            )

            # Kernel for cost (params_kernel * cost_kernel)
            subset2_kernel = gpytorch.kernels.ScaleKernel(
                FabolasDatasetKernel(cost_basis_functions, active_dims=subset_dim),
            )
            cost_kernel = gpytorch.kernels.ProductKernel(
                params_kernel,
                subset2_kernel
            )

            #print("[INFO] -  Fitting Gaussian Process model for loss...")
            gp_loss = SingleTaskGP(
                train_X=train_x_normalized,
                train_Y=train_y_loss,
                covar_module=loss_kernel
            )

            mll_loss = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)

            # Train the Gaussian Process model for loss
            gp_loss.train()
            mll_loss.train()

            # Fit the model
            fit_gpytorch_mll(mll_loss)
            #print("[INFO] - Loss GP is ready.")

            #print("[INFO] -  Fitting Gaussian Process model for cost...")
            gp_cost = SingleTaskGP(
                train_X=train_x_normalized,
                train_Y=train_y_cost,
                covar_module=cost_kernel
                )

            # Crea la funzione di verosimiglianza
            mll_cost = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)

            # Train mode
            gp_cost.train()
            mll_cost.train()

            # Fit the model
            fit_gpytorch_mll(mll_cost)
            #print("[INFO] - Cost GP is ready.")


            gp_loss.eval()
            gp_cost.eval()

            '''def fabolas_acquisition_function(X_normalized: torch.Tensor) -> torch.Tensor:


                # Lower confidence bound (LCB) for the loss
                posterior_loss = gp_loss.posterior(X_normalized)
                mean_loss = posterior_loss.mean
                sigma_loss = posterior_loss.variance.clamp_min(1e-9).sqrt()
                lcb = mean_loss - 2.0 * sigma_loss # beta=2.0

                # Want to optimize the negative LCB
                acq_numerator = -lcb

                # Cost
                posterior_cost = gp_cost.posterior(X_normalized)
                cost_pred = torch.exp(posterior_cost.mean)
                total_cost = cost_pred.clamp_min(1e-9)

                return acq_numerator / total_cost'''


            ######## ACQUISITION FUNCTION FOR FABOLAS ########

            # Define a custom function that BoTorch can optimize.
            # This function will take a tensor X of candidates and return
            # the FABOLAS acquisition value for each candidate.


            # 1. Istanzia la tua classe di acquisizione
            fabolas_acq = FabolasAcquisition(model_loss=gp_loss, model_cost=gp_cost)

            # 2. Chiama optimize_acqf con l'istanza della classe
            normalized_bounds = torch.tensor([[0.0] * (num_numeric_params + 1), [1.0] * (num_numeric_params + 1)], dtype=torch.double)

            candidate_normalized, acq_value = optimize_acqf(
                acq_function=fabolas_acq,  # <-- Passa l'oggetto, non la funzione
                bounds=normalized_bounds,
                q=1,
                num_restarts=10,
                raw_samples=512,
                )

            '''
        # --- 3.C. FIND NEXT CANDIDATE ---
            print("[INFO] - Optimizing acquisition function to find the next candidate...")
            normalized_bounds = torch.tensor([[0.0] * (num_numeric_params + 1), [1.0] * (num_numeric_params + 1)], dtype=torch.double)
            candidate_normalized, acq_value = optimize_acqf(
                acq_function=fabolas_acquisition_function,
                bounds=normalized_bounds,
                q=1, num_restarts=10, raw_samples=512,
            )
            '''

            # --- 3.D. DE-NORMALIZE AND EVALUATE CANDIDATE ---
            candidate_denormalized = unnormalize(candidate_normalized, bounds=bounds_tensor)
            new_params_vec = candidate_denormalized[0, :-1]
            new_s = candidate_denormalized[0, -1].item()

            new_config = self._get_random_config()

            #print(f"[INFO] - Evaluating next candidate: s={new_s:.4f}, config={new_config}")
            for i, p_name in enumerate(numeric_params):
                value = new_params_vec[i].item()
                if len(self.domains[p_name]) == 2 :new_config[p_name] = self.domains[p_name][0][int(round(value))]
                elif self.domains[p_name][2] == "integer" : new_config[p_name] = int(round(value))
                else : new_config[p_name] = value


            model = self.build_model(new_config)

            size = int(new_s * len(self.train))
            if size < 1:
                print("[WARNING] - Selected subset size too small. Skipping evaluation.")
                continue

            z0 = time.time()
            self._training_pipeline(model, new_config, resources=self.resource_per_config, samples=size)
            z = time.time() - z0
            loss = self._evaluate_model(model)["loss"]

            if not math.isfinite(loss):
                print(f"[WARNING] - Training failed for config {config}. Loss is {loss}. Skipping this point.")
                # Non aggiungere questo punto ai dati e passa alla prossima configurazione
                continue
            #print(f"[INFO] - New candidate evaluated: s={new_s:.4f}, config={new_config}, loss={loss:.4f}, time={z:.2f}s")
            # --- 3.E. AUGMENT DATA ---
            new_x_vec = torch.cat([new_params_vec, torch.tensor([new_s], dtype=torch.double)])
            train_x = torch.cat([train_x, new_x_vec.unsqueeze(0)])
            train_y_loss = torch.cat([train_y_loss, torch.tensor([[loss]], dtype=torch.double)])
            train_y_cost = torch.cat([train_y_cost, torch.tensor([[math.log(z + 1e-9)]], dtype=torch.double)])
            train_x_normalized = normalize(train_x, bounds=bounds_tensor)

            # --- 3.F. CHOOSE INCUMBENT (BEST CONFIG SO FAR) ---
            #print("[INFO] - Identifying the best configuration so far (incumbent)...")
            unique_configs_norm, _ = torch.unique(train_x_normalized[:, :-1], dim=0, return_inverse=True)
            s_full_dataset_norm = normalize(torch.tensor([[1.0]], dtype=torch.double), bounds=bounds_tensor[:, -1:])
            points_to_predict = torch.cat([unique_configs_norm, s_full_dataset_norm.expand(unique_configs_norm.shape[0], 1)], dim=1)

            with torch.no_grad(), gpytorch.settings.fast_pred_var():
                predicted_loss_at_s1 = gp_loss.posterior(points_to_predict).mean

            best_pred_loss, best_idx = torch.min(predicted_loss_at_s1, dim=0)

            if best_pred_loss.item() < best_loss_so_far:
                best_loss_so_far = best_pred_loss.item()


                # Ricostruisci il dizionario della migliore configurazione
                incumbent_config = self._get_random_config()


                best_config_so_far = incumbent_config

                #print(f"[INFO] - New best incumbent found! Predicted loss at s=1: {best_loss_so_far:.4f}")

            iteration+= 1

    # --- 4. RETURN FINAL RESULT ---
        #print("\n[INFO] - Time budget exhausted. Returning best incumbent found.")
        return self.history