diff --git a/benchmark_utils/__init__.py b/benchmark_utils/__init__.py
index 5f8fa37..eaeac2f 100644
--- a/benchmark_utils/__init__.py
+++ b/benchmark_utils/__init__.py
@@ -3,11 +3,9 @@
 # name `benchmark_utils`, and code defined inside will be importable using
 # the usual import syntax
 
-from benchopt import safe_import_context
 from pathlib import Path
 
-with safe_import_context() as import_ctx:
-    import numpy as np
+import numpy as np
 
 
 def mean_overlaping_pred(predictions, stride):
@@ -24,7 +22,9 @@ def mean_overlaping_pred(predictions, stride):
     np.ndarray: Averaged predictions for each feature.
     """
     n_windows, H, n_features = predictions.shape
-    total_length = (n_windows-1) * stride + H - 1
+    # The last window starts at (n_windows-1)*stride and covers H samples, so
+    # the reconstructed signal spans (n_windows-1)*stride + H positions.
+    total_length = (n_windows - 1) * stride + H
 
     # Array to store accumulated predictions for each feature
     accumulated = np.zeros((total_length, n_features))
diff --git a/benchmark_utils/metrics.py b/benchmark_utils/metrics.py
index 4607670..9bcad02 100644
--- a/benchmark_utils/metrics.py
+++ b/benchmark_utils/metrics.py
@@ -1,7 +1,24 @@
-from benchopt import safe_import_context
+import numpy as np
 
-with safe_import_context() as import_ctx:
-    import numpy as np
+
+def _dilate(mask: np.ndarray, radius: int) -> np.ndarray:
+    """Binary dilation with a centered window of half-width ``radius``.
+
+    ``out[i]`` is True iff any entry of ``mask`` in ``[i-radius, i+radius]``
+    (clipped to the array) is truthy. Matches the half-open slice
+    ``mask[max(0, i-r):min(n, i+r+1)]`` used by the soft metrics.
+    """
+    mask = np.asarray(mask)
+    n = mask.shape[0]
+    if n == 0:
+        return np.zeros(0, dtype=bool)
+    if radius <= 0:
+        return mask.astype(bool, copy=False)
+    cum = np.concatenate(([0], np.cumsum(mask.astype(np.int64))))
+    idx = np.arange(n)
+    left = np.maximum(0, idx - radius)
+    right = np.minimum(n, idx + radius + 1)
+    return (cum[right] - cum[left]) > 0
 
 
 def soft_precision(y_true: np.ndarray,
@@ -35,47 +52,34 @@ def soft_precision(y_true: np.ndarray,
         fa : int
             Number of false anomalies
     """
-    # EM : Exact Match
-    em = 0
-    # DA : Detected Anomaly
-    da = 0
-    # FA : False Anomaly
-    fa = 0
-
-    # TFDIR = (EM + DA) / (EM + DA + FA)
+    y_true = np.asarray(y_true)
+    y_pred = np.asarray(y_pred)
 
-    # Counting exact matches
-    for i in range(len(y_true)):
-        if y_true[i] == 1 and (y_true[i] == y_pred[i]):
-            em += 1
+    true_mask = y_true == 1
+    pred_mask = y_pred == 1
 
-    # False anomaly and detected anomalies
-    for i in range(len(y_true)):
+    # TFDIR = (EM + DA) / (EM + DA + FA)
 
-        left = max(0, i-detection_range)
-        right = min(len(y_true), i+detection_range+1)
+    # EM : Exact Match
+    em = int(np.sum(true_mask & pred_mask))
 
-        if y_pred[i] == 1 and (
-                y_true[left:right] == 0).all():
-            fa += 1
+    true_dil = _dilate(true_mask, detection_range)
+    pred_dil = _dilate(pred_mask, detection_range)
 
-        if y_true[i] == 1 and (
-                y_pred[left:right] == 1).any():
-            da += 1
+    # DA : Detected Anomaly
+    fa = int(np.sum(pred_mask & ~true_dil))
 
+    # FA : False Anomaly
     # Removing exact matches from detected anomalies because they are
     # counted twice
-    da -= em
+    da = int(np.sum(true_mask & pred_dil)) - em
 
-    if return_counts:
-        if em + da + fa == 0:
-            return 0, em, da, fa
-
-        return (em + da) / (em + da + fa), em, da, fa
+    total = em + da + fa
+    score = (em + da) / total if total else 0
 
-    if em + da + fa == 0:
-        return 0
-    return (em + da) / (em + da + fa)
+    if return_counts:
+        return score, em, da, fa
+    return score
 
 
 def soft_recall(y_true: np.ndarray,
@@ -104,46 +108,25 @@ def soft_recall(y_true: np.ndarray,
         ma : int
             Number of missed anomalies
     """
-    # EM : Exact Match
-    em = 0
-    # DA : Detected Anomaly
-    da = 0
-    # MA : Missed Anomaly
-    ma = 0
-    # DAIR = (EM + DA) / (EM + DA + MA)
+    y_true = np.asarray(y_true)
+    y_pred = np.asarray(y_pred)
 
-    # Counting exact matches
-    for i in range(len(y_true)):
-        if y_true[i] == 1 and (y_true[i] == y_pred[i]):
-            em += 1
+    true_mask = y_true == 1
+    pred_mask = y_pred == 1
 
-    # Missing values and detected anomalies
-    for i in range(len(y_true)):
+    em = int(np.sum(true_mask & pred_mask))
 
-        left = max(0, i-detection_range)
-        right = min(len(y_true), i+detection_range+1)
+    pred_dil = _dilate(pred_mask, detection_range)
 
-        if y_true[i] == 1 and (
-                y_pred[left:right] == 0).all():
-            ma += 1
+    ma = int(np.sum(true_mask & ~pred_dil))
+    da = int(np.sum(true_mask & pred_dil)) - em
 
-        if y_true[i] == 1 and (
-                y_pred[left:right] == 1).any():
-            da += 1
-
-    # Removing exact matches from detected anomalies because they are
-    # counted twice
-    da -= em
+    total = em + da + ma
+    score = (em + da) / total if total else 0
 
     if return_counts:
-        if em + da + ma == 0:
-            return 0, em, da, ma
-
-        return (em + da) / (em + da + ma), em, da, ma
-
-    if em + da + ma == 0:
-        return 0
-    return (em + da) / (em + da + ma)
+        return score, em, da, ma
+    return score
 
 
 def ctt(y_true: np.ndarray, y_pred: np.ndarray, return_signed: bool = False):
@@ -240,22 +223,34 @@ def ttc(y_true: np.ndarray, y_pred: np.ndarray, return_signed: bool = False):
     return tot_dist / np.sum(y_true)
 
 
-def soft_f1(precision, recall):
+def soft_f1(precision, recall, detection_range=None):
     """
     Calculate the F1 score from precision and recall.
 
     Parameters
     ----------
-        precision : float
+        precision : float or np.ndarray
             Precision score
-        recall : float
+        recall : float or np.ndarray
             Recall score
+        detection_range : int, optional
+            If provided, ``precision`` and ``recall`` are interpreted as the
+            true and predicted label arrays used by ``soft_precision`` and
+            ``soft_recall``.
 
     Returns
     -------
         f1 : float
             F1 score
     """
+    if detection_range is not None:
+        precision_score = soft_precision(
+            precision, recall, detection_range=detection_range
+        )
+        recall_score = soft_recall(
+            precision, recall, detection_range=detection_range)
+        precision, recall = precision_score, recall_score
+
     if precision + recall == 0:
         return 0
     return 2 * (precision * recall) / (precision + recall)
@@ -280,21 +275,15 @@ def extract_anomaly_ranges(labels: list[int]):
                 Each tuple represents a range (start_index, end_index)
                 where anomalies are present.
     """
-    ranges = []
-    start = None
-
-    for i, label in enumerate(labels):
-        if label == 1 and start is None:
-            start = i  # Start of a new anomaly range
-        elif label == 0 and start is not None:
-            ranges.append((start, i - 1))  # End of the current anomaly range
-            start = None
-
-    # Handle the case where the series ends with an anomaly
-    if start is not None:
-        ranges.append((start, len(labels) - 1))
-
-    return ranges
+    arr = np.asarray(labels)
+    if arr.size == 0:
+        return []
+    binary = (arr == 1).astype(np.int8)
+    padded = np.concatenate(([0], binary, [0]))
+    diff = np.diff(padded)
+    starts = np.where(diff == 1)[0]
+    ends = np.where(diff == -1)[0] - 1
+    return list(zip(starts.tolist(), ends.tolist()))
 
 
 def existence_reward(real_range, predicted_ranges):
diff --git a/benchmark_utils/models.py b/benchmark_utils/models.py
index 7bcf100..a2b4f2a 100644
--- a/benchmark_utils/models.py
+++ b/benchmark_utils/models.py
@@ -1,4 +1,10 @@
 from torch import nn
+from sklearn.preprocessing import MinMaxScaler
+import torch
+import torch.optim as optim
+from torch.utils.data import DataLoader, Dataset
+import numpy as np
+from tqdm import tqdm
 
 
 class ARModel(nn.Module):
@@ -122,3 +128,253 @@ def forward(self, x):
         x, (_, _) = self.decoder(x)
 
         return x
+
+
+class SlidingWindowDataset(Dataset):
+    def __init__(self, data, window_size):
+        self.data = data
+        self.window_size = window_size
+
+    def __len__(self):
+        return len(self.data) - self.window_size + 1
+
+    def __getitem__(self, idx):
+        window = self.data[idx:idx + self.window_size]
+        return window  # Input and target are the same for autoencoder
+
+
+class Autoencoder(nn.Module):
+    def __init__(
+            self,
+            input_size=32,
+            hidden_size=32,
+            latent_size=16,
+            sliding_window=10
+    ):
+        super(Autoencoder, self).__init__()
+
+        self.sliding_window = sliding_window
+        self.decision_scores_ = None
+
+        # Encoder
+        self.encoder = nn.Sequential(
+            nn.Linear(input_size, hidden_size),
+            nn.ReLU(),
+            nn.BatchNorm1d(hidden_size),
+            nn.Linear(hidden_size, latent_size),
+            nn.ReLU(),
+            nn.BatchNorm1d(latent_size),
+        )
+
+        # Decoder
+        self.decoder = nn.Sequential(
+            nn.Linear(latent_size, hidden_size),
+            nn.ReLU(),
+            nn.Linear(hidden_size, input_size),
+            nn.ReLU(),
+        )
+
+    def forward(self, x):
+        # Flatten input if needed
+        x = x.view(x.size(0), -1)
+
+        # Encode
+        encoded = self.encoder(x)
+
+        # Decode
+        decoded = self.decoder(encoded)
+
+        return decoded
+
+    def encode(self, x):
+        x = x.view(x.size(0), -1)
+        return self.encoder(x)
+
+    def _create_sliding_windows(self, X):
+        """Create sliding windows from input data"""
+        if isinstance(X, np.ndarray):
+            X = torch.from_numpy(X).float()
+
+        # If X is 1D, reshape to 2D
+        if X.dim() == 1:
+            X = X.unsqueeze(1)
+
+        windows = []
+        for i in range(len(X) - self.sliding_window + 1):
+            window = X[i:i + self.sliding_window].flatten()
+            windows.append(window)
+
+        return torch.stack(windows)
+
+    def fit(
+        self,
+        X,
+        num_epochs=50,
+        learning_rate=1e-3,
+        device=None,
+        batch_size=32
+    ):
+        """
+        Train the autoencoder on the provided data.
+
+        Args:
+            X: Input data tensor or numpy array shape (n_samples, n_features)
+            num_epochs: Number of training epochs
+            learning_rate: Learning rate for optimizer
+            device: Device to train on ('cuda' or 'cpu')
+            batch_size: Batch size for training
+
+        Returns:
+            List of training losses per epoch
+        """
+        if device is None:
+            device = torch.device(
+                "cuda" if torch.cuda.is_available() else "cpu"
+            )
+
+        # Convert to tensor if numpy array
+        if isinstance(X, np.ndarray):
+            X = torch.from_numpy(X).float()
+
+        # Ensure X is 2D
+        if X.dim() == 1:
+            X = X.unsqueeze(1)
+        if X.dim() == 3:
+            # (n_samples, n_timesteps, n_features)
+            X = X.view(-1, 1)
+
+        # Create sliding windows
+        windowed_data = self._create_sliding_windows(X)
+
+        # Create dataset and dataloader
+        # window_size=1 since we already created windows
+        dataset = SlidingWindowDataset(windowed_data, window_size=1)
+        dataloader = DataLoader(
+            dataset, batch_size=batch_size, shuffle=True, drop_last=True)
+
+        self.to(device)
+        criterion = nn.MSELoss()
+        optimizer = optim.Adam(self.parameters(), lr=learning_rate)
+
+        self.train()
+        losses = []
+
+        # Progress bar for epochs
+        epoch_pbar = tqdm(range(num_epochs), desc="Training", unit="epoch")
+
+        for epoch in epoch_pbar:
+            epoch_loss = 0.0
+
+            # Progress bar for batches
+            batch_pbar = tqdm(
+                dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
+
+            for batch_idx, (data) in enumerate(batch_pbar):
+                data = data.to(device)
+
+                # Forward pass
+                output = self(data)
+                loss = criterion(output, data)
+
+                # Backward pass
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+
+                epoch_loss += loss.item()
+
+                # Update batch progress bar
+                batch_pbar.set_postfix({"Batch Loss": f"{loss.item():.4f}"})
+
+            avg_loss = epoch_loss / len(dataloader)
+            losses.append(avg_loss)
+
+            # Update epoch progress bar
+            epoch_pbar.set_postfix({"Avg Loss": f"{avg_loss:.4f}"})
+
+        return losses
+
+    def predict(self, X_test, X_dirty=None, device=None):
+        """
+        Predict anomaly scores for time series data.
+
+        Args:
+            X_test: Test data for reconstruction
+            X_dirty: Original dirty data (if None, uses X_test)
+            device: Device to run inference on
+
+        Returns:
+            Reconstructed data and sets decision_scores_ attribute
+        """
+        if device is None:
+            device = torch.device(
+                "cuda" if torch.cuda.is_available() else "cpu")
+
+        self.eval()
+        self.to(device)
+
+        # Create sliding windows for test data
+        if isinstance(X_test, np.ndarray):
+            X_test = torch.from_numpy(X_test).float()
+
+        windowed_test = self._create_sliding_windows(X_test)
+        windowed_test = windowed_test.to(device)
+
+        with torch.no_grad():
+            test_predict = self(windowed_test).cpu().numpy()
+
+        # Calculate MAE loss
+        test_mae_loss = np.mean(
+            np.abs(test_predict - windowed_test.cpu().numpy()), axis=1)
+
+        # Normalize MAE loss
+        nor_test_mae_loss = MinMaxScaler().fit_transform(
+            test_mae_loss.reshape(-1, 1)).ravel()
+
+        # Use X_dirty if provided, otherwise use original X_test
+        if X_dirty is None:
+            if isinstance(X_test, torch.Tensor):
+                X_dirty = X_test.cpu().numpy()
+            else:
+                X_dirty = X_test
+
+        # Initialize score array
+        score = np.zeros(len(X_dirty))
+
+        # Fill the score array with sliding window approach
+        score[self.sliding_window // 2:self.sliding_window //
+              2 + len(test_mae_loss)] = nor_test_mae_loss
+        score[:self.sliding_window // 2] = nor_test_mae_loss[0]
+        score[self.sliding_window // 2 +
+              len(test_mae_loss):] = nor_test_mae_loss[-1]
+
+        # Store decision scores
+        self.decision_scores_ = score
+
+        return test_predict
+
+    def encode_data(self, x, device=None):
+        """
+        Encode input data to latent representation.
+
+        Args:
+            x: Input tensor or numpy array
+            device: Device to run inference on
+
+        Returns:
+            Encoded data as numpy array
+        """
+        if device is None:
+            device = torch.device(
+                "cuda" if torch.cuda.is_available() else "cpu")
+
+        self.eval()
+        self.to(device)
+
+        # Convert to tensor if numpy array
+        if isinstance(x, np.ndarray):
+            x = torch.from_numpy(x).float()
+        x = x.to(device)
+        with torch.no_grad():
+            encoded = self.encode(x)
+        return encoded.cpu().numpy()
diff --git a/benchmark_utils/predictions.py b/benchmark_utils/predictions.py
new file mode 100644
index 0000000..7517dbc
--- /dev/null
+++ b/benchmark_utils/predictions.py
@@ -0,0 +1,34 @@
+import numpy as np
+
+
+def cutoff_scores(anomaly_scores, cutoff=None):
+    """Turn anomaly scores into binary predictions using a contamination rate.
+
+    Larger scores are assumed to be more anomalous. NaN entries are preserved
+    as ``-1`` ignore labels so they are masked by the objective.
+    """
+    if cutoff is None:
+        return None
+
+    validate_cutoff(cutoff)
+
+    scores = np.asarray(anomaly_scores)
+    predictions = np.full(scores.shape, -1, dtype=int)
+    valid = ~np.isnan(scores)
+    if not np.any(valid):
+        return predictions
+
+    threshold = np.quantile(scores[valid], 1 - cutoff)
+
+    predictions[valid] = (scores[valid] >= threshold).astype(int)
+    return predictions
+
+
+def validate_cutoff(cutoff):
+    if cutoff is None:
+        raise ValueError("cutoff must be provided.")
+    if not 0 < cutoff < 1:
+        raise ValueError(
+            "cutoff must be in (0, 1), "
+            f"got {cutoff!r}."
+        )
diff --git a/benchmark_utils/windowing.py b/benchmark_utils/windowing.py
new file mode 100644
index 0000000..362135b
--- /dev/null
+++ b/benchmark_utils/windowing.py
@@ -0,0 +1,156 @@
+import numpy as np
+import torch
+from torch.utils.data import TensorDataset
+
+
+def find_period_length(data, default=125):
+    """Estimate a reasonable period length from autocorrelation.
+
+    This local helper replaces the small ``TSB_AD`` utility previously used by
+    several solvers, avoiding a heavy optional dependency for solvers that only
+    need automatic window sizing.
+    """
+    data = np.asarray(data)
+    if data.ndim > 1:
+        return 0
+
+    data = data[: min(20_000, len(data))]
+    if len(data) < 6:
+        return 0
+
+    centered = data - data.mean()
+    norm = np.dot(centered, centered)
+    if norm == 0:
+        return default
+
+    max_lag = min(400, len(centered) - 1)
+    autocorr = np.correlate(centered, centered, mode="full")
+    autocorr = autocorr[len(centered) - 1: len(centered) + max_lag] / norm
+
+    base = 3
+    values = autocorr[base:]
+    if len(values) < 3:
+        return default
+
+    local_max = (
+        np.where((values[1:-1] > values[:-2]) &
+                 (values[1:-1] > values[2:]))[0] + 1
+    )
+
+    if len(local_max) == 0:
+        return default
+
+    lag = local_max[np.argmax(values[local_max])] + base
+    if lag < 3 or lag > 300:
+        return default
+    return int(lag)
+
+
+def make_windows(X, window_size=32, stride=1, padding=False):
+    """Create a windowed view of the data.
+
+    Parameters
+    ----------
+    X : np.ndarray
+        Input data of shape (n_samples, n_features, n_times).
+    window_size : int
+        Size of the sliding window.
+    stride : int
+        Stride of the sliding window.
+
+    Returns
+    -------
+    windows : np.ndarray
+        A windowed view of the data in shape:
+        (n_eff_samples, window_size, n_features)
+    """
+
+    if padding:
+        n_samples, n_features, n_times = X.shape
+        n_pad = (window_size - stride + n_times % stride) % stride
+        pad_width = ((0, 0), (0, 0), (0, n_pad))
+        X = np.pad(X, pad_width=pad_width, mode='constant')
+
+    return np.lib.stride_tricks.sliding_window_view(
+        X, window_shape=window_size, axis=-1
+    )[..., ::stride, :].transpose(0, 2, 1, 3).reshape(
+        -1, X.shape[1], window_size
+    ).transpose(0, 2, 1)
+
+
+def make_windowed_dataset(X, y=None, window_size=32, stride=1):
+    """
+    Create a DataLoader with windowed views of the data.
+
+    Parameters
+    ----------
+    X : np.ndarray
+        Input data of shape (n_samples, n_features, n_times).
+    y : np.ndarray, optional
+        Target data of shape (n_samples, n_times).
+    window_size : int
+        Size of the sliding window.
+    stride : int
+        Stride of the sliding window.
+
+    Returns
+    -------
+    Dataset
+        A PyTorch Dataset with windowed data in shape:
+        (n_eff_samples, window_size, n_features)
+    """
+
+    if window_size is not None:
+        X = make_windows(X, window_size, stride)
+
+    X_tensor = torch.tensor(X, dtype=torch.float32)
+
+    if y is not None:
+        if window_size is not None:
+            y = np.lib.stride_tricks.sliding_window_view(
+                y, window_shape=window_size, axis=-1
+            )[..., ::stride, :].reshape(-1, window_size)
+
+        y_tensor = torch.tensor(y, dtype=torch.float32)
+        dataset = TensorDataset(X_tensor, y_tensor)
+    else:
+        dataset = TensorDataset(X_tensor)
+
+    return dataset
+
+
+def reconstruct_from_windows(windows, stride, batch, n_features):
+    """Reconstruct the original signal from overlapping windows
+
+    Parameters
+    ----------
+    windows : np.ndarray
+        The overlapping windows of shape (batch*n_windows, window_size, n_feat)
+    stride : int
+        The stride used to create the windows
+    batch : int
+        The batch size used when creating the windows
+    n_features : int
+        The number of features in the original signal
+    """
+    # windows: (batch*n_windows, window_size, n_features)
+    w = windows.shape[1]
+    windows = windows.reshape(batch, -1, w, n_features)
+    b, nw, ws, nf = windows.shape
+    nt = (nw - 1) * stride + ws
+
+    # allocate accumulator + counts for correct overlap averaging
+    acc = np.zeros((b, nf, nt))
+    cnt = np.zeros((nt,), dtype=int)
+
+    # build index map for overlap positions
+    idx = np.arange(ws)[:, None] + stride * np.arange(nw)
+
+    # add windows efficiently
+    np.add.at(acc, (slice(None), slice(None), idx.ravel()),
+              windows.transpose(0, 3, 1, 2).reshape(b, nf, -1))
+
+    # count contributions
+    np.add.at(cnt, idx.ravel(), 1)
+
+    return acc / cnt
diff --git a/datasets/daphnet.py b/datasets/daphnet.py
new file mode 100644
index 0000000..e2d42bc
--- /dev/null
+++ b/datasets/daphnet.py
@@ -0,0 +1,158 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+PATH = config.get_data_path("DAPHNET")
+
+
+def load_data(db_path, record_ids=None, verbose=False, number=-1):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+        verbose: If True, print loading progress information.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is not None and number > 0:
+        print(
+            "Warning: 'number' parameter is "
+            "ignored when 'record_ids' is provided."
+        )
+
+    if record_ids is None:
+        # Get all available record files with .test.csv@X.out pattern
+        record_files = list(db_path.glob("*.test.csv@*.out"))
+        record_ids = [f.name.split(".")[0] for f in record_files]
+        if number > 0:
+            record_ids = record_ids[:number]
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        # Find all files matching the pattern for the given record_id
+        record_files = list(db_path.glob(f"{record_id}.test.csv@*.out"))
+
+        if not record_files:
+            if verbose:
+                print(f"No record files found for ID: {record_id}")
+            continue
+
+        for record_file in record_files:
+            if verbose:
+                print(f"Loading record file: {record_file}")
+            # Load the record data
+            record_data = pd.read_csv(
+                record_file, header=None).dropna().to_numpy()
+            # Assuming first column is the data, second column is labels
+            if record_data.shape[1] >= 2:
+                data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(
+                        f"Insufficient columns "
+                        f"for record file {record_file.name}"
+                    )
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_length = max(len(data) for data in data_list)
+
+    # Pad all sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Pad with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "DAPHNET"
+
+    parameters = {
+        # "recordings_id": [["S01R02E0"]],
+        "recordings_id": [None],  # [["S01R02E0"]],
+        "number": [-1],
+        "debug": [False],
+    }
+
+    test_parameters = {
+        "debug": [True],
+    }
+
+    def get_data(self):
+        """Load the DAPHNET dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        if self.recordings_id in (["all"], "all"):
+            self.recordings_id = None
+        X, y_true = load_data(PATH, self.recordings_id, number=self.number)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X_train.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        plt.figure(figsize=(6, 3))
+        plt.plot(X_train[0, 0, :500], linewidth=1.2)
+        plt.plot(range(297, 305),
+                 X_train[0, 0, 297:305], color="orange", linewidth=3)
+        plt.title("Daphnet dataset")
+        plt.tight_layout()
+        plt.savefig("daphnet_example.png")
+        plt.close()
+
+        print("PLOT SAVED")
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/dodgers.py b/datasets/dodgers.py
new file mode 100644
index 0000000..f3c6879
--- /dev/null
+++ b/datasets/dodgers.py
@@ -0,0 +1,128 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("DODGERS")
+
+
+def load_data(db_path, record_ids=None, verbose=False):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+        verbose: If True, print loading progress information.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is None:
+        # Get all available record files with freeway-traffic pattern
+        record_files = list(db_path.glob("*-freeway-traffic.test.out"))
+        record_ids = [f.name for f in record_files]
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        # Handle direct filename or construct pattern
+        if record_id.endswith('-freeway-traffic.test.out'):
+            record_file = db_path / record_id
+        else:
+            record_file = db_path / f"{record_id}-freeway-traffic.test.out"
+        if record_file.exists():
+            # Load the record data
+            record_data = pd.read_csv(
+                record_file, header=None).dropna().to_numpy()
+            # Assuming first column is the data, second column is labels
+            if record_data.shape[1] >= 2:
+                data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(f"Insufficient columns for record {record_id}")
+        else:
+            if verbose:
+                print(f"Record file not found: {record_file}")
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_length = max(len(data) for data in data_list)
+
+    # Pad all sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Pad with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "DODGERS"
+
+    parameters = {
+        # "recordings_id": [["101"]],
+        "recordings_id": [None],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the DODGERS dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        if self.recordings_id in (["all"], "all"):
+            self.recordings_id = None
+        X, y_true = load_data(PATH, self.recordings_id)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X_train.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/ecg.py b/datasets/ecg.py
new file mode 100644
index 0000000..81f1986
--- /dev/null
+++ b/datasets/ecg.py
@@ -0,0 +1,133 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("ECG")
+
+
+def load_data(db_path, record_ids=None, verbose=False, number=-1):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+        verbose: If True, print loading progress information.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is not None and number > 0:
+        print("Warning: 'number' parameter is "
+              "ignored when 'record_ids' is provided.")
+
+    if record_ids is None:
+        # Get all available record files
+        record_files = list(db_path.glob("*.out"))
+        record_ids = [f.stem for f in record_files]
+
+        if "MBA_ECG14046_data" in record_ids:
+            record_ids.remove("MBA_ECG14046_data")
+            if verbose:
+                print("Removed MBA_ECG14046_data from records due to issues")
+
+        if number > 0:
+            record_ids = record_ids[:number]
+            print(record_ids)
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        record_file = db_path / f"{record_id}.out"
+        if record_file.exists():
+            # Load the record data
+            record_data = pd.read_csv(
+                record_file, header=None).dropna().to_numpy()
+            # Assuming first column is the data, second column is labels
+            if record_data.shape[1] >= 2:
+                data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(f"Insufficient columns for record {record_id}")
+        else:
+            if verbose:
+                print(f"Record file not found: {record_file}")
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_length = max(len(data) for data in data_list)
+
+    # Pad all sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Pad with last value for data and 0 for labels
+            padded_data.append(np.pad(
+                data,
+                (0, max_length - len(data)),
+                mode='constant',
+                constant_values=data[-1])
+            )
+            padded_labels.append(np.pad(
+                labels,
+                (0, max_length - len(labels)),
+                mode='constant',
+                constant_values=0),
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "ECG"
+
+    parameters = {
+        "recordings_id": [["1", "2"]],
+        "debug": [False],
+        "number": [-1],
+    }
+
+    def get_data(self):
+        """Load the MITDB dataset."""
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        if self.recordings_id in (["all"], "all"):
+            self.recordings_id = None
+        X, y_true = load_data(PATH, self.recordings_id, number=self.number)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            size = 5000
+            X_train = X_train[:, :size]
+            X_test = X_test[:, :size]
+            y_test = y_test[:, :size]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X_train.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/genesis.py b/datasets/genesis.py
new file mode 100644
index 0000000..8425d89
--- /dev/null
+++ b/datasets/genesis.py
@@ -0,0 +1,126 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("GENESIS")
+
+
+def load_data(db_path, record_ids=None, verbose=False):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+        verbose: If True, print loading progress information.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is None:
+        # Get all available record files with genesis pattern
+        record_files = list(db_path.glob("genesis-*.out"))
+        record_ids = [f.name for f in record_files]
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        # Handle direct filename or construct genesis pattern
+        if record_id.startswith('genesis-') and record_id.endswith('.out'):
+            record_file = db_path / record_id
+        else:
+            record_file = db_path / \
+                f"genesis-anomalies.test.csv@{record_id}.out"
+        if record_file.exists():
+            # Load the record data
+            record_data = pd.read_csv(
+                record_file, header=None).dropna().to_numpy()
+            # Assuming first column is the data, second column is labels
+            if record_data.shape[1] >= 2:
+                data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(f"Insufficient columns for record {record_id}")
+        else:
+            if verbose:
+                print(f"Record file not found: {record_file}")
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_length = max(len(data) for data in data_list)
+
+    # Pad all sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Pad with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "GENESIS"
+
+    parameters = {
+        "recordings_id": [["1", "2"]],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the GENESIS dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        X, y_true = load_data(PATH, self.recordings_id)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/ghl.py b/datasets/ghl.py
new file mode 100644
index 0000000..dd102dd
--- /dev/null
+++ b/datasets/ghl.py
@@ -0,0 +1,130 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("GHL")
+
+
+def load_data(db_path, record_ids=None, verbose=False):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+        verbose: If True, print loading progress information.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is None:
+        # Get all available record files with GHL pattern
+        record_files = list(db_path.glob(
+            "*_Lev_fault_Temp_corr_*.test.csv@*.out"))
+        record_ids = [f.name for f in record_files]
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        # Handle direct filename or construct pattern
+        if '_Lev_fault_Temp_corr_' in record_id and record_id.endswith('.out'):
+            record_file = db_path / record_id
+        else:
+            # Try to find matching files with pattern
+            matching_files = list(db_path.glob(
+                f"*{record_id}*_Lev_fault_Temp_corr_*.out"))
+            record_file = matching_files[0] if matching_files else db_path / \
+                f"{record_id}.out"
+        if record_file.exists():
+            # Load the record data
+            record_data = pd.read_csv(
+                record_file, header=None).dropna().to_numpy()
+            # Assuming first column is the data, second column is labels
+            if record_data.shape[1] >= 2:
+                data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(f"Insufficient columns for record {record_id}")
+        else:
+            if verbose:
+                print(f"Record file not found: {record_file}")
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_length = max(len(data) for data in data_list)
+
+    # Pad all sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Pad with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "GHL"
+
+    parameters = {
+        "recordings_id": [["1", "2"]],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the GHL dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        X, y_true = load_data(PATH, self.recordings_id)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X_train.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/iops.py b/datasets/iops.py
new file mode 100644
index 0000000..12df7ef
--- /dev/null
+++ b/datasets/iops.py
@@ -0,0 +1,139 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("IOPS")
+PATH = "/data/parietal/store2/data/tsb-uad/TSB-UAD-Public/IOPS/"
+
+
+def load_data(db_path, verbose=False):
+    """
+    Load train and test data from the database path.
+
+    Args:
+        db_path: Path to the database directory
+        verbose: If True, print loading progress information.
+
+    Returns:
+        tuple: (X_train, X_test, y_test) where:
+            - X_train: nd.array of shape (num_records, num_samples)
+            - X_test: nd.array of shape (num_records, num_samples)
+            - y_test: nd.array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    # Get all train and test files
+    train_files = list(db_path.glob("KPI-*.train.out"))
+    test_files = list(db_path.glob("KPI-*.test.out"))
+
+    if not train_files or not test_files:
+        raise ValueError("No train or test files found")
+
+    # Load train data
+    train_data_list = []
+    for train_file in train_files:
+        record_data = pd.read_csv(train_file, header=None).dropna().to_numpy()
+        if record_data.shape[1] >= 1:
+            train_data_list.append(record_data[:, 0].astype(float))
+        else:
+            if verbose:
+                print(f"Insufficient columns for train file {train_file}")
+
+    # Load test data and labels
+    test_data_list = []
+    test_labels_list = []
+    for test_file in test_files:
+        record_data = pd.read_csv(test_file, header=None).dropna().to_numpy()
+        if record_data.shape[1] >= 2:
+            test_data_list.append(record_data[:, 0].astype(float))
+            test_labels_list.append(record_data[:, 1].astype(int))
+        else:
+            if verbose:
+                print(f"Insufficient columns for test file {test_file}")
+
+    if not train_data_list or not test_data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_train_length = max(len(data) for data in train_data_list)
+    max_test_length = max(len(data) for data in test_data_list)
+
+    # Pad train sequences
+    padded_train_data = []
+    for data in train_data_list:
+        if len(data) < max_train_length:
+            padded_train_data.append(
+                np.pad(
+                    data,
+                    (0, max_train_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+        else:
+            padded_train_data.append(data[:max_train_length])
+
+    # Pad test sequences and labels
+    padded_test_data = []
+    padded_test_labels = []
+    for data, labels in zip(test_data_list, test_labels_list):
+        if len(data) < max_test_length:
+            padded_test_data.append(
+                np.pad(
+                    data,
+                    (0, max_test_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_test_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_test_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_test_data.append(data[:max_test_length])
+            padded_test_labels.append(labels[:max_test_length])
+
+    return (
+        np.array(padded_train_data),
+        np.array(padded_test_data),
+        np.array(padded_test_labels)
+    )
+
+
+class Dataset(BaseDataset):
+    name = "IOPS"
+
+    parameters = {
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the IOPS dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        X_train, X_test, y_test = load_data(PATH)
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X_train.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/kdd21.py b/datasets/kdd21.py
new file mode 100644
index 0000000..3d0da0b
--- /dev/null
+++ b/datasets/kdd21.py
@@ -0,0 +1,124 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("KDD21")
+
+
+def load_data(db_path, record_ids=None, verbose=False):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+        verbose: If True, print loading progress information.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+    if record_ids is None:
+        # Get all available record files
+        record_files = list(db_path.glob("*.out"))
+        record_ids = [f.name.split('_')[0] for f in record_files]
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        # Convert record_id to 3-digit format
+        formatted_id = str(record_id).zfill(3)
+        # Find file that starts with the formatted record_id
+        matching_files = list(db_path.glob(f"{formatted_id}_*.out"))
+        if matching_files:
+            record_file = matching_files[0]  # Take the first matching file
+            # Load the record data
+            record_data = pd.read_csv(
+                record_file, header=None).dropna().to_numpy()
+            # Assuming first column is the data, second column is labels
+            if record_data.shape[1] >= 2:
+                data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(f"Insufficient columns for record {record_id}")
+        else:
+            if verbose:
+                print(f"Record file not found for ID: {record_id}")
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_length = max(len(data) for data in data_list)
+
+    # Pad all sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Pad with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "KDD21"
+
+    parameters = {
+        "recordings_id": [["1", "2"]],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the KDD21 dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        X, y_true = load_data(PATH, self.recordings_id)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X_train.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/mgab.py b/datasets/mgab.py
new file mode 100644
index 0000000..ac00972
--- /dev/null
+++ b/datasets/mgab.py
@@ -0,0 +1,121 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("MGAB")
+
+
+def load_data(db_path, record_ids=None, verbose=False):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+        verbose: If True, print loading progress information.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is None:
+        # Get all available record files
+        record_files = list(db_path.glob("*.test.out"))
+        record_ids = [f.name.split(".")[0] for f in record_files]
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        record_file = db_path / f"{record_id}.test.out"
+        if record_file.exists():
+            # Load the record data
+            record_data = pd.read_csv(
+                record_file, header=None).dropna().to_numpy()
+            # Assuming first column is the data, second column is labels
+            if record_data.shape[1] >= 2:
+                data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(f"Insufficient columns for record {record_id}")
+        else:
+            if verbose:
+                print(f"Record file not found: {record_file}")
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_length = max(len(data) for data in data_list)
+
+    # Pad all sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Pad with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "MGAB"
+
+    parameters = {
+        "recordings_id": [["1", "2"]],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the MITDB dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        X, y_true = load_data(PATH, self.recordings_id)
+        n_recordings, _ = X.shape
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/mitdb.py b/datasets/mitdb.py
new file mode 100644
index 0000000..7f811d0
--- /dev/null
+++ b/datasets/mitdb.py
@@ -0,0 +1,135 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("MITDB")
+
+
+def load_mitdb_data(db_path, record_ids=None, verbose=False):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is None:
+        # Get all available record files with format like 100.test.csv@1.out
+        record_files = list(db_path.glob("*.out"))
+        record_ids = [str(f.name).split(".")[0] for f in record_files]
+
+    if verbose:
+        print(f"Loading records: {record_ids}")
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        # Find file starting with record_id and ending with .out
+        record_files = list(db_path.glob(f"{record_id}*.out"))
+        if record_files:
+            if len(record_files) > 1:
+                if verbose:
+                    print(
+                        f"Multiple files found for record ID {record_id}, "
+                        f"using the first one: {record_files[0]}"
+                    )
+            record_file = record_files[0]
+            # Load the record data
+            record_data = pd.read_csv(
+                db_path / record_file, header=None).dropna().to_numpy()
+            # Assuming first column is the data, second column is labels
+            if verbose:
+                print(
+                    f"Loaded record {record_id} "
+                    f"with shape {record_data.shape}")
+            if record_data.shape[1] >= 2:
+                if verbose:
+                    print(f"Record {record_id} has sufficient columns")
+                data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(f"Insufficient columns for record {record_id}")
+        else:
+            if verbose:
+                print(f"Record file not found for ID: {db_path / record_id}")
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_length = max(len(data) for data in data_list)
+
+    # Pad all sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Pad with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "MITDB"
+
+    parameters = {
+        "recordings_id": [["100", "201", "109", "105", "111", "221"]],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the MITDB dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        if self.recordings_id in (["all"], "all"):
+            self.recordings_id = None
+        X, y_true = load_mitdb_data(PATH, self.recordings_id)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, : int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, -2000:]
+            X_test = X_test[:, -2000:]
+            y_test = y_test[:, -2000:]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(X_train=X_train, y_test=y_test, X_test=X_test)
diff --git a/datasets/msl.py b/datasets/msl.py
index fe177ba..aaa2fe2 100644
--- a/datasets/msl.py
+++ b/datasets/msl.py
@@ -1,8 +1,7 @@
-from benchopt import BaseDataset, safe_import_context, config
+from benchopt import BaseDataset, config
 
-with safe_import_context() as import_ctx:
-    import numpy as np
-    import requests
+import numpy as np
+import requests
 
 # Create global variables to store the urls
 URL_XTRAIN = (
@@ -58,7 +57,12 @@ def get_data(self):
             X_test = X_test[:1000]
             y_test = y_test[:1000]
 
-        print(X_train.shape, X_test.shape, y_test.shape)
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        # For MSL, treat as single recording
+        n_features = X_train.shape[1]
+        X_train = X_train.T.reshape(1, n_features, -1)
+        X_test = X_test.T.reshape(1, n_features, -1)
+        y_test = y_test.reshape(1, -1)
 
         return dict(
             X_train=X_train, y_test=y_test, X_test=X_test
diff --git a/datasets/nab.py b/datasets/nab.py
new file mode 100644
index 0000000..20a0960
--- /dev/null
+++ b/datasets/nab.py
@@ -0,0 +1,123 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("NAB")
+
+
+def load_data(db_path, record_ids=None, verbose=False):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+        verbose: If True, print loading progress information.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is None:
+        # Get all available record files
+        record_files = list(db_path.glob("NAB_data_*.out"))
+        record_ids = [f.name.split('_')[2] for f in record_files]
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        record_files = list(db_path.glob(f"NAB_data_{record_id}_*.out"))
+        if record_files:
+            # Take the first matching file
+            record_file = record_files[0]
+            # Load the record data
+            record_data = pd.read_csv(
+                record_file, header=None).dropna().to_numpy()
+            # Assuming first column is the data, second column is labels
+            if record_data.shape[1] >= 2:
+                data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(f"Insufficient columns for record {record_id}")
+        else:
+            if verbose:
+                print(f"Record file not found for: {record_id}")
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_length = max(len(data) for data in data_list)
+
+    # Pad all sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Pad with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "NAB"
+
+    parameters = {
+        "recordings_id": [["art0"], ["art1"], ["CloudWatch"]],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the NAB dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        X, y_true = load_data(PATH, self.recordings_id)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X_train.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/occupancy.py b/datasets/occupancy.py
new file mode 100644
index 0000000..cddb6e5
--- /dev/null
+++ b/datasets/occupancy.py
@@ -0,0 +1,141 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("OCCUPANCY")
+
+
+def load_data(db_path, record_ids=None, verbose=False):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load for testing.
+        verbose: If True, print loading progress information.
+
+    Returns:
+        tuple: (X_train, X_test, y_test) where:
+            - X_train: numpy array of shape (num_records, num_samples)
+            - X_test: numpy array of shape (num_records, num_samples)
+            - y_test: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    # Load training data
+    train_files = sorted(list(db_path.glob("room-occupancy.train.csv@*.out")))
+    if verbose:
+        print(train_files)
+    if not train_files:
+        raise FileNotFoundError("No training files found.")
+    train_data_list = [
+        pd.read_csv(f, header=None).dropna().to_numpy()[:, 0].astype(float)
+        for f in train_files
+    ]
+    # Concatenate all training series into a single array
+    X_train = np.concatenate(train_data_list)
+
+    # Load testing data
+    if record_ids is None:
+        record_ids = sorted(
+            list(set(
+                f.name.split('.')[0].split('-')[-1]
+                for f in db_path.glob("room-occupancy-*.test.csv@*.out")
+            ))
+        )
+
+    test_data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        test_files = sorted(
+            list(db_path.glob(f"room-occupancy-{record_id}.test.csv@*.out"))
+        )
+        if not test_files:
+            if verbose:
+                print(f"No test files found for record_id {record_id}")
+            continue
+
+        for test_file in test_files:
+            record_data = pd.read_csv(
+                test_file, header=None).dropna().to_numpy()
+            if record_data.shape[1] >= 2:
+                test_data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(
+                        f"Insufficient columns "
+                        f"for record file {test_file.name}")
+
+    if not test_data_list:
+        raise ValueError("No valid test data found")
+
+    # Find maximum length for padding test data
+    max_length = max(len(data) for data in test_data_list)
+
+    # Pad all test sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(test_data_list, labels_list):
+        pad_width = max_length - len(data)
+        if pad_width > 0:
+            padded_data.append(
+                np.pad(
+                    data, (
+                        0,
+                        pad_width),
+                    mode="constant",
+                    constant_values=data[-1]
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels, (0, pad_width), mode="constant", constant_values=0
+                )
+            )
+        else:
+            padded_data.append(data)
+            padded_labels.append(labels)
+
+    X_test = np.array(padded_data)
+    y_test = np.array(padded_labels)
+
+    # Reshape X_train to be 2D
+    X_train = X_train.reshape(1, -1)
+
+    return X_train, X_test, y_test
+
+
+class Dataset(BaseDataset):
+    name = "OCCUPANCY"
+
+    parameters = {
+        "recordings_id": [None],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the OCCUPANCY dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        X_train, X_test, y_test = load_data(PATH, self.recordings_id)
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X_train.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/opportunity.py b/datasets/opportunity.py
new file mode 100644
index 0000000..3968a2b
--- /dev/null
+++ b/datasets/opportunity.py
@@ -0,0 +1,126 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("OPPORTUNITY")
+
+
+def load_data(db_path, record_ids=None, verbose=False):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+        verbose: If True, print loading progress information.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is None:
+        # Get all available record files with S*-ADL*.test.csv@*.out pattern
+        record_files = list(db_path.glob("S*-ADL*.test.csv@*.out"))
+        # Extract record_id from filename
+        record_ids = [f.name.split('-')[0][1:] for f in record_files]
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        # Find files matching the pattern S{record_id}-ADL*.test.csv@*.out
+        pattern = f"S{record_id}-ADL*.test.csv@*.out"
+        matching_files = list(db_path.glob(pattern))
+
+        if matching_files:
+            record_file = matching_files[0]  # Take first match
+            # Load the record data
+            record_data = pd.read_csv(
+                record_file, header=None).dropna().to_numpy()
+            # Assuming first column is the data, second column is labels
+            if record_data.shape[1] >= 2:
+                data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(f"Insufficient columns for record {record_id}")
+        else:
+            if verbose:
+                print(f"Record file not found for pattern: {pattern}")
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_length = max(len(data) for data in data_list)
+
+    # Pad all sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Pad with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "OPPORTUNITY"
+
+    parameters = {
+        "recordings_id": [["1", "2"]],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the OPPORTUNITY dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        X, y_true = load_data(PATH, self.recordings_id)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/pattern.py b/datasets/pattern.py
new file mode 100644
index 0000000..42b4cd3
--- /dev/null
+++ b/datasets/pattern.py
@@ -0,0 +1,62 @@
+from benchopt import BaseDataset
+
+import numpy as np
+from rosecdl.utils.utils_signal import generate_experiment
+
+
+class Dataset(BaseDataset):
+    name = "Pattern"
+
+    parameters = {
+        "n_samples": [10],
+        "n_times": [5000],
+        "debug": [False],
+        "random_state": [42],
+        "n_times_atom": [250],
+    }
+
+    def get_data(self):
+        if self.debug:
+            self.n_samples = 2
+            self.n_times = 1000
+
+        contamination_params = {
+            "n_atoms": 2,
+            "sparsity": 3,
+            "init_z": "constant",
+            "init_z_kwargs": {"value": 50},
+        }
+
+        simulation_params = {
+            "n_trials": self.n_samples * 2,
+            "n_channels": 2,
+            "n_times": self.n_times,
+            "n_atoms": 2,
+            "n_times_atom": self.n_times_atom,
+            "n_atoms_extra": 2,  # extra atoms in the learned dictionary
+            "D_init": "random",
+            "window": True,
+            "contamination_params": contamination_params,
+            "init_d": "shapes",
+            "init_d_kwargs": {"shapes": ["sin", "gaussian"]},
+            "init_z": "constant",
+            "init_z_kwargs": {"value": 1},
+            "noise_std": 0.01,
+            "rng": self.random_state,
+            "sparsity": 20,
+        }
+
+        X, _, _, _, info_contam = generate_experiment(
+            simulation_params=simulation_params,
+            return_info_contam=True,
+        )
+
+        X_train, X_test = X[: self.n_samples], X[self.n_samples:]
+        y_test = info_contam["outliers_mask"][self.n_samples:]
+        y_test = np.any(y_test, axis=1)
+
+        print(f"X_train shape: {X_train.shape}")
+        print(f"X_test shape: {X_test.shape}")
+        print(f"y_test shape: {y_test.shape}")
+
+        return dict(X_train=X_train, y_test=y_test, X_test=X_test)
diff --git a/datasets/psm.py b/datasets/psm.py
index bd5e60f..b5ce22c 100644
--- a/datasets/psm.py
+++ b/datasets/psm.py
@@ -1,8 +1,7 @@
-from benchopt import BaseDataset, safe_import_context, config
+from benchopt import BaseDataset, config
 
-with safe_import_context() as import_ctx:
-    import requests
-    import pandas as pd
+import requests
+import pandas as pd
 
 URL_XTRAIN = (
     "https://drive.google.com/uc?&id=1d3tAbYTj0CZLhB7z3IDTfTRg3E7qj_tw"
diff --git a/datasets/sensorscope.py b/datasets/sensorscope.py
new file mode 100644
index 0000000..7bcdb9a
--- /dev/null
+++ b/datasets/sensorscope.py
@@ -0,0 +1,125 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("SENSORSCOPE")
+
+
+def load_data(db_path, record_ids=None, verbose=False):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+        verbose: If True, print loading progress information.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is None:
+        # Get all available record files with stb pattern
+        record_files = list(db_path.glob("stb-*.test.out"))
+        record_ids = [f.name for f in record_files]
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        # Handle direct filename or construct pattern
+        if record_id.startswith('stb-') and record_id.endswith('.test.out'):
+            record_file = db_path / record_id
+        else:
+            record_file = db_path / f"stb-{record_id}.test.out"
+        if record_file.exists():
+            # Load the record data
+            record_data = pd.read_csv(
+                record_file, header=None).dropna().to_numpy()
+            # Assuming first column is the data, second column is labels
+            if record_data.shape[1] >= 2:
+                data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(f"Insufficient columns for record {record_id}")
+        else:
+            if verbose:
+                print(f"Record file not found: {record_file}")
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_length = max(len(data) for data in data_list)
+
+    # Pad all sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Pad with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "SENSORSCOPE"
+
+    parameters = {
+        "recordings_id": [["10", "11"]],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the SENSORSCOPE dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        X, y_true = load_data(PATH, self.recordings_id)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X_train.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/simulated.py b/datasets/simulated.py
index 7f48524..a91b101 100644
--- a/datasets/simulated.py
+++ b/datasets/simulated.py
@@ -1,8 +1,7 @@
-from benchopt import BaseDataset, safe_import_context
+from benchopt import BaseDataset
 
-with safe_import_context() as import_ctx:
-    from sklearn.datasets import make_regression
-    import numpy as np
+from sklearn.datasets import make_regression
+import numpy as np
 
 
 class Dataset(BaseDataset):
@@ -12,17 +11,17 @@ class Dataset(BaseDataset):
     requirements = ["scikit-learn"]
 
     parameters = {
-        "n_samples": [10000],
-        "n_features": [5],
+        "n_samples": [100_000],
+        "n_features": [6],
         "noise": [0.1],
-        "n_anomaly": [90],
+        "n_anomaly": [15_000],
     }
 
     test_parameters = {
-        "n_samples": [500],
-        "n_features": [5],
+        "n_samples": [64],
+        "n_features": [2],
         "noise": [0.1],
-        "n_anomaly": [90],
+        "n_anomaly": [9],
     }
 
     def get_data(self):
@@ -46,7 +45,7 @@ def get_data(self):
 
         # Adding anomalies
         y_test = np.zeros(self.n_samples)
-        for i in range(self.n_anomaly):
+        for _ in range(self.n_anomaly):
             idx = np.random.randint(self.n_samples)
             y_test[idx] = 1
 
@@ -57,4 +56,10 @@ def get_data(self):
             * 10
         )
 
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        # For simulated data, treat as single recording
+        X_train = X_train.T.reshape(1, self.n_features, -1)
+        X_test = X_test.T.reshape(1, self.n_features, -1)
+        y_test = y_test.reshape(1, -1)
+
         return dict(X_train=X_train, y_test=y_test, X_test=X_test)
diff --git a/datasets/smap.py b/datasets/smap.py
index 86dd691..756250e 100644
--- a/datasets/smap.py
+++ b/datasets/smap.py
@@ -1,9 +1,8 @@
-from benchopt import BaseDataset, safe_import_context, config
+from benchopt import BaseDataset, config
 
-with safe_import_context() as import_ctx:
-    import numpy as np
-    import requests
-    # from sklearn.model_selection import TimeSeriesSplit
+import numpy as np
+import requests
+# from sklearn.model_selection import TimeSeriesSplit
 
 URL_XTRAIN = (
     "https://drive.google.com/uc?&id=1e_JhpIURD"
@@ -63,6 +62,13 @@ def get_data(self):
             X_test = X_test[:1000]
             y_test = y_test[:1000]
 
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        # For SMAP, treat as single recording
+        n_features = X_train.shape[1]
+        X_train = X_train.T.reshape(1, n_features, -1)
+        X_test = X_test.T.reshape(1, n_features, -1)
+        y_test = y_test.reshape(1, -1)
+
         return dict(
             X_train=X_train, y_test=y_test, X_test=X_test
         )
diff --git a/datasets/smd.py b/datasets/smd.py
new file mode 100644
index 0000000..d258391
--- /dev/null
+++ b/datasets/smd.py
@@ -0,0 +1,130 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("SMD")
+
+
+def load_data(db_path, record_ids=None):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is None:
+        # Get all available record files matching the pattern
+        record_files = list(db_path.glob("machine-*-*.test.csv*"))
+        # Extract record IDs from filenames
+        record_ids = []
+        for f in record_files:
+            # Extract from machine-{record_id}-*.test.csv
+            parts = f.stem.split('-')
+            if len(parts) >= 3:
+                record_ids.append(parts[1])
+        record_ids = list(set(record_ids))  # Remove duplicates
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        # Find files matching the pattern
+        pattern = f"machine-{record_id}-*.test.csv*"
+        record_files = list(db_path.glob(pattern))
+
+        for record_file in record_files:
+            if record_file.exists():
+                # Load the record data
+                record_data = pd.read_csv(
+                    record_file, header=None).dropna().to_numpy()
+                # Assuming first column is the data, second column is labels
+                if record_data.shape[1] >= 2:
+                    data_list.append(record_data[:, 0].astype(float))
+                    labels_list.append(record_data[:, 1].astype(int))
+                else:
+                    print(f"Insufficient columns for record {record_id}")
+            else:
+                print(f"Record file not found: {record_file}")
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    # Find maximum length for padding
+    max_length = max(len(data) for data in data_list)
+
+    # Pad all sequences to the same length
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Pad with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "SMD"
+
+    parameters = {
+        "recordings_id": [["1", "2"]],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the SMD dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        X, y_true = load_data(PATH, self.recordings_id)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        # For SMD, treat as single recording
+        n_features = X_train.shape[1]
+        X_train = X_train.T.reshape(1, n_features, -1)
+        X_test = X_test.T.reshape(1, n_features, -1)
+        y_test = y_test.reshape(1, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/svdb.py b/datasets/svdb.py
new file mode 100644
index 0000000..ea127c6
--- /dev/null
+++ b/datasets/svdb.py
@@ -0,0 +1,151 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+PATH = config.get_data_path("SVDB")
+
+
+def load_data(db_path, record_ids=None, verbose=False, number=-1):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is not None and number > 0:
+        print("Warning: 'number' parameter is "
+              "ignored when 'record_ids' is provided.")
+
+    if record_ids is None:
+        record_files = list(db_path.glob("*.test.csv@*.out"))
+        record_ids = [f.name.split(".")[0] for f in record_files]
+        if number > 0:
+            record_ids = record_ids[:number]
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        # Handle case where record_id already includes the pattern
+        record_files = list(db_path.glob(f"{record_id}*test.csv@*.out"))
+        if record_files:
+            if len(record_files) > 1:
+                if verbose:
+                    print(
+                        f"Multiple files found for record ID {record_id}, "
+                        f"using the first one: {record_files[0]}"
+                    )
+            record_file = record_files[0]
+            # Load the record data
+            record_data = pd.read_csv(
+                db_path / record_file, header=None).dropna().to_numpy()
+            # Assuming first column is the data, second column is labels
+            if verbose:
+                print(
+                    f"Loaded record {record_id} "
+                    f"with shape {record_data.shape}")
+            if record_data.shape[1] >= 2:
+                if verbose:
+                    print(f"Record {record_id} has sufficient columns")
+                data_list.append(record_data[:, 0].astype(float))
+                labels_list.append(record_data[:, 1].astype(int))
+            else:
+                if verbose:
+                    print(f"Insufficient columns for record {record_id}")
+        else:
+            if verbose:
+                print(f"Record file not found for ID: {db_path / record_id}")
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    max_length = max(len(data) for data in data_list)
+
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Padding with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "SVDB"
+
+    parameters = {
+        "recordings_id": [["801"]],
+        "number": [-1],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the SVDB dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        if self.recordings_id in (["all"], "all"):
+            self.recordings_id = None
+        X, y_true = load_data(PATH, self.recordings_id, number=self.number)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X_train.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        plt.figure(figsize=(6, 3))
+        plt.plot(X_train[0, 0, :500], linewidth=1.2)
+        plt.plot(range(350, 360),
+                 X_train[0, 0, 350:360], color="orange", linewidth=3)
+        plt.title("SVDB dataset")
+        plt.tight_layout()
+        plt.savefig("svdb_example.png")
+        plt.close()
+
+        print("PLOT SAVED")
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/datasets/swat.py b/datasets/swat.py
index 5400d4f..943aad6 100644
--- a/datasets/swat.py
+++ b/datasets/swat.py
@@ -1,14 +1,13 @@
-from benchopt import BaseDataset, safe_import_context
+from benchopt import BaseDataset
 from benchopt.config import get_data_path
 from benchmark_utils import check_data
 
-with safe_import_context() as import_ctx:
-    import pandas as pd
+import pandas as pd
 
-    # Checking if the data is available
-    PATH = get_data_path(key="SWaT")
-    TRAIN_PATH = check_data(PATH, "SWaT", "train")
-    TEST_PATH = check_data(PATH, "SWaT", "test")
+# Checking if the data is available
+PATH = get_data_path(key="SWaT")
+TRAIN_PATH = check_data(PATH, "SWaT", "train")
+TEST_PATH = check_data(PATH, "SWaT", "test")
 
 
 class Dataset(BaseDataset):
@@ -45,6 +44,13 @@ def get_data(self):
             X_test = X_test[:1000]
             y_test = y_test[:1000]
 
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        # For SWaT, treat as single recording
+        n_features = X_train.shape[1]
+        X_train = X_train.T.reshape(1, n_features, -1)
+        X_test = X_test.T.reshape(1, n_features, -1)
+        y_test = y_test.reshape(1, -1)
+
         return dict(
             X_train=X_train, y_test=y_test, X_test=X_test
         )
diff --git a/datasets/trend.py b/datasets/trend.py
new file mode 100644
index 0000000..44db101
--- /dev/null
+++ b/datasets/trend.py
@@ -0,0 +1,79 @@
+from benchopt import BaseDataset
+
+import numpy as np
+from rosecdl.utils.utils_signal import generate_experiment
+
+
+class Dataset(BaseDataset):
+    name = "Trend"
+
+    parameters = {
+        "n_samples": [10],
+        "n_times": [5000],
+        "debug": [False],
+        "random_state": [42],
+        "n_times_atom": [250],
+        "trend_scale": [9],
+        "freq": [4],  # frequency multiplier for the trend
+    }
+
+    def get_data(self):
+        if self.debug:
+            self.n_samples = 2
+            self.n_times = 1000
+
+        contamination_params = {
+            "n_atoms": 2,
+            "sparsity": 3,
+            "init_z": "constant",
+            "init_z_kwargs": {"value": 50},
+        }
+
+        simulation_params = {
+            "n_trials": self.n_samples * 2,
+            "n_channels": 2,
+            "n_times": self.n_times,
+            "n_atoms": 2,
+            "n_times_atom": self.n_times_atom,
+            "n_atoms_extra": 2,  # extra atoms in the learned dictionary
+            "D_init": "random",
+            "window": True,
+            "contamination_params": contamination_params,
+            "init_d": "shapes",
+            "init_d_kwargs": {"shapes": ["sin", "gaussian"]},
+            "init_z": "constant",
+            "init_z_kwargs": {"value": 1},
+            "noise_std": 0.01,
+            "rng": self.random_state,
+            "sparsity": 20,
+        }
+
+        X, _, _, _, info_contam = generate_experiment(
+            simulation_params=simulation_params,
+            return_info_contam=True,
+        )
+
+        # Add low frequency sinusoidal trend
+        t = np.linspace(0, self.freq * np.pi, self.n_times)
+        trend = self.trend_scale * np.sin(t)
+        X += trend[None, None, :]
+
+        X_train, X_test = X[: self.n_samples], X[self.n_samples:]
+        y_test = info_contam["outliers_mask"][self.n_samples:]
+        y_test = np.any(y_test, axis=1)
+
+        import matplotlib.pyplot as plt
+        # Plot example time series with trend
+        plt.figure(figsize=(10, 4))
+        plt.plot(X_train[0, 0, :])
+        plt.title('Example Time Series with Added Trend')
+        plt.xlabel('Time')
+        plt.ylabel('Value')
+        plt.legend()
+        plt.show()
+
+        print(f"X_train shape: {X_train.shape}")
+        print(f"X_test shape: {X_test.shape}")
+        print(f"y_test shape: {y_test.shape}")
+
+        return dict(X_train=X_train, y_test=y_test, X_test=X_test)
diff --git a/datasets/wadi.py b/datasets/wadi.py
index d890ec6..1c5c502 100644
--- a/datasets/wadi.py
+++ b/datasets/wadi.py
@@ -1,14 +1,13 @@
-from benchopt import BaseDataset, safe_import_context
+from benchopt import BaseDataset
 from benchopt.config import get_data_path
 from benchmark_utils import check_data
 
-with safe_import_context() as import_ctx:
-    import pandas as pd
+import pandas as pd
 
-    # Checking if the data is available
-    PATH = get_data_path(key="WADI")
-    TRAIN_PATH = check_data(PATH, "WADI", "train")
-    TEST_PATH = check_data(PATH, "WADI", "test")
+# Checking if the data is available
+PATH = get_data_path(key="WADI")
+TRAIN_PATH = check_data(PATH, "WADI", "train")
+TEST_PATH = check_data(PATH, "WADI", "test")
 
 
 class Dataset(BaseDataset):
@@ -66,6 +65,13 @@ def get_data(self):
             X_test = X_test[:1000]
             y_test = y_test[:1000]
 
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        # For WADI, treat as single recording
+        n_features = X_train.shape[1]
+        X_train = X_train.T.reshape(1, n_features, -1)
+        X_test = X_test.T.reshape(1, n_features, -1)
+        y_test = y_test.reshape(1, 1, -1)
+
         return dict(
             X_train=X_train, y_test=y_test, X_test=X_test
         )
diff --git a/datasets/yahoo.py b/datasets/yahoo.py
new file mode 100644
index 0000000..181ef0e
--- /dev/null
+++ b/datasets/yahoo.py
@@ -0,0 +1,140 @@
+from benchopt import BaseDataset, config
+
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+PATH = config.get_data_path("YAHOO")
+
+
+def load_data(db_path, record_ids=None, verbose=False):
+    """
+    Load data from the database path for specified record IDs.
+
+    Args:
+        db_path: Path to the database directory
+        record_ids: List of record IDs to load.
+        If None, loads all available records.
+
+    Returns:
+        tuple: (X, y_true) where:
+            - X: numpy array of shape (num_records, num_samples)
+            - y_true: numpy array of shape (num_records, num_samples)
+    """
+    db_path = Path(db_path)
+
+    if record_ids is None:
+        record_files = list(db_path.glob("*.data.out"))
+        record_ids = [f.name for f in record_files]
+
+    data_list = []
+    labels_list = []
+    for record_id in record_ids:
+        # Handle case where record_id already includes the pattern
+        if record_id.endswith('.data.out'):
+            pattern = record_id
+        else:
+            # Create pattern based on the A{record_id} format
+            patterns = [
+                f"Yahoo_A{record_id}real_*_data.out",
+                f"Yahoo_A{record_id}synthetic_*_data.out",
+                f"YahooA{record_id}Benchmark-TS*_data.out"
+            ]
+
+        # Find all matching files for this record_id
+        matching_files = []
+        if record_id.endswith('.data.out'):
+            matching_files = list(db_path.glob(pattern))
+        else:
+            for pattern in patterns:
+                matching_files.extend(list(db_path.glob(pattern)))
+
+        if not matching_files:
+            if verbose:
+                print(f"No files found for record {record_id}")
+            continue
+
+        for record_file in matching_files:
+            if record_file.exists():
+                record_data = pd.read_csv(
+                    record_file, header=None).dropna().to_numpy()
+                # First column is the data, second column is labels
+                if record_data.shape[1] >= 2:
+                    data_list.append(record_data[:, 0].astype(float))
+                    labels_list.append(record_data[:, 1].astype(int))
+                else:
+                    if verbose:
+                        print(f"Insufficient columns for file {record_file}")
+            else:
+                if verbose:
+                    print(f"Record file not found: {record_file}")
+
+    if not data_list:
+        raise ValueError("No valid data found")
+
+    max_length = max(len(data) for data in data_list)
+
+    padded_data = []
+    padded_labels = []
+    for data, labels in zip(data_list, labels_list):
+        if len(data) < max_length:
+            # Padding with last value for data and 0 for labels
+            padded_data.append(
+                np.pad(
+                    data,
+                    (0, max_length - len(data)),
+                    mode="constant",
+                    constant_values=data[-1],
+                )
+            )
+            padded_labels.append(
+                np.pad(
+                    labels,
+                    (0, max_length - len(labels)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            )
+        else:
+            padded_data.append(data[:max_length])
+            padded_labels.append(labels[:max_length])
+
+    return np.array(padded_data), np.array(padded_labels)
+
+
+class Dataset(BaseDataset):
+    name = "YAHOO"
+
+    parameters = {
+        "recordings_id": [["1"]],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        """Load the YAHOO dataset."""
+
+        # X shape (n_recordings, n_samples)
+        # y shape (n_recordings, n_samples)
+        X, y_true = load_data(PATH, self.recordings_id)
+
+        X_test = X.copy()
+        y_test = y_true.copy()
+
+        X_train = X[:, :int(X.shape[1] * 0.1)]
+
+        if self.debug:
+            X_train = X_train[:, :1000]
+            X_test = X_test[:, :1000]
+            y_test = y_test[:, :1000]
+
+        # Reshaping data to (n_recordings, n_features, n_samples)
+        n_recordings = X_train.shape[0]
+        X_train = X_train.reshape(n_recordings, 1, -1)
+        X_test = X_test.reshape(n_recordings, 1, -1)
+        y_test = y_test.reshape(n_recordings, -1)
+
+        return dict(
+            X_train=X_train,
+            y_test=y_test,
+            X_test=X_test
+        )
diff --git a/objective.py b/objective.py
index 3d34e42..04dbde5 100644
--- a/objective.py
+++ b/objective.py
@@ -1,20 +1,25 @@
-from benchopt import BaseObjective, safe_import_context
+from benchopt import BaseObjective
 from benchmark_utils.metrics import (
     soft_precision as soft_precision_score,
     soft_recall as soft_recall_score,
     soft_f1 as soft_f1_score,
-    ctt, ttc,
+    ctt,
+    ttc,
     extract_anomaly_ranges,
     precision_t as precision_t_score,
     recall_t as recall_t_score,
-    f1_t as f1_t_score
+    f1_t as f1_t_score,
 )
 
-with safe_import_context() as import_ctx:
-    import numpy as np
-    from sklearn.metrics import (
-        precision_score, recall_score, f1_score, zero_one_loss
-    )
+import numpy as np
+from sklearn.metrics import (
+    average_precision_score,
+    precision_score,
+    recall_score,
+    f1_score,
+    zero_one_loss,
+    roc_auc_score,
+)
 
 
 class Objective(BaseObjective):
@@ -23,80 +28,301 @@ class Objective(BaseObjective):
     install_cmd = "conda"
     requirements = ["scikit-learn"]
 
+    parameters = {
+        "score_metrics": [("auc_pr", "auc_roc")],
+        "prediction_metrics": [None],
+    }
+
+    detection_ranges = (1, 3, 5, 10, 20)
+    default_prediction_metrics = (
+        "precision",
+        "recall",
+        "f1",
+        "precision_t",
+        "recall_t",
+        "f1_t",
+        "ctt",
+        "ttc",
+        "zoloss",
+        "soft_precision",
+        "soft_recall",
+        "soft_f1",
+    )
+
     def get_one_result(self):
-        """Return one solution for which the objective can be computed,
-        Used to get the shape of the result.
-        Our algorithms will return an array of labels of shape (n_samples,)
-        """
-        return dict(y_hat=np.ones(self.X_test.shape[0]))
+        """Return one solution for which the objective can be computed."""
+        score_metrics = self._normalize_metrics(
+            getattr(self, "score_metrics", ("auc_pr", "auc_roc"))
+        )
+        prediction_metrics = self._expand_prediction_metrics(
+            getattr(self, "prediction_metrics", None)
+        )
+
+        result = {}
+        if score_metrics:
+            result["anomaly_scores"] = np.zeros_like(
+                self.y_test, dtype=float
+            )
+        if prediction_metrics:
+            result["anomaly_predictions"] = np.zeros_like(
+                self.y_test, dtype=int
+            )
+        return result
 
     def set_data(self, X_train, y_test, X_test):
         "Set the data to compute the objective."
         self.X_train = X_train
         self.X_test, self.y_test = X_test, y_test
 
-    def evaluate_result(self, y_hat):
-        """Evaluate the result provided by the solver."""
-        to_discard = (y_hat == -1).sum()
-        self.y_test = self.y_test[to_discard:]
-        y_hat = y_hat[to_discard:]
-
-        result = {}
-        detection_ranges = [1, 3, 5, 10, 20]
-
-        # Standard metrics
-        precision = precision_score(self.y_test, y_hat, zero_division=0)
-        recall = recall_score(self.y_test, y_hat, zero_division=0)
-        f1 = f1_score(self.y_test, y_hat, zero_division=0)
+    def evaluate_result(
+        self,
+        anomaly_scores=None,
+        anomaly_predictions=None,
+    ):
+        """Evaluate the result provided by the solver.
 
-        anomaly_ranges = extract_anomaly_ranges(self.y_test)
-        prediction_ranges = extract_anomaly_ranges(y_hat)
+        anomaly_scores is the score-based solver output.
+        anomaly_predictions is optional and only needed when requesting
+        prediction-based metrics.
+        """
+        score_metrics = self._normalize_metrics(
+            getattr(self, "score_metrics", ("auc_pr", "auc_roc"))
+        )
+        prediction_metrics = self._expand_prediction_metrics(
+            getattr(self, "prediction_metrics", None)
+        )
 
-        precision_t = precision_t_score(anomaly_ranges, prediction_ranges)
-        recall_t = recall_t_score(anomaly_ranges, prediction_ranges)
-        f1_t = f1_t_score(anomaly_ranges, prediction_ranges)
+        if score_metrics and anomaly_scores is None:
+            raise ValueError("score_metrics require an anomaly_scores array.")
+        if prediction_metrics and anomaly_predictions is None:
+            raise ValueError(
+                "prediction_metrics require an anomaly_predictions array.")
 
-        result.update({
-            "precision": precision,
-            "recall": recall,
-            "f1": f1
-        })
+        y_true, scores, predictions = self._align_inputs(
+            anomaly_scores=anomaly_scores,
+            anomaly_predictions=anomaly_predictions,
+        )
 
-        for range_value in detection_ranges:
-            soft_precision = soft_precision_score(
-                self.y_test, y_hat, detection_range=range_value
+        result = {}
+        if score_metrics:
+            result.update(
+                self._compute_score_metrics(
+                    y_true=y_true,
+                    anomaly_scores=scores,
+                    metrics=score_metrics,
+                )
             )
-            soft_recall = soft_recall_score(
-                self.y_test, y_hat, detection_range=range_value
+        if prediction_metrics:
+            result.update(
+                self._compute_prediction_metrics(
+                    y_true=y_true,
+                    anomaly_predictions=predictions,
+                    metrics=prediction_metrics,
+                )
             )
-            soft_f1 = soft_f1_score(soft_precision, soft_recall)
-
-            result.update({
-                f"soft_precision_{range_value}": soft_precision,
-                f"soft_recall_{range_value}": soft_recall,
-                f"soft_f1_{range_value}": soft_f1
-            })
-
-        zoloss = zero_one_loss(self.y_test, y_hat)
-
-        # Other metrics
-        cct_score = ctt(self.y_test, y_hat)
-        ttc_score = ttc(self.y_test, y_hat)
-
-        # Add remaining metrics to the result dictionary
-        result.update({
-            "precision_t": precision_t,
-            "recall_t": recall_t,
-            "f1_t": f1_t,
-            "cct": cct_score,
-            "ttc": ttc_score,
-            "zoloss": zoloss,
-            "value": zoloss  # having zoloss twice for the API
-        })
 
+        # Setting value to 0. The actual value is not used for ranking.
+        result["value"] = 0.0
         return result
 
     def get_objective(self):
-        return dict(
-            X_train=self.X_train, y_test=None, X_test=self.X_test
+        return dict(X_train=self.X_train, X_test=self.X_test)
+
+    def _normalize_metrics(self, metrics):
+        if metrics is None:
+            return ()
+        if isinstance(metrics, str):
+            if metrics == "all":
+                return ("auc_pr", "auc_roc")
+            return (metrics,)
+        return tuple(metric for metric in metrics if metric is not None)
+
+    def _expand_prediction_metrics(self, metrics):
+        metrics = self._normalize_prediction_metrics(metrics)
+        expanded = []
+
+        for metric in metrics:
+            if metric == "all":
+                metric = self.default_prediction_metrics
+            else:
+                metric = (metric,)
+
+            for name in metric:
+                if name in {
+                    "soft_precision",
+                    "soft_recall",
+                    "soft_f1",
+                }:
+                    expanded.extend(
+                        f"{name}_{detection_range}"
+                        for detection_range in self.detection_ranges
+                    )
+                else:
+                    expanded.append(name)
+
+        return tuple(expanded)
+
+    def _normalize_prediction_metrics(self, metrics):
+        if metrics is None:
+            return ()
+        if isinstance(metrics, str):
+            return (metrics,)
+        return tuple(metric for metric in metrics if metric is not None)
+
+    def _align_inputs(self, anomaly_scores, anomaly_predictions):
+        # flatten everything before aligning lengths.
+        y_true = np.asarray(self.y_test).reshape(-1)
+        scores = self._as_flat_array(anomaly_scores)
+        predictions = self._as_flat_array(anomaly_predictions)
+
+        # Only align against arrays that were returned. This keeps
+        # score-only and prediction-only evaluations valid.
+        arrays = [array for array in (
+            scores, predictions) if array is not None]
+        if not arrays:
+            return y_true, None, None
+
+        # Windowed solvers return fewer outputs than y_test because the
+        # first timestamps have no full context window. Keep the last samples,
+        # which correspond to the part of y_test the solver scored.
+        length = min([len(y_true)] + [len(array) for array in arrays])
+        y_true = y_true[-length:]
+        if scores is not None:
+            scores = scores[-length:]
+        if predictions is not None:
+            predictions = predictions[-length:]
+
+        # Drop invalid positions. NaN score padding and -1 prediction padding
+        # When both scores and predictions are present, the same mask is
+        # applied to keep mixed metric requests on the same timestamps.
+        valid = np.ones(length, dtype=bool)
+        if scores is not None:
+            valid &= ~np.isnan(scores)
+        if predictions is not None:
+            valid &= ~np.isnan(predictions)
+            valid &= predictions != -1
+
+        y_true = y_true[valid]
+        if scores is not None:
+            scores = scores[valid]
+        if predictions is not None:
+            predictions = predictions[valid]
+
+        return y_true, scores, predictions
+
+    def _as_flat_array(self, array):
+        if array is None:
+            return None
+        return np.asarray(array).reshape(-1)
+
+    def _compute_score_metrics(self, y_true, anomaly_scores, metrics):
+        if len(y_true) == 0:
+            return {metric: np.nan for metric in metrics}
+
+        result = {}
+        for metric in metrics:
+            if metric == "auc_roc":
+                result[metric] = self._safe_auc_roc(y_true, anomaly_scores)
+            elif metric == "auc_pr":
+                result[metric] = self._auc_pr(y_true, anomaly_scores)
+            else:
+                raise ValueError(f"Unknown score metric: {metric}")
+        return result
+
+    def _compute_prediction_metrics(
+            self,
+            y_true,
+            anomaly_predictions,
+            metrics,
+    ):
+        if len(y_true) == 0:
+            return {metric: np.nan for metric in metrics}
+
+        result = {}
+        anomaly_ranges = None
+        prediction_ranges = None
+
+        for metric in metrics:
+            if metric == "precision":
+                result[metric] = precision_score(
+                    y_true, anomaly_predictions, zero_division=0
+                )
+            elif metric == "recall":
+                result[metric] = recall_score(
+                    y_true, anomaly_predictions, zero_division=0
+                )
+            elif metric == "f1":
+                result[metric] = f1_score(
+                    y_true, anomaly_predictions, zero_division=0)
+            elif metric == "zoloss":
+                result[metric] = zero_one_loss(y_true, anomaly_predictions)
+            elif metric in {"precision_t", "recall_t", "f1_t"}:
+                if anomaly_ranges is None:
+                    anomaly_ranges, prediction_ranges = self._get_ranges(
+                        y_true, anomaly_predictions
+                    )
+                if metric == "precision_t":
+                    result[metric] = precision_t_score(
+                        anomaly_ranges, prediction_ranges
+                    )
+                elif metric == "recall_t":
+                    result[metric] = recall_t_score(
+                        anomaly_ranges, prediction_ranges)
+                else:
+                    result[metric] = f1_t_score(
+                        anomaly_ranges, prediction_ranges)
+            elif metric == "ctt":
+                result[metric] = ctt(y_true, anomaly_predictions)
+            elif metric == "ttc":
+                result[metric] = ttc(y_true, anomaly_predictions)
+            elif metric.startswith("soft_precision_"):
+                detection_range = self._parse_detection_range(
+                    metric, "soft_precision")
+                result[metric] = soft_precision_score(
+                    y_true,
+                    anomaly_predictions,
+                    detection_range=detection_range,
+                )
+            elif metric.startswith("soft_recall_"):
+                detection_range = self._parse_detection_range(
+                    metric, "soft_recall")
+                result[metric] = soft_recall_score(
+                    y_true,
+                    anomaly_predictions,
+                    detection_range=detection_range,
+                )
+            elif metric.startswith("soft_f1_"):
+                detection_range = self._parse_detection_range(
+                    metric, "soft_f1")
+                result[metric] = soft_f1_score(
+                    y_true,
+                    anomaly_predictions,
+                    detection_range=detection_range,
+                )
+            else:
+                raise ValueError(f"Unknown prediction metric: {metric}")
+
+        return result
+
+    def _get_ranges(self, y_true, anomaly_predictions):
+        return (
+            extract_anomaly_ranges(y_true),
+            extract_anomaly_ranges(anomaly_predictions),
         )
+
+    def _parse_detection_range(self, metric, prefix):
+        suffix = metric.replace(f"{prefix}_", "", 1)
+        try:
+            return int(suffix)
+        except ValueError as exc:
+            raise ValueError(
+                f"Invalid detection range in prediction metric: {metric}"
+            ) from exc
+
+    def _safe_auc_roc(self, y_true, anomaly_scores):
+        return roc_auc_score(y_true, anomaly_scores)
+
+    def _auc_pr(self, y_true, anomaly_scores):
+        if len(np.unique(y_true)) == 1:
+            return np.nan
+        return average_precision_score(y_true, anomaly_scores)
diff --git a/solvers/AR.py b/solvers/AR.py
index ff2d547..50eb908 100644
--- a/solvers/AR.py
+++ b/solvers/AR.py
@@ -1,20 +1,21 @@
 # AR model
-from benchopt import BaseSolver, safe_import_context
-from benchmark_utils import mean_overlaping_pred
+from benchopt import BaseSolver
+
+import torch
+from torch import optim, nn
+import numpy as np
+from tqdm import tqdm
 
-with safe_import_context() as import_ctx:
-    import torch
-    from torch import optim, nn
-    import numpy as np
-    from tqdm import tqdm
-    from benchmark_utils.models import ARModel
+from benchmark_utils.models import ARModel
+from benchmark_utils import mean_overlaping_pred
+from benchmark_utils.predictions import cutoff_scores
 
 
 class Solver(BaseSolver):
     name = "AR"  # AutoRegressive Linear model
 
     install_cmd = "conda"
-    requirements = ["pip:torch", "tqdm"]
+    requirements = ["pytorch", "tqdm"]
 
     sampling_strategy = "run_once"
 
@@ -23,29 +24,39 @@ class Solver(BaseSolver):
         "n_epochs": [50],
         "lr": [1e-5],
         "weight_decay": [1e-7],
-        "window_size": [256],
+        "window_size": [100],
         "horizon": [1],
-        "percentile": [99.4],
+        "cutoff": [None],
+    }
+
+    test_config = {
+        "n_epochs": 1,
+        "window_size": 16,
     }
 
-    def set_objective(self, X_train, y_test, X_test):
+    def set_objective(self, X_train, X_test):
 
         self.device = torch.device(
             "cuda" if torch.cuda.is_available() else "cpu"
         )
 
-        self.X_train = X_train  # (n_samples, n_features)
-        self.X_test, self.y_test = X_test, y_test  # (n_samples, n_features)
-        self.n_features = X_train.shape[1]
+        # Receiving shapes of (n_recordings, n_features, n_samples)
+
+        _, n_features, _ = X_train.shape
+
+        # (n_samples, n_features)
+        self.X_train = X_train.reshape(-1, n_features)
+        # (n_samples, n_features)
+        self.X_test = X_test.reshape(-1, n_features)
 
         self.model = ARModel(
-            self.n_features,
+            n_features,
             self.window_size,
             self.horizon
         )
         self.optimizer = optim.Adam(
             self.model.parameters(),
-            lr=self.lr,
+            lr=float(self.lr),
             # weight_decay=self.weight_decay
         )
         self.criterion = nn.MSELoss()
@@ -53,7 +64,7 @@ def set_objective(self, X_train, y_test, X_test):
         if self.X_train is not None:
             # (n_windows, window_size+horizon, n_features)
             self.Xw_train = np.lib.stride_tricks.sliding_window_view(
-                X_train,
+                self.X_train,
                 window_shape=self.window_size+self.horizon,
                 axis=0
             ).transpose(0, 2, 1)
@@ -61,7 +72,7 @@ def set_objective(self, X_train, y_test, X_test):
         if self.X_test is not None:
             # (n_windows, window_size+horizon, n_features)
             self.Xw_test = np.lib.stride_tricks.sliding_window_view(
-                X_test,
+                self.X_test,
                 window_shape=self.window_size+self.horizon,
                 axis=0
             ).transpose(0, 2, 1)
@@ -120,38 +131,36 @@ def run(self, _):
 
         xw_hat = xw_hat.detach().cpu().numpy()
 
-        # Reconstructing the prediction from the predicted windows
-        # Creating the prediction array with -1 for the unknown values
-        # Corresponding to the first window_size values
-        x_hat = np.zeros_like(self.X_test)-1  # (n_test_samples, n_features)
-        x_hat[self.window_size:self.window_size+self.horizon] = xw_hat[0]
+        # Reconstructing the prediction from the predicted windows.
+        # The first ``window_size`` positions have no forecast (no full input
+        # window precedes them); fill them with -1 as a sentinel.
+        x_hat = np.zeros_like(self.X_test) - 1
+        x_hat[self.window_size:] = mean_overlaping_pred(xw_hat, 1)
 
-        x_hat[self.window_size+self.horizon:] = mean_overlaping_pred(
-            xw_hat, 1
+        reconstruction_err = np.abs(
+            self.X_test[self.window_size:] - x_hat[self.window_size:]
         )
-
-        # Calculating the percentile value for the threshold
-        percentile_value = np.percentile(
-            np.abs(self.X_test[self.window_size:] - x_hat[self.window_size:]),
-            self.percentile
+        self.anomaly_scores = np.full(
+            self.X_test.shape, np.nan, dtype=float
         )
+        self.anomaly_scores[self.window_size:] = reconstruction_err
+        self.anomaly_scores = np.max(self.anomaly_scores, axis=1)
 
-        # Thresholding
-        predictions = np.zeros_like(x_hat)-1
-        predictions[self.window_size:] = np.where(
-            np.abs(self.X_test[self.window_size:] -
-                   x_hat[self.window_size:]) > percentile_value, 1, 0
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
         )
 
-        self.predictions = np.max(predictions, axis=1)
-
     # Skipping the solver call if a condition is met
-    def skip(self, X_train, X_test, y_test):
-        if X_train.shape[0] < self.window_size + self.horizon:
+    def skip(self, X_train, X_test):
+        if X_train.shape[0]*X_train.shape[2] < self.window_size + self.horizon:
             return True, "No enough training samples"
-        if X_test.shape[0] < self.window_size + self.horizon:
+        if X_test.shape[0]*X_test.shape[2] < self.window_size + self.horizon:
             return True, "No enough testing samples"
         return False, None
 
     def get_result(self):
-        return dict(y_hat=self.predictions)
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/anomalybert.py b/solvers/anomalybert.py
new file mode 100644
index 0000000..8c9d1b4
--- /dev/null
+++ b/solvers/anomalybert.py
@@ -0,0 +1,295 @@
+from models.anomaly_transformer import get_anomaly_transformer
+from benchopt import BaseSolver
+
+import sys
+from pathlib import Path
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.optim.lr_scheduler import CosineAnnealingLR
+from tqdm import tqdm
+
+from benchmark_utils.predictions import cutoff_scores
+
+# Add AnomalyBERT to path
+sys.path.append(str(Path(__file__).parent.parent / 'AnomalyBERT'))
+
+
+class Solver(BaseSolver):
+    name = "AnomalyBERT"
+    sampling_strategy = "run_once"
+
+    requirements = ["pip::timm", "pytorch", "numpy", "tqdm"]
+
+    parameters = {
+        "patch_size": [1],
+        "d_embed": [512],
+        "n_layer": [6],
+        "batch_size": [128],
+        "lr": [0.0001],
+        "max_steps": [5000],
+        "n_patches": [512],
+        "seed": [548920],
+        "device": ["cuda:1"],
+        "window_sliding": [16],
+        "cutoff": [None],
+    }
+
+    sampling_strategy = "run_once"
+
+    def set_objective(self, X_train, X_test):
+        # X_train shape: (n_series, n_features, n_samples)
+        if X_train.ndim == 3:
+            self.X_train = np.transpose(
+                X_train, (0, 2, 1)).reshape(
+                    -1, X_train.shape[1]
+            ).astype(np.float32)
+            self.X_test = np.transpose(
+                X_test, (0, 2, 1)).reshape(
+                    -1, X_test.shape[1]
+            ).astype(np.float32)
+        else:
+            self.X_train = X_train.astype(np.float32)
+            self.X_test = X_test.astype(np.float32)
+
+    def run(self, _):
+        torch.manual_seed(self.seed)
+        np.random.seed(self.seed)
+
+        device = torch.device(
+            self.device if torch.cuda.is_available() else 'cpu')
+
+        train_data = self.X_train
+        d_data = train_data.shape[1]
+
+        # Configuration
+        patch_size = self.patch_size
+        # This corresponds to n_features (max_seq_len) in AnomalyBERT
+        n_patches = self.n_patches
+        data_seq_len = n_patches * patch_size
+
+        if len(train_data) <= data_seq_len:
+            raise ValueError(
+                f"Data length {len(train_data)} is smaller than "
+                f"sequence length {data_seq_len}")
+        self.model = get_anomaly_transformer(
+            input_d_data=d_data,
+            output_d_data=1,  # BCE loss
+            patch_size=patch_size,
+            d_embed=self.d_embed,
+            hidden_dim_rate=4.,
+            max_seq_len=n_patches,
+            positional_encoding=None,
+            relative_position_embedding=True,
+            transformer_n_layer=self.n_layer,
+            transformer_n_head=8,
+            dropout=0.1
+        ).to(device)
+
+        # Optimizer
+        optimizer = torch.optim.AdamW(
+            params=self.model.parameters(), lr=self.lr, weight_decay=1e-4)
+        scheduler = CosineAnnealingLR(
+            optimizer, T_max=self.max_steps, eta_min=self.lr*0.01)
+
+        train_loss_fn = nn.BCELoss().to(device)
+        sigmoid = nn.Sigmoid().to(device)
+
+        # Data Augmentation Parameters
+        replacing_rate = (0.015, 0.15)
+
+        replacing_table = list(np.random.randint(
+            int(data_seq_len*replacing_rate[0]),
+            int(data_seq_len*replacing_rate[1]),
+            size=10000))
+        replacing_table_index = 0
+        replacing_table_length = 10000
+
+        soft_replacing_prob = 1 - 0.5
+        uniform_replacing_prob = soft_replacing_prob - 0.15
+        peak_noising_prob = uniform_replacing_prob - 0.15
+
+        replacing_weight = 0.7
+
+        def replacing_weights(interval_len):
+            warmup_len = interval_len // 10
+            return np.concatenate((
+                np.linspace(0, replacing_weight, num=warmup_len),
+                np.full(interval_len-2*warmup_len, replacing_weight),
+                np.linspace(replacing_weight, 0, num=warmup_len)),
+                axis=None)
+
+        valid_index_list = np.arange(len(train_data) - data_seq_len)
+        numerical_column = np.arange(d_data)  # Assume all numerical
+
+        # Training Loop
+        self.model.train()
+        for i in tqdm(range(self.max_steps)):
+            first_index = np.random.choice(
+                valid_index_list, size=self.batch_size)
+            x = []
+            for j in first_index:
+                x.append(torch.Tensor(
+                    train_data[j:j+data_seq_len].copy()).to(device))
+
+            # Replace data logic
+            current_index = replacing_table_index
+            replacing_table_index += self.batch_size
+
+            if replacing_table_index > replacing_table_length:
+                replacing_lengths = replacing_table[current_index:] + \
+                    replacing_table[:replacing_table_index -
+                                    replacing_table_length]
+                replacing_table_index -= replacing_table_length
+            else:
+                replacing_lengths = replacing_table[
+                    current_index:replacing_table_index
+                ]
+                if replacing_table_index == replacing_table_length:
+                    replacing_table_index = 0
+
+            replacing_lengths = np.array(replacing_lengths)
+
+            target_index = np.random.randint(
+                0, data_seq_len-replacing_lengths+1)
+
+            replacing_type = np.random.uniform(0., 1., size=(self.batch_size,))
+            replacing_dim_numerical = np.random.uniform(
+                0., 1., size=(self.batch_size, d_data))
+            replacing_dim_numerical = replacing_dim_numerical - \
+                np.maximum(replacing_dim_numerical.min(
+                    axis=1, keepdims=True), 0.3) <= 0.001
+
+            x_anomaly = torch.zeros(
+                self.batch_size, data_seq_len, device=device)
+
+            for j, tar, leng, typ, dim_num in zip(
+                    range(self.batch_size),
+                    target_index, replacing_lengths,
+                    replacing_type,
+                    replacing_dim_numerical):
+                if leng > 0:
+                    _x = x[j].clone().transpose(0, 1)  # (d_data, seq_len)
+
+                    # External interval replacing
+                    if typ > soft_replacing_prob:
+                        col_num = numerical_column[dim_num]
+                        if len(col_num) > 0:
+                            # Pick random interval from train_data
+                            rep_start = np.random.randint(
+                                0, len(train_data) - leng)
+                            random_interval = train_data[rep_start:rep_start +
+                                                         leng, col_num].copy()
+
+                            # Random flip
+                            if np.random.rand() > 0.5:  # Horizontal
+                                random_interval = random_interval[::-1].copy()
+                            if np.random.rand() > 0.5:  # Vertical
+                                random_interval = 1 - random_interval
+
+                            _x_temp = torch.from_numpy(random_interval).to(
+                                device).transpose(0, 1)  # (n_cols, leng)
+
+                            weights = torch.from_numpy(replacing_weights(
+                                leng)).float().unsqueeze(0).to(device)
+                            _x[col_num, tar:tar+leng] = _x_temp * weights + \
+                                _x[col_num, tar:tar+leng] * (1 - weights)
+
+                            x_anomaly[j, tar:tar+leng] = 1
+                            x[j] = _x.transpose(0, 1)
+
+                    # Uniform replacing
+                    elif typ > uniform_replacing_prob:
+                        col_num = numerical_column[dim_num]
+                        if len(col_num) > 0:
+                            _x[col_num, tar:tar +
+                                leng] = torch.rand(
+                                    len(col_num), leng, device=device
+                            )
+                            x_anomaly[j, tar:tar+leng] = 1
+                            x[j] = _x.transpose(0, 1)
+
+                    # Peak noising
+                    elif typ > peak_noising_prob:
+                        col_num = numerical_column[dim_num]
+                        if len(col_num) > 0:
+                            peak_index = np.random.randint(0, leng)
+                            peak_value = (
+                                _x[col_num, tar+peak_index] < 0.5
+                            ).float().to(device)
+                            peak_value = peak_value + \
+                                (0.1 * (1 - 2 * peak_value)) * \
+                                torch.rand(len(col_num), device=device)
+                            _x[col_num, tar+peak_index] = peak_value
+
+                            tar_first = np.maximum(
+                                0, tar + peak_index - patch_size)
+                            tar_last = tar + peak_index + patch_size + 1
+                            x_anomaly[j, tar_first:tar_last] = 1
+                            x[j] = _x.transpose(0, 1)
+
+            z = torch.stack(x)
+            y = self.model(z)
+            y = y.squeeze(-1)
+            loss = train_loss_fn(sigmoid(y), x_anomaly)
+
+            optimizer.zero_grad()
+            loss.backward()
+            nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
+            optimizer.step()
+            scheduler.step()
+
+        device = torch.device(
+            self.device if torch.cuda.is_available() else 'cpu')
+        self.model.eval()
+
+        test_data = self.X_test
+        window_size = self.n_patches * self.patch_size
+        window_sliding = self.window_sliding  # Default from estimate.py
+        batch_size = self.batch_size
+
+        # We will just slide over the test data
+
+        n_samples = len(test_data)
+        output_values = torch.zeros(n_samples, device=device)
+        n_overlap = torch.zeros(n_samples, device=device)
+
+        sigmoid = nn.Sigmoid().to(device)
+
+        with torch.no_grad():
+            # Pad test data if needed or just handle boundaries
+            # estimate.py handles divisions. We assume 1 continuous sequence.
+
+            # We need to batch the sliding windows
+            indices = list(
+                range(0, n_samples - window_size + 1, window_sliding))
+
+            for i in range(0, len(indices), batch_size):
+                batch_indices = indices[i:i+batch_size]
+                x_batch = []
+                for idx in batch_indices:
+                    x_batch.append(test_data[idx:idx+window_size])
+
+                if not x_batch:
+                    continue
+
+                x_batch = torch.Tensor(np.stack(x_batch)).to(device)
+                # (batch, window_size)
+                y_batch = sigmoid(self.model(x_batch)).squeeze(-1)
+
+                for j, idx in enumerate(batch_indices):
+                    output_values[idx:idx+window_size] += y_batch[j]
+                    n_overlap[idx:idx+window_size] += 1
+
+        n_overlap[n_overlap == 0] = 1
+        self.anomaly_scores = (output_values / n_overlap).cpu().numpy()
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
+        )
+
+    def get_result(self):
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/autoencoder.py b/solvers/autoencoder.py
new file mode 100644
index 0000000..a36fad0
--- /dev/null
+++ b/solvers/autoencoder.py
@@ -0,0 +1,85 @@
+from benchopt import BaseSolver
+
+from sklearn.preprocessing import MinMaxScaler
+
+from benchmark_utils.models import Autoencoder
+from benchmark_utils.predictions import cutoff_scores
+from benchmark_utils.windowing import find_period_length
+
+
+class Solver(BaseSolver):
+    name = "AE"
+
+    install_cmd = "conda"
+    requirements = ["pytorch", "scikit-learn", "tqdm"]
+
+    parameters = {
+        "window_size": [10, "auto"],
+        "num_epochs": [100],
+        "batch_size": [1024],
+        "learning_rate": [1e-3],
+        "hidden_size": [64],
+        "latent_size": [32],
+        "cutoff": [None],
+    }
+
+    test_config = {
+        "window_size": 10,
+        "num_epochs": 1,
+        "batch_size": 8,
+    }
+
+    sampling_strategy = "run_once"
+
+    def set_objective(self, X_train, X_test):
+        if self.window_size == "auto":
+            self.window_size = find_period_length(X_train.reshape(-1))
+
+        # Data received has shape (n_recordings, n_features, n_samples)
+        n_features = X_train.shape[1]
+        self.X_train = X_train.reshape(-1, n_features)
+        self.X_test = X_test.reshape(-1, n_features)
+
+        # For multivariate data, input_size = window_size * n_features
+        self.clf = Autoencoder(
+            input_size=self.window_size * n_features,
+            sliding_window=self.window_size,
+            latent_size=self.latent_size,
+            hidden_size=self.hidden_size,
+        )
+
+    def run(self, _):
+        self.clf.fit(
+            self.X_train,
+            num_epochs=self.num_epochs,
+            batch_size=self.batch_size,
+            learning_rate=float(self.learning_rate),
+        )
+
+        self.clf.predict(self.X_test)
+        anomaly_scores = self.clf.decision_scores_
+
+        self.anomaly_scores = (
+            MinMaxScaler(feature_range=(0, 1))
+            .fit_transform(anomaly_scores.reshape(-1, 1))
+            .ravel()
+        )
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
+        )
+
+    def skip(self, X_train, X_test):
+        """Check if the solver can be skipped."""
+        if find_period_length(X_train.reshape(-1)) == 0 and (
+            self.window_size == "auto"
+        ):
+            return True, "Window size is 0"
+        return False, None
+
+    def get_result(self):
+        """Return the result of the solver."""
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/dagmm.py b/solvers/dagmm.py
new file mode 100644
index 0000000..148fbc3
--- /dev/null
+++ b/solvers/dagmm.py
@@ -0,0 +1,76 @@
+from benchopt import BaseSolver
+
+import pandas as pd
+from merlion.models.anomaly.dagmm import DAGMM, DAGMMConfig
+from merlion.utils.time_series import TimeSeries
+
+from benchmark_utils.predictions import cutoff_scores
+
+
+class Solver(BaseSolver):
+    name = "DAGMM"
+
+    install_cmd = "conda"
+    requirements = ["pip::salesforce-merlion", "pip::scikit-learn"]
+
+    parameters = {
+        "gmm_k": [3],
+        "hidden_size": [256],
+        "sequence_len": [10],
+        "num_epochs": [10],
+        "lr": [1e-3],
+        "batch_size": [8192],
+        "lambda_energy": [0.1],
+        "lambda_cov": [0.005],
+        "cutoff": [None],
+        # "device": ["cuda:3"]
+    }
+
+    sampling_strategy = "run_once"
+
+    def set_objective(self, X_train, X_test):
+        n_features = X_train.shape[1]
+        self.X_train = X_train.transpose(0, 2, 1).reshape(-1, n_features)
+        self.X_test = X_test.transpose(0, 2, 1).reshape(-1, n_features)
+        # Convert to Merlion TimeSeries
+        # We use a default index since we don't have timestamps
+        train_df = pd.DataFrame(self.X_train)
+        test_df = pd.DataFrame(self.X_test)
+
+        # Merlion expects a time index or it will generate one
+        self.train_data = TimeSeries.from_pd(train_df)
+        self.test_data = TimeSeries.from_pd(test_df)
+
+        # Configure DAGMM
+        config = DAGMMConfig(
+            gmm_k=self.gmm_k,
+            hidden_size=self.hidden_size,
+            sequence_len=self.sequence_len,
+            num_epochs=self.num_epochs,
+            lr=self.lr,
+            batch_size=self.batch_size,
+            lambda_energy=self.lambda_energy,
+            lambda_cov=self.lambda_cov,
+            # device=self.device
+        )
+
+        self.model = DAGMM(config)
+
+    def run(self, _):
+        # Train
+        self.model.train(self.train_data)
+
+        # Predict
+        # get_anomaly_score returns a TimeSeries of scores
+        scores_ts = self.model.get_anomaly_score(self.test_data)
+        self.anomaly_scores = scores_ts.to_pd().values.flatten()
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
+        )
+
+    def get_result(self):
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/dif.py b/solvers/dif.py
deleted file mode 100644
index 6aeef8e..0000000
--- a/solvers/dif.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Deep Isolation Forest
-from benchopt import BaseSolver
-from benchopt import safe_import_context
-
-with safe_import_context() as import_ctx:
-    from pyod.models.dif import DIF
-    import numpy as np
-
-
-class Solver(BaseSolver):
-    name = "DIF"
-
-    install_cmd = "conda"
-    requirements = ["pip:pyod"]
-
-    parameters = {
-        "contamination": [0.05, 0.1, 0.2],
-        "window": [True],
-        "window_size": [20],
-        "stride": [1],
-    }
-
-    sampling_strategy = "run_once"
-
-    def set_objective(self, X_train, y_test, X_test):
-        self.X_train = X_train
-        self.X_test, self.y_test = X_test, y_test
-        # Device is automatically selected by the model
-        # if device=None
-        self.clf = DIF(contamination=self.contamination, device=None)
-
-    def run(self, _):
-        # Using only windowed data, parameter used only for consistency
-        if self.window:
-
-            # Transofrming the data into rolling windowed data
-            if self.X_train is not None:
-                self.Xw_train = np.lib.stride_tricks.sliding_window_view(
-                    self.X_train, window_shape=self.window_size, axis=0
-                )[::self.stride].transpose(0, 2, 1)
-
-            if self.X_test is not None:
-                self.Xw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.X_test, window_shape=self.window_size, axis=0
-                )[::self.stride].transpose(0, 2, 1)
-
-            if self.y_test is not None:
-                self.yw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.y_test, window_shape=self.window_size, axis=0
-                )[::self.stride]
-
-            # Flattening the data for the model
-            flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1)
-            flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1)
-
-            self.clf.fit(flatrain)
-            raw_y_hat = self.clf.predict(flatest)
-            raw_anomaly_score = self.clf.decision_function(flatest)
-
-            # The results we get has a shape of
-            result_shape = (
-                (self.X_train.shape[0] - self.window_size) // self.stride
-            ) + 1
-
-            # Mapping the binary output from {-1, 1} to {1, 0}
-            # For consistency with the other solvers
-            self.raw_y_hat = np.array(raw_y_hat)
-            self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0)
-
-            # Adding -1 for the non predicted samples
-            # The first window_size samples are not predicted by the model
-            self.raw_y_hat = np.append(
-                np.full(self.X_train.shape[0] -
-                        result_shape, -1), self.raw_y_hat
-            )
-
-            # Anomaly scores (Not used but allows finer thresholding)
-            self.raw_anomaly_score = np.array(raw_anomaly_score)
-            self.raw_anomaly_score = np.append(
-                np.full(result_shape, -1), self.raw_anomaly_score
-            )
-
-    def skip(self, X_train, X_test, y_test):
-        if X_train.shape[0] < self.window_size:
-            return True, "Not enough samples to create a window"
-        return False, None
-
-    def get_result(self):
-        # Anomaly : 1
-        # Inlier : 0
-        # To ignore : -1
-        self.y_hat = self.raw_y_hat
-        return dict(y_hat=self.y_hat)
diff --git a/solvers/isolation-forest.py b/solvers/isolation-forest.py
deleted file mode 100644
index dac03e3..0000000
--- a/solvers/isolation-forest.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Isolation Forest solver
-
-from benchopt import BaseSolver
-from benchopt import safe_import_context
-
-with safe_import_context() as import_ctx:
-    from sklearn.ensemble import IsolationForest
-    import numpy as np
-
-
-class Solver(BaseSolver):
-    name = "IsolationForest"
-
-    install_cmd = "conda"
-    requirements = ["scikit-learn"]
-
-    parameters = {
-        "contamination": [5e-4, 5e-3, 5e-2, 0.1, 0.2, 0.4, 0.5],
-        "window": [True],
-        "window_size": [60, 120, 180],
-        "stride": [1],
-    }
-
-    sampling_strategy = "run_once"
-
-    def set_objective(self, X_train, y_test, X_test):
-        self.X_train = X_train
-        self.X_test, self.y_test = X_test, y_test
-        self.clf = IsolationForest(contamination=self.contamination)
-
-    def run(self, _):
-        if self.window:
-            # We need to transform the data to have a rolling window
-            if self.X_train is not None:
-                self.Xw_train = np.lib.stride_tricks.sliding_window_view(
-                    self.X_train, window_shape=self.window_size, axis=0
-                )[::self.stride].transpose(0, 2, 1)
-
-            if self.X_test is not None:
-                self.Xw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.X_test, window_shape=self.window_size, axis=0
-                )[::self.stride].transpose(0, 2, 1)
-
-            if self.y_test is not None:
-                self.yw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.y_test, window_shape=self.window_size, axis=0
-                )[::self.stride]
-
-            flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1)
-            flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1)
-
-            self.clf.fit(flatrain)
-            raw_y_hat = self.clf.predict(flatest)
-            raw_anomaly_score = self.clf.decision_function(flatest)
-
-            # The results we get has a shape of
-            result_shape = (
-                (self.X_train.shape[0] - self.window_size) // self.stride
-            ) + 1
-
-            # Mapping the binary output from {-1, 1} to {1, 0}
-            # For consistency with the other solvers
-            self.raw_y_hat = np.array(raw_y_hat)
-            self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0)
-
-            # Adding -1 for the non predicted samples
-            # The first window_size samples are not predicted by the model
-            self.raw_y_hat = np.append(
-                np.full(self.X_train.shape[0] -
-                        result_shape, -1), self.raw_y_hat
-            )
-
-            # Anomaly scores (Not used but allows finer thresholding)
-            self.raw_anomaly_score = np.array(raw_anomaly_score)
-            self.raw_anomaly_score = np.append(
-                np.full(result_shape, -1), self.raw_anomaly_score
-            )
-
-    def skip(self, X_train, X_test, y_test):
-        # Skip if dataset size is smaller than window size
-        if X_train.shape[0] < self.window_size:
-            return True, "Window size is larger than dataset size. Skipping."
-        return False, None
-
-    def get_result(self):
-        # Anomaly : 1
-        # Inlier : 0
-        # To ignore : -1
-        self.y_hat = self.raw_y_hat
-        return dict(y_hat=self.y_hat)
diff --git a/solvers/abod.py b/solvers/legacy/abod.py
similarity index 52%
rename from solvers/abod.py
rename to solvers/legacy/abod.py
index 6ff02ae..13bca75 100644
--- a/solvers/abod.py
+++ b/solvers/legacy/abod.py
@@ -1,18 +1,18 @@
 # ABOD solver
 
 from benchopt import BaseSolver
-from benchopt import safe_import_context
 
-with safe_import_context() as import_ctx:
-    from pyod.models.abod import ABOD
-    import numpy as np
+from pyod.models.abod import ABOD
+import numpy as np
+
+from benchmark_utils.predictions import cutoff_scores
 
 
 class Solver(BaseSolver):
     name = "ABOD"  # Angle-Based Outlier Detection
 
     install_cmd = "conda"
-    requirements = ["pip:pyod"]
+    requirements = ["pip::pyod"]
 
     parameters = {
         "contamination": [5e-4, 0.1, 0.2, 0.3],
@@ -20,13 +20,14 @@ class Solver(BaseSolver):
         "window": [True],
         "window_size": [20],
         "stride": [1],
+        "cutoff": [None],
     }
 
     sampling_strategy = "run_once"
 
-    def set_objective(self, X_train, y_test, X_test):
+    def set_objective(self, X_train, X_test):
         self.X_train = X_train
-        self.X_test, self.y_test = X_test, y_test
+        self.X_test = X_test
         self.clf = ABOD(
             n_neighbors=self.n_neighbors,
             contamination=self.contamination,
@@ -48,45 +49,27 @@ def run(self, _):
                     self.X_test, window_shape=self.window_size, axis=0
                 )[::self.stride].transpose(0, 2, 1)
 
-            if self.y_test is not None:
-                self.yw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.y_test, window_shape=self.window_size, axis=0
-                )[::self.stride]
-
             # Flattening the data for the model
             flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1)
             flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1)
 
             self.clf.fit(flatrain)
-
-            raw_y_hat = self.clf.predict(flatest)
-            raw_anomaly_score = self.clf.decision_function(flatest)
-
-            # The results we get has a shape of
-            result_shape = (
-                (self.X_train.shape[0] - self.window_size) // self.stride
-            ) + 1
-
-            # Mapping the binary output from {-1, 1} to {1, 0}
-            # For consistency with the other solvers
-            self.raw_y_hat = np.array(raw_y_hat)
-            self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0)
-
-            # Adding -1 for the non predicted samples
-            # The first window_size samples are not predicted by the model
-            self.raw_y_hat = np.append(
-                np.full(self.X_train.shape[0] -
-                        result_shape, -1), self.raw_y_hat
+            anomaly_scores = self.clf.decision_function(flatest)
+
+            # Anomaly scores
+            self.anomaly_scores = np.array(anomaly_scores)
+            padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0)
+            self.anomaly_scores = np.append(
+                np.full(padding, np.nan),
+                self.anomaly_scores,
             )
-
-            # Anomaly scores (Not used but allows finer thresholding)
-            self.raw_anomaly_score = np.array(raw_anomaly_score)
-            self.raw_anomaly_score = np.append(
-                np.full(result_shape, -1), self.raw_anomaly_score
+            self.anomaly_predictions = cutoff_scores(
+                self.anomaly_scores,
+                cutoff=self.cutoff,
             )
 
     # Function used to skip a solver call when n_neighbors >= window_size
-    def skip(self, X_train, X_test, y_test):
+    def skip(self, X_train, X_test):
         if self.n_neighbors >= self.window_size:
             return True, "Number of neighbors greater than number of samples."
         return False, None
@@ -95,5 +78,7 @@ def get_result(self):
         # Anomaly : 1
         # Inlier : 0
         # To ignore : -1
-        self.y_hat = self.raw_y_hat
-        return dict(y_hat=self.y_hat)
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/cblof.py b/solvers/legacy/cblof.py
similarity index 51%
rename from solvers/cblof.py
rename to solvers/legacy/cblof.py
index 3e44432..1e65a7b 100644
--- a/solvers/cblof.py
+++ b/solvers/legacy/cblof.py
@@ -1,18 +1,18 @@
 # Cluster Based Local Outlier Factor (CBLOF) solver
 
 from benchopt import BaseSolver
-from benchopt import safe_import_context
 
-with safe_import_context() as import_ctx:
-    from pyod.models.cblof import CBLOF
-    import numpy as np
+from pyod.models.cblof import CBLOF
+import numpy as np
+
+from benchmark_utils.predictions import cutoff_scores
 
 
 class Solver(BaseSolver):
     name = "CBLOF"
 
     install_cmd = "conda"
-    requirements = ["pip:pyod"]
+    requirements = ["pip::pyod"]
 
     parameters = {
         "contamination": [5e-4, 0.01, 0.02, 0.03, 0.04],
@@ -20,13 +20,14 @@ class Solver(BaseSolver):
         "n_clusters": [10],
         "window_size": [20],
         "stride": [1],
+        "cutoff": [None],
     }
 
     sampling_strategy = "run_once"
 
-    def set_objective(self, X_train, y_test, X_test):
+    def set_objective(self, X_train, X_test):
         self.X_train = X_train
-        self.X_test, self.y_test = X_test, y_test
+        self.X_test = X_test
         self.clf = CBLOF(
             contamination=self.contamination,
             n_clusters=self.n_clusters
@@ -47,44 +48,27 @@ def run(self, _):
                     self.X_test, window_shape=self.window_size, axis=0
                 )[::self.stride].transpose(0, 2, 1)
 
-            if self.y_test is not None:
-                self.yw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.y_test, window_shape=self.window_size, axis=0
-                )[::self.stride]
-
             # Flattening the data for the model
             flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1)
             flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1)
 
             self.clf.fit(flatrain)
-            raw_y_hat = self.clf.predict(flatest)
-            raw_anomaly_score = self.clf.decision_function(flatest)
-
-            # The results we get has a shape of
-            result_shape = (
-                (self.X_train.shape[0] - self.window_size) // self.stride
-            ) + 1
-
-            # Mapping the binary output from {-1, 1} to {1, 0}
-            # For consistency with the other solvers
-            self.raw_y_hat = np.array(raw_y_hat)
-            self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0)
-
-            # Adding -1 for the non predicted samples
-            # The first window_size samples are not predicted by the model
-            self.raw_y_hat = np.append(
-                np.full(self.X_train.shape[0] -
-                        result_shape, -1), self.raw_y_hat
+            anomaly_scores = self.clf.decision_function(flatest)
+
+            # Anomaly scores
+            self.anomaly_scores = np.array(anomaly_scores)
+            padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0)
+            self.anomaly_scores = np.append(
+                np.full(padding, np.nan),
+                self.anomaly_scores,
             )
-
-            # Anomaly scores (Not used but allows finer thresholding)
-            self.raw_anomaly_score = np.array(raw_anomaly_score)
-            self.raw_anomaly_score = np.append(
-                np.full(result_shape, -1), self.raw_anomaly_score
+            self.anomaly_predictions = cutoff_scores(
+                self.anomaly_scores,
+                cutoff=self.cutoff,
             )
 
     # Skipping the solver call if a condition is met
-    def skip(self, X_train, X_test, y_test):
+    def skip(self, X_train, X_test):
         if X_train.shape[0] < self.window_size:
             return True, "No enough samples to create a window"
         return False, None
@@ -93,5 +77,7 @@ def get_result(self):
         # Anomaly : 1
         # Inlier : 0
         # To ignore : -1
-        self.y_hat = self.raw_y_hat
-        return dict(y_hat=self.y_hat)
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/legacy/dif.py b/solvers/legacy/dif.py
new file mode 100644
index 0000000..36de441
--- /dev/null
+++ b/solvers/legacy/dif.py
@@ -0,0 +1,79 @@
+# Deep Isolation Forest
+from benchopt import BaseSolver
+
+from pyod.models.dif import DIF
+import numpy as np
+
+from benchmark_utils.predictions import cutoff_scores
+
+
+class Solver(BaseSolver):
+    name = "DIF"
+
+    install_cmd = "conda"
+    requirements = ["pip::pyod"]
+
+    parameters = {
+        "contamination": [0.05, 0.1, 0.2],
+        "window": [True],
+        "window_size": [20],
+        "stride": [1],
+        "cutoff": [None],
+    }
+
+    sampling_strategy = "run_once"
+
+    def set_objective(self, X_train, X_test):
+        self.X_train = X_train
+        self.X_test = X_test
+        # Device is automatically selected by the model
+        # if device=None
+        self.clf = DIF(contamination=self.contamination, device=None)
+
+    def run(self, _):
+        # Using only windowed data, parameter used only for consistency
+        if self.window:
+
+            # Transofrming the data into rolling windowed data
+            if self.X_train is not None:
+                self.Xw_train = np.lib.stride_tricks.sliding_window_view(
+                    self.X_train, window_shape=self.window_size, axis=0
+                )[::self.stride].transpose(0, 2, 1)
+
+            if self.X_test is not None:
+                self.Xw_test = np.lib.stride_tricks.sliding_window_view(
+                    self.X_test, window_shape=self.window_size, axis=0
+                )[::self.stride].transpose(0, 2, 1)
+
+            # Flattening the data for the model
+            flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1)
+            flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1)
+
+            self.clf.fit(flatrain)
+            anomaly_scores = self.clf.decision_function(flatest)
+
+            # Anomaly scores
+            self.anomaly_scores = np.array(anomaly_scores)
+            padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0)
+            self.anomaly_scores = np.append(
+                np.full(padding, np.nan),
+                self.anomaly_scores,
+            )
+            self.anomaly_predictions = cutoff_scores(
+                self.anomaly_scores,
+                cutoff=self.cutoff,
+            )
+
+    def skip(self, X_train, X_test):
+        if X_train.shape[0] < self.window_size:
+            return True, "Not enough samples to create a window"
+        return False, None
+
+    def get_result(self):
+        # Anomaly : 1
+        # Inlier : 0
+        # To ignore : -1
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/legacy/isolation-forest.py b/solvers/legacy/isolation-forest.py
new file mode 100644
index 0000000..83e8839
--- /dev/null
+++ b/solvers/legacy/isolation-forest.py
@@ -0,0 +1,106 @@
+# Isolation Forest solver
+
+from benchopt import BaseSolver
+
+from sklearn.ensemble import IsolationForest
+import numpy as np
+
+from benchmark_utils.predictions import cutoff_scores
+
+
+class Solver(BaseSolver):
+    name = "IsolationForest"
+
+    install_cmd = "conda"
+    requirements = ["scikit-learn"]
+
+    parameters = {
+        "contamination": [5e-4, 5e-3, 5e-2, 0.1, 0.2, 0.4, 0.5],
+        "window": [True],
+        "window_size": [60, 120, 180],
+        "stride": [1],
+        "cutoff": [None],
+    }
+
+    sampling_strategy = "run_once"
+
+    def set_objective(self, X_train, X_test):
+        self.X_train = X_train
+        self.X_test = X_test
+        n_recordings, n_features, n_samples = self.X_train.shape
+        self.clf = IsolationForest(contamination=self.contamination)
+
+    def run(self, _):
+        if self.window:
+            # We need to transform the data to have a rolling window
+            if self.X_train is not None:
+                # Apply sliding window along the time dimension (axis=2)
+                n_recordings, n_features, n_samples = self.X_train.shape
+                self.Xw_train = np.lib.stride_tricks.sliding_window_view(
+                    self.X_train, window_shape=self.window_size, axis=2
+                )[:, :, ::self.stride].transpose(0, 1, 3, 2)
+
+            if self.X_test is not None:
+                n_recordings, n_features, n_samples = self.X_test.shape
+                self.Xw_test = np.lib.stride_tricks.sliding_window_view(
+                    self.X_test, window_shape=self.window_size, axis=2
+                )[:, :, ::self.stride].transpose(0, 1, 3, 2)
+
+            # Flatten for sklearn
+            flatrain = self.Xw_train.reshape(
+                self.Xw_train.shape[0] * self.Xw_train.shape[1], -1)
+            flatest = self.Xw_test.reshape(
+                self.Xw_test.shape[0] * self.Xw_test.shape[1], -1)
+
+            self.clf.fit(flatrain)
+            anomaly_scores = -self.clf.decision_function(flatest)
+
+            # The results we get has a shape of
+            n_recordings, n_features, n_windows, _ = self.Xw_test.shape
+
+            # Anomaly scores
+            self.anomaly_scores = np.array(anomaly_scores)
+            self.anomaly_scores = self.anomaly_scores.reshape(
+                n_recordings, n_features, n_windows)
+        else:
+            # No windowing case
+            # Flatten the data for sklearn
+            n_recordings, n_features, n_samples = self.X_train.shape
+            X_train_flat = self.X_train.reshape(-1, n_features)
+            X_test_flat = self.X_test.reshape(-1, n_features)
+
+            self.clf.fit(X_train_flat)
+            self.anomaly_scores = -self.clf.decision_function(X_test_flat)
+
+            # Reshape to (n_recordings, n_samples) for single feature case
+            # We assume we take the first feature or average across features
+            self.anomaly_scores = self.anomaly_scores.reshape(
+                n_recordings, n_samples)
+
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
+        )
+
+    def skip(self, X_train, X_test):
+        # Skip if dataset size is smaller than window size
+        _, _, n_samples = X_train.shape
+        if n_samples < self.window_size:
+            return True, "Window size is larger than dataset size. Skipping."
+        return False, None
+
+    def get_result(self):
+        # Anomaly : 1
+        # Inlier : 0
+        # To ignore : -1
+        # For now, take the first recording
+        anomaly_scores = self.anomaly_scores[0] if (
+            self.anomaly_scores.ndim > 1
+        ) else self.anomaly_scores
+        result = dict(anomaly_scores=anomaly_scores)
+        if self.anomaly_predictions is not None:
+            anomaly_predictions = self.anomaly_predictions[0] if (
+                self.anomaly_predictions.ndim > 1
+            ) else self.anomaly_predictions
+            result["anomaly_predictions"] = anomaly_predictions
+        return result
diff --git a/solvers/lof.py b/solvers/legacy/lof.py
similarity index 53%
rename from solvers/lof.py
rename to solvers/legacy/lof.py
index 1ce2058..9075caa 100644
--- a/solvers/lof.py
+++ b/solvers/legacy/lof.py
@@ -1,11 +1,11 @@
 # Local Outlier Factor
 
 from benchopt import BaseSolver
-from benchopt import safe_import_context
 
-with safe_import_context() as import_ctx:
-    from sklearn.neighbors import LocalOutlierFactor
-    import numpy as np
+from sklearn.neighbors import LocalOutlierFactor
+import numpy as np
+
+from benchmark_utils.predictions import cutoff_scores
 
 
 class Solver(BaseSolver):
@@ -20,13 +20,14 @@ class Solver(BaseSolver):
         "window": [True],
         "window_size": [20],
         "stride": [1],
+        "cutoff": [None],
     }
 
     sampling_strategy = "run_once"
 
-    def set_objective(self, X_train, y_test, X_test):
+    def set_objective(self, X_train, X_test):
         self.X_train = X_train
-        self.X_test, self.y_test = X_test, y_test
+        self.X_test = X_test
         self.clf = LocalOutlierFactor(
             novelty=True,
             n_neighbors=self.n_neighbors,
@@ -46,42 +47,25 @@ def run(self, _):
                     self.X_test, window_shape=self.window_size, axis=0
                 )[::self.stride].transpose(0, 2, 1)
 
-            if self.y_test is not None:
-                self.yw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.y_test, window_shape=self.window_size, axis=0
-                )[::self.stride]
-
             flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1)
             flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1)
 
             self.clf.fit(flatrain)
-            raw_y_hat = self.clf.predict(flatest)
-            raw_anomaly_score = self.clf.decision_function(flatest)
-
-            # The results we get has a shape of
-            result_shape = (
-                (self.X_train.shape[0] - self.window_size) // self.stride
-            ) + 1
-
-            # Mapping the binary output from {-1, 1} to {1, 0}
-            # For consistency with the other solvers
-            self.raw_y_hat = np.array(raw_y_hat)
-            self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0)
-
-            # Adding -1 for the non predicted samples
-            # The first window_size samples are not predicted by the model
-            self.raw_y_hat = np.append(
-                np.full(self.X_train.shape[0] -
-                        result_shape, -1), self.raw_y_hat
+            anomaly_scores = -self.clf.decision_function(flatest)
+
+            # Anomaly scores
+            self.anomaly_scores = np.array(anomaly_scores)
+            padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0)
+            self.anomaly_scores = np.append(
+                np.full(padding, np.nan),
+                self.anomaly_scores,
             )
-
-            # Anomaly scores (Not used but allows finer thresholding)
-            self.raw_anomaly_score = np.array(raw_anomaly_score)
-            self.raw_anomaly_score = np.append(
-                np.full(result_shape, -1), self.raw_anomaly_score
+            self.anomaly_predictions = cutoff_scores(
+                self.anomaly_scores,
+                cutoff=self.cutoff,
             )
 
-    def skip(self, X_train, y_test, X_test):
+    def skip(self, X_train, X_test):
         if self.n_neighbors > self.window_size:
             return True, "Number of neighbors greater than number of samples."
         if self.n_neighbors > X_train.shape[0]:
@@ -94,5 +78,7 @@ def get_result(self):
         # Anomaly : 1
         # Inlier : 0
         # To ignore : -1
-        self.y_hat = self.raw_y_hat
-        return dict(y_hat=self.y_hat)
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/legacy/ocsvm.py b/solvers/legacy/ocsvm.py
new file mode 100644
index 0000000..1813763
--- /dev/null
+++ b/solvers/legacy/ocsvm.py
@@ -0,0 +1,76 @@
+from benchopt import BaseSolver
+
+from sklearn.svm import OneClassSVM
+import numpy as np
+
+from benchmark_utils.predictions import cutoff_scores
+
+
+class Solver(BaseSolver):
+    name = "OCSVM"
+
+    install_cmd = "conda"
+    requirements = ["scikit-learn"]
+
+    parameters = {
+        "nu": [0.001, 0.01, 0.05],
+        "gamma": [1e-5, 1e-2],
+        "kernel": ["rbf"],
+        "window": [True],
+        "window_size": [128],
+        "stride": [1],
+        "cutoff": [None],
+    }
+
+    sampling_strategy = "run_once"
+
+    def set_objective(self, X_train, X_test):
+        self.X_train = X_train
+        self.X_test = X_test
+        self.clf = OneClassSVM(
+            nu=self.nu,
+            kernel=self.kernel,
+            gamma=self.gamma,
+        )
+
+        if self.window:
+            if self.X_train is not None:
+                self.Xw_train = np.lib.stride_tricks.sliding_window_view(
+                    self.X_train, window_shape=self.window_size, axis=0
+                )[::self.stride].transpose(0, 2, 1)
+
+            if self.X_test is not None:
+                self.Xw_test = np.lib.stride_tricks.sliding_window_view(
+                    self.X_test, window_shape=self.window_size, axis=0
+                )[::self.stride].transpose(0, 2, 1)
+
+            self.flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1)
+            self.flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1)
+
+    def run(self, _):
+        if self.window:
+            self.clf.fit(self.flatrain)
+            anomaly_scores = -self.clf.decision_function(self.flatest)
+
+            # Anomaly scores
+            self.anomaly_scores = np.array(anomaly_scores)
+            padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0)
+            self.anomaly_scores = np.append(
+                np.full(padding, np.nan),
+                self.anomaly_scores,
+            )
+            self.anomaly_predictions = cutoff_scores(
+                self.anomaly_scores,
+                cutoff=self.cutoff,
+            )
+
+    def skip(self, X_train, X_test):
+        if X_train.shape[0] < self.window_size:
+            return True, "Window size is larger than dataset size."
+        return False, None
+
+    def get_result(self):
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/lstm.py b/solvers/lstm.py
index b3e128f..ff4a975 100644
--- a/solvers/lstm.py
+++ b/solvers/lstm.py
@@ -1,21 +1,23 @@
 # LSTM Autoencoder
-from benchopt import BaseSolver, safe_import_context
+from benchopt import BaseSolver
 
-with safe_import_context() as import_ctx:
-    import torch
-    import torch.nn as nn
-    import torch.optim as optim
-    import numpy as np
-    from torch.utils.data import DataLoader
-    from tqdm import tqdm
-    from benchmark_utils.models import AutoEncoderLSTM
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import numpy as np
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from benchmark_utils.models import AutoEncoderLSTM
+from benchmark_utils.windowing import make_windowed_dataset
+from benchmark_utils.windowing import reconstruct_from_windows
+from benchmark_utils.predictions import cutoff_scores
 
 
 class Solver(BaseSolver):
     name = "LSTM"
 
     install_cmd = "conda"
-    requirements = ["pip:torch", "tqdm"]
+    requirements = ["pytorch", "tqdm"]
 
     sampling_strategy = "run_once"
 
@@ -24,28 +26,28 @@ class Solver(BaseSolver):
         "batch_size": [32],
         "n_epochs": [50],
         "lr": [1e-5],
-        "window": [True],
         "window_size": [256],  # window_size = seq_len
         "stride": [1],
-        "percentile": [97],
+        "cutoff": [None],
         "encoder_layers": [32],
         "decoder_layers": [32],
     }
 
-    def prepare_data(self, *data):
-        # return tensors on device
-        return (torch.tensor(
-            d, dtype=torch.float32, device=self.device)
-            for d in data)
+    test_config = {
+        "embedding_dim": 2,
+        "batch_size": 1,
+        "n_epochs": 1,
+        "window_size": 16,
+    }
 
-    def set_objective(self, X_train, y_test, X_test):
+    def set_objective(self, X_train, X_test):
 
         self.device = torch.device(
             "cuda" if torch.cuda.is_available() else "cpu"
         )
 
         self.X_train = X_train
-        self.X_test, self.y_test = X_test, y_test
+        self.X_test = X_test
         self.n_features = X_train.shape[1]
         self.seq_len = self.window_size
 
@@ -59,33 +61,15 @@ def set_objective(self, X_train, y_test, X_test):
         self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
         self.criterion = nn.MSELoss()
 
-        if self.window:
-            if self.X_train is not None:
-                self.Xw_train = np.lib.stride_tricks.sliding_window_view(
-                    self.X_train, window_shape=self.window_size, axis=0
-                )[::self.stride].transpose(0, 2, 1)
-
-                self.Xw_train = torch.tensor(
-                    self.Xw_train, dtype=torch.float32
-                )
-
-            if self.X_test is not None:
-                self.Xw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.X_test, window_shape=self.window_size, axis=0
-                )[::self.stride].transpose(0, 2, 1)
-
-                self.Xw_test = torch.tensor(
-                    self.Xw_test, dtype=torch.float32
-                )
-
-            if self.y_test is not None:
-                self.yw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.y_test, window_shape=self.window_size, axis=0
-                )[::self.stride]
+        self.Xw_train = make_windowed_dataset(
+            self.X_train, window_size=self.window_size,
+            stride=self.stride
+        )
 
-                self.yw_test = torch.tensor(
-                    self.yw_test, dtype=torch.float32
-                )
+        self.Xw_test = make_windowed_dataset(
+            self.X_test, window_size=self.window_size,
+            stride=self.stride
+        )
 
         self.train_loader = DataLoader(
             self.Xw_train, batch_size=self.batch_size, shuffle=True,
@@ -105,7 +89,7 @@ def run(self, _):
         for epoch in ti:
             self.model.train()
             train_loss = 0
-            for i, x in enumerate(self.train_loader):
+            for x, in self.train_loader:
 
                 x = x.to(self.device)
 
@@ -120,38 +104,38 @@ def run(self, _):
 
             ti.set_postfix(train_loss=f"{train_loss:.5f}")
 
-        # Saving the model
-        torch.save(self.model.state_dict(), "model.pth")
-
         # Test loop
         self.model.eval()
         raw_reconstruction = []
-        for x in self.test_loader:
+        for x, in self.test_loader:
 
             x = x.to(self.device)
-
-            x_hat = self.model(x)
+            with torch.no_grad():
+                x_hat = self.model(x)
             raw_reconstruction.append(x_hat.detach().cpu().numpy())
-
-        raw_reconstruction = np.concatenate(raw_reconstruction, axis=0)
-
-        reconstructed_data = np.concatenate(
-            [raw_reconstruction[0], raw_reconstruction[1:, -1, :]], axis=0
+        reconstructed_data = np.concatenate(raw_reconstruction, axis=0)
+        reconstructed_data = reconstruct_from_windows(
+            reconstructed_data, stride=self.stride,
+            batch=len(self.X_test), n_features=self.n_features
         )
 
         reconstruction_err = np.mean(
             np.abs(self.X_test - reconstructed_data), axis=1
         )
+        self.anomaly_scores = reconstruction_err
 
-        self.y_hat = np.where(
-            reconstruction_err > np.percentile(
-                reconstruction_err, self.percentile), 1, 0
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
         )
 
-    def skip(self, X_train, X_test, y_test):
-        if X_train.shape[0] < self.window_size:
+    def skip(self, X_train, X_test):
+        if X_train.shape[-1] < self.window_size:
             return True, "Not enough samples to create a window."
         return False, None
 
     def get_result(self):
-        return dict(y_hat=self.y_hat)
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/matrixprofile.py b/solvers/matrixprofile.py
new file mode 100644
index 0000000..7b91d8d
--- /dev/null
+++ b/solvers/matrixprofile.py
@@ -0,0 +1,74 @@
+from benchopt import BaseSolver
+from sklearn.preprocessing import MinMaxScaler
+
+from benchmark_utils.predictions import cutoff_scores
+from benchmark_utils.windowing import find_period_length
+from TSB_AD.models.MatrixProfile import MatrixProfile
+
+
+class Solver(BaseSolver):
+    name = "MP"
+
+    install_cmd = "conda"
+    requirements = ["pip::tsb-ad", "scikit-learn"]
+
+    parameters = {
+        "window_size": [128, "auto"],
+        "cutoff": [None],
+    }
+
+    test_config = {
+        "dataset": {
+            "n_features": 1,
+        },
+        "window_size": 8,
+    }
+
+    sampling_strategy = "run_once"
+
+    def set_objective(self, X_train, X_test):
+        # Shapes received: (n_recordings, n_features, n_samples)
+        self.X_train = X_train
+        self.X_test = X_test
+
+        n_features = X_train.shape[1]
+
+        self.X_train = self.X_train.reshape(-1, n_features)
+        self.X_test = self.X_test.reshape(-1, n_features)
+
+        if self.window_size == "auto":
+            self.window_size = int(find_period_length(X_train.reshape(-1)))
+
+        self.clf = MatrixProfile(
+            window=self.window_size,
+        )
+
+    def run(self, _):
+        # Special solver, fitting on X_test
+        self.clf.fit(self.X_test.reshape(-1))
+        anomaly_scores = self.clf.decision_scores_
+        self.anomaly_scores = (
+            MinMaxScaler(feature_range=(0, 1))
+            .fit_transform(anomaly_scores.reshape(-1, 1))
+            .ravel()
+        )
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
+        )
+
+    def skip(self, X_train, X_test):
+        """Check if the solver can be skipped."""
+        if (find_period_length(X_train.reshape(-1)) == 0) and (
+                self.window_size == "auto"):
+            return True, "Window size is 0"
+        if X_train.shape[1] != 1:
+            return True, "Matrix Profile only supports univariate data"
+        return False, None
+
+    def get_result(self):
+        """Return the result of the solver."""
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/ocsvm.py b/solvers/ocsvm.py
deleted file mode 100644
index 268e57c..0000000
--- a/solvers/ocsvm.py
+++ /dev/null
@@ -1,88 +0,0 @@
-from benchopt import BaseSolver, safe_import_context
-
-with safe_import_context() as import_ctx:
-    from sklearn.svm import OneClassSVM
-    import numpy as np
-
-
-class Solver(BaseSolver):
-    name = "OCSVM"
-
-    install_cmd = "conda"
-    requirements = ["scikit-learn"]
-
-    parameters = {
-        "nu": [0.001, 0.01, 0.05],
-        "gamma": [1e-5, 1e-2],
-        "kernel": ["rbf"],
-        "window": [True],
-        "window_size": [128],
-        "stride": [1],
-    }
-
-    sampling_strategy = "run_once"
-
-    def set_objective(self, X_train, y_test, X_test):
-        self.X_train = X_train
-        self.X_test, self.y_test = X_test, y_test
-        self.clf = OneClassSVM(
-            nu=self.nu,
-            kernel=self.kernel,
-            gamma=self.gamma,
-        )
-
-        if self.window:
-            if self.X_train is not None:
-                self.Xw_train = np.lib.stride_tricks.sliding_window_view(
-                    self.X_train, window_shape=self.window_size, axis=0
-                )[::self.stride].transpose(0, 2, 1)
-
-            if self.X_test is not None:
-                self.Xw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.X_test, window_shape=self.window_size, axis=0
-                )[::self.stride].transpose(0, 2, 1)
-
-            if self.y_test is not None:
-                self.yw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.y_test, window_shape=self.window_size, axis=0
-                )[::self.stride]
-
-            self.flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1)
-            self.flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1)
-
-    def run(self, _):
-        if self.window:
-            self.clf.fit(self.flatrain)
-            raw_y_hat = self.clf.predict(self.flatest)
-            raw_anomaly_score = self.clf.decision_function(self.flatest)
-
-            # The results we get has a shape of
-            result_shape = (
-                (self.X_train.shape[0] - self.window_size) // self.stride
-            ) + 1
-
-            # Mapping the binary output from {-1, 1} to {1, 0}
-            # For consistency with the other solvers
-            self.raw_y_hat = np.array(raw_y_hat)
-
-            # Adding -1 for the non predicted samples
-            # The first window_size samples are not predicted by the model
-            self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0)
-            self.raw_y_hat = np.append(
-                np.full(self.X_train.shape[0] -
-                        result_shape, -1), self.raw_y_hat
-            )
-
-            # Anomaly scores (Not used but allows finer thresholding)
-            self.raw_anomaly_score = np.array(raw_anomaly_score)
-            self.raw_anomaly_score = np.append(
-                np.full(result_shape, -1), self.raw_anomaly_score
-            )
-
-    def skip(self, X_train, X_test, y_test):
-        if X_train.shape[0] < self.window_size:
-            return True, "Window size is larger than dataset size."
-        return False, None
-
-    def get_result(self):
-        return dict(y_hat=self.raw_y_hat)
diff --git a/solvers/rosecdl.py b/solvers/rosecdl.py
new file mode 100644
index 0000000..6ccf54c
--- /dev/null
+++ b/solvers/rosecdl.py
@@ -0,0 +1,100 @@
+from benchopt import BaseSolver
+
+import torch
+from benchmark_utils.predictions import cutoff_scores
+from benchmark_utils.windowing import find_period_length
+from rosecdl.rosecdl import RoseCDL
+
+
+class Solver(BaseSolver):
+    name = "RoseCDL"
+
+    install_cmd = "conda"
+    requirements = [
+        "pytorch", "pip::git+https://github.com/tommoral/rosecdl.git"
+    ]
+
+    parameters = {
+        "n_components": [1],
+        "kernel_size": ["auto"],
+        "lmbd": [0.8],
+        "scale_lmbd": [False],
+        "epochs": [70],
+        "max_batch": [None],
+        "mini_batch_size": [600],
+        "sample_window": [1_000],
+        "optimizer": ["linesearch"],
+        "n_iterations": [90],
+        "window": [False],
+        "outliers_kwargs": [
+            {
+                "method": "mad",
+                "alpha": 3.5,
+                "moving_average": None,
+                "union_channels": True,
+                "opening_window": True,
+            },
+        ],
+        "plot": [False],
+        "cutoff": [None],
+    }
+
+    sampling_strategy = "run_once"
+
+    def set_objective(self, X_train, X_test):
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")
+
+        # We receive data in shape (n_recordings, n_features, n_samples)
+        self.X_train = torch.tensor(
+            X_train, dtype=torch.float32, device=self.device)
+        self.X_test = X_test
+
+        if self.kernel_size == "auto":
+            self.kernel_size = int(find_period_length(X_train.reshape(-1)))
+
+        self.clf = RoseCDL(
+            n_components=self.n_components,
+            n_channels=X_train.shape[1],
+            kernel_size=self.kernel_size,
+            lmbd=self.lmbd,
+            scale_lmbd=self.scale_lmbd,
+            epochs=self.epochs,
+            max_batch=self.max_batch,
+            mini_batch_size=self.mini_batch_size,
+            sample_window=self.sample_window,
+            optimizer=self.optimizer,
+            n_iterations=self.n_iterations,
+            window=self.window,
+            device=self.device,
+            outliers_kwargs=self.outliers_kwargs,
+        )
+
+    def run(self, _):
+        self.clf.fit(self.X_train)
+        del self.X_train  # Free GPU memory for X_train after fitting
+
+        xh, zh = self.clf.csc(
+            torch.tensor(self.X_test, dtype=torch.float32, device=self.device)
+        )
+        err = self.clf.loss_fn.compute_patch_error(
+            X_hat=xh,
+            z_hat=zh,
+            X=torch.tensor(self.X_test, dtype=torch.float32,
+                           device=self.device),
+        )
+        err = err.cpu().detach().numpy()
+        # Aggregate errors over channels
+        self.anomaly_scores = err.sum(axis=1).reshape(-1)
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
+        )
+        del self.clf  # Free GPU memory for the model
+        torch.cuda.empty_cache()  # Release cached GPU memory
+
+    def get_result(self):
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/sktime_lof.py b/solvers/sktime_lof.py
deleted file mode 100644
index 31e1c94..0000000
--- a/solvers/sktime_lof.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from benchopt import BaseSolver
-from benchopt import safe_import_context
-
-with safe_import_context() as import_ctx:
-    from sktime.annotation.lof import SubLOF
-    import pandas as pd
-    import numpy as np
-
-
-class Solver(BaseSolver):
-    name = "SubLOF"
-
-    install_cmd = "conda"
-    requirements = ["sktime", "pandas"]
-
-    parameters = {
-        "n_neighbors": [5, 10, 20, 25, 40],
-        "window_size": [20, 64, 128],
-        "leaf_size": [30, 40],
-        "contamination": ["auto", 0.1, 0.2, 0.3],
-    }
-
-    sampling_strategy = "run_once"
-
-    def set_objective(self, X_train, y_test, X_test):
-        self.X_train = pd.DataFrame(X_train)
-        self.X_test, self.y_test = pd.DataFrame(X_test), y_test
-        self.clf = SubLOF(
-            n_neighbors=self.n_neighbors,
-            window_size=self.window_size,
-            leaf_size=self.leaf_size,
-            contamination=self.contamination,
-            n_jobs=-1,
-            novelty=True,
-        )
-
-    def run(self, _):
-        self.clf.fit(self.X_train)
-        self.raw_y_hat = self.clf.predict(self.X_test)
-        # self.raw_anomaly_score = self.clf.predict_score(self.X_test)
-
-    def skip(self, X_train, y_test, X_test):
-        if self.n_neighbors > self.window_size:
-            return True, "Number of neighbors greater than window size"
-        if self.n_neighbors > X_train.shape[0]:
-            return True, "Number of neighbors greater than number of samples"
-        if self.leaf_size > X_train.shape[0]:
-            return True, "Leaf size greater than number of samples"
-        if self.window_size > X_train.shape[0]:
-            return True, "Window size greater than number of samples"
-        return False, None
-
-    def get_result(self):
-        self.y_hat = np.zeros(self.X_test.shape[0])
-        self.y_hat[self.raw_y_hat] = 1
-        return dict(y_hat=self.y_hat)
diff --git a/solvers/tsb_chronos.py b/solvers/tsb_chronos.py
new file mode 100644
index 0000000..855c842
--- /dev/null
+++ b/solvers/tsb_chronos.py
@@ -0,0 +1,59 @@
+from benchopt import BaseSolver
+
+import torch
+import numpy as np
+from TSB_AD.models.Chronos import Chronos
+from TSB_AD.utils.slidingWindows import find_length
+
+from benchmark_utils.predictions import cutoff_scores
+
+
+class Solver(BaseSolver):
+    name = "TSB-Chronos"
+
+    install_cmd = "conda"
+    requirements = ["pip::tsb-ad"]
+
+    parameters = {
+        "win_size": ["auto"],
+        "prediction_length": [1],
+        "model_size": ['base'],
+        "batch_size": [32],
+        "cutoff": [None],
+    }
+
+    sampling_strategy = "run_once"
+
+    def set_objective(self, X_train, X_test):
+        _, n_features, _ = X_train.shape
+        self.data = np.append(X_train, X_test, axis=2)
+        self.data = self.data.reshape(-1, n_features)
+        self.X_test = X_test.reshape(-1, n_features)
+
+        if self.win_size == "auto":
+            self.win_size = int(find_length(X_train.reshape(-1)))
+
+        self.clf = Chronos(
+            win_size=self.win_size,
+            input_c=n_features,
+            prediction_length=self.prediction_length,
+            model_size=self.model_size,
+            batch_size=self.batch_size,
+        )
+
+    def run(self, _):
+        self.clf.fit(self.data)
+        self.anomaly_scores = self.clf.decision_scores_[-len(self.X_test):]
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
+        )
+
+        del self.clf  # Free memory for the model
+        torch.cuda.empty_cache()  # Release cached GPU memory
+
+    def get_result(self):
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/tsb_timesfm.py b/solvers/tsb_timesfm.py
new file mode 100644
index 0000000..cfa591b
--- /dev/null
+++ b/solvers/tsb_timesfm.py
@@ -0,0 +1,52 @@
+from benchopt import BaseSolver
+
+from importlib.util import find_spec
+
+import numpy as np
+import torch
+from TSB_AD.model_wrapper import run_TimesFM
+
+from benchmark_utils.predictions import cutoff_scores
+
+
+class Solver(BaseSolver):
+    name = "TSB-TimesFM"
+
+    install_cmd = "conda"
+    requirements = ["pip::tsb-ad", "pip::timesfm"]
+
+    parameters = {
+        "win_size": [256],
+        "cutoff": [None],
+    }
+
+    sampling_strategy = "run_once"
+
+    def set_objective(self, X_train, X_test):
+        _, n_features, _ = X_train.shape
+        self.data = np.append(X_train, X_test, axis=2)
+        self.data = self.data.reshape(-1, n_features)
+        self.X_test = X_test.reshape(-1, n_features)
+
+    def skip(self, X_train, X_test):
+        if find_spec("timesfm") is None:
+            return True, "TSB-TimesFM requires the optional timesfm package."
+        return False, None
+
+    def run(self, _):
+        anomaly_scores = run_TimesFM(
+            data=self.data,
+            win_size=self.win_size,
+        )
+        self.anomaly_scores = anomaly_scores[-len(self.X_test):]
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
+        )
+        torch.cuda.empty_cache()  # Release cached GPU memory
+
+    def get_result(self):
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/tsb_timesnet.py b/solvers/tsb_timesnet.py
new file mode 100644
index 0000000..ed431ae
--- /dev/null
+++ b/solvers/tsb_timesnet.py
@@ -0,0 +1,76 @@
+from benchopt import BaseSolver
+
+import torch
+from TSB_AD.models.TimesNet import TimesNet
+
+from benchmark_utils.predictions import cutoff_scores
+
+
+class Solver(BaseSolver):
+    name = "TSB-TimesNet"
+
+    install_cmd = "conda"
+    requirements = ["pip::tsb-ad"]
+
+    parameters = {
+        "window_size": [256],
+        "lr": [1e-4],
+        "epochs": [10],
+        "batch_size": [128],
+        "cutoff": [None],
+    }
+
+    test_config = {
+        "dataset": {
+            "n_samples": 512,
+            "n_features": 2,
+            "n_anomaly": 32,
+        },
+        "window_size": 32,
+        "epochs": 1,
+        "batch_size": 16,
+    }
+
+    sampling_strategy = "run_once"
+
+    def set_objective(self, X_train, X_test):
+        _, n_features, _ = X_train.shape
+        self.X_train = X_train.reshape(-1, n_features)
+        self.X_test = X_test.reshape(-1, n_features)
+
+        self.clf = TimesNet(
+            win_size=self.window_size,
+            enc_in=n_features,
+            epochs=self.epochs,
+            batch_size=self.batch_size,
+            lr=self.lr,
+            patience=3,
+            features="M",
+            lradj="type1",
+            validation_size=0.2,
+        )
+
+    def run(self, _):
+        self.clf.fit(self.X_train)
+        self.anomaly_scores = self.clf.decision_function(self.X_test)
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
+        )
+
+        del self.clf.model
+        del self.clf
+        torch.cuda.empty_cache()  # Release cached GPU memory
+
+    def skip(self, X_train, X_test):
+        if X_train.shape[-1] < self.window_size:
+            return True, "Not enough training samples to create a window."
+        if X_test.shape[-1] < self.window_size:
+            return True, "Not enough testing samples to create a window."
+        return False, None
+
+    def get_result(self):
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/vae.py b/solvers/vae.py
index 085f9af..9dfd400 100644
--- a/solvers/vae.py
+++ b/solvers/vae.py
@@ -1,81 +1,82 @@
-from benchopt import BaseSolver, safe_import_context
+from benchopt import BaseSolver
 
-with safe_import_context() as import_ctx:
-    from pyod.models.vae import VAE
-    import numpy as np
-    import torch
+import torch
+from pyod.models.vae import VAE
+
+from benchmark_utils.predictions import cutoff_scores
+from benchmark_utils.windowing import make_windows
 
 
 class Solver(BaseSolver):
     name = "VAE"
 
     install_cmd = "conda"
-    requirements = ["pyod", "tqdm", "pip:torch"]
+    requirements = ["pyod", "pytorch"]
 
     sampling_strategy = "run_once"
 
     parameters = {
         "contamination": [0.005, 0.05, 0.1, 0.2],
         "n_epochs": [50],
-        "window": [False],
         "window_size": [256],
         "horizon": [0],
         "stride": [1],
         "batch_size": [128],
-        "preprocessing": [True, False],
+        "preprocessing": [True],
         "latent_dim": [2, 5, 10],
-        "batch_norm": [True, False],
+        "batch_norm": [True],
         "dropout_rate": [0.1, 0.2, 0.5],
+        "cutoff": [None],
+    }
+    test_config = {
+        "n_epochs": 1,
+        "window_size": 16,
     }
 
-    def set_objective(self, X_train, y_test, X_test):
+    def set_objective(self, X_train, X_test):
 
         self.device = torch.device(
             "cuda" if torch.cuda.is_available() else "cpu"
         )
 
         self.X_train = X_train
-        self.X_test, self.y_test = X_test, y_test
-
-        self.clf = VAE(contamination=self.contamination,
-                       preprocessing=self.preprocessing,
-                       batch_size=self.batch_size,
-                       epoch_num=self.n_epochs,
-                       device=self.device,
-                       latent_dim=self.latent_dim,
-                       batch_norm=self.batch_norm,
-                       dropout_rate=self.dropout_rate,
-                       )
-
-        if self.window:
-            self.Xw_train = np.lib.stride_tricks.sliding_window_view(
-                    X_train,
-                    window_shape=self.window_size+self.horizon,
-                    axis=0
-                ).transpose(0, 2, 1)
-
-            if self.X_test is not None:
-                self.Xw_test = np.lib.stride_tricks.sliding_window_view(
-                    X_test,
-                    window_shape=self.window_size+self.horizon,
-                    axis=0
-                ).transpose(0, 2, 1)
-
-            if self.y_test is not None:
-                self.yw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.y_test, window_shape=self.window_size, axis=0
-                )[::self.stride]
-
-                self.yw_test = torch.tensor(
-                    self.yw_test, dtype=torch.float32
-                )
-        else:
-            self.Xw_train = X_train
-            self.Xw_test = X_test
+        self.X_test = X_test
+
+        self.Xw_train = make_windows(
+            X_train,
+            window_size=self.window_size,
+            stride=self.stride
+        ).reshape(-1, self.window_size * X_train.shape[1])
+
+        self.Xw_test = make_windows(
+            X_test,
+            window_size=self.window_size+self.horizon,
+            stride=self.stride,
+            padding=True
+        ).reshape(-1, self.window_size * X_train.shape[1])
+
+        self.clf = VAE(
+            contamination=self.contamination,
+            preprocessing=self.preprocessing,
+            batch_size=min(self.batch_size, len(self.Xw_train)),
+            epoch_num=self.n_epochs,
+            device=self.device,
+            latent_dim=self.latent_dim,
+            batch_norm=self.batch_norm,
+            dropout_rate=self.dropout_rate,
+            lr=1e-5
+        )
 
     def run(self, _):
         self.clf.fit(self.Xw_train)
-        self.y_pred = self.clf.predict(self.Xw_test)
+        self.anomaly_scores = self.clf.decision_function(self.Xw_test)
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
+        )
 
     def get_result(self):
-        return dict(y_hat=self.y_pred)
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/solvers/vanilla-transformer.py b/solvers/vanilla-transformer.py
index 677cfca..11f91dd 100644
--- a/solvers/vanilla-transformer.py
+++ b/solvers/vanilla-transformer.py
@@ -1,21 +1,24 @@
 # Vanilla Transformer
-from benchopt import BaseSolver, safe_import_context
-from benchmark_utils import mean_overlaping_pred
+from benchopt import BaseSolver
 
-with safe_import_context() as import_ctx:
-    import torch
-    import torch.nn as nn
-    import torch.optim as optim
-    import numpy as np
-    from tqdm import tqdm
-    from benchmark_utils.models import TransformerModel
+import numpy as np
+from tqdm import tqdm
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+
+from benchmark_utils.models import TransformerModel
+from benchmark_utils.windowing import make_windowed_dataset
+from benchmark_utils.windowing import reconstruct_from_windows
+from benchmark_utils.predictions import cutoff_scores
 
 
 class Solver(BaseSolver):
     name = "Transformer"
 
     install_cmd = "conda"
-    requirements = ["pip:torch", "tqdm"]
+    requirements = ["pytorch", "tqdm"]
 
     sampling_strategy = "run_once"
 
@@ -27,20 +30,23 @@ class Solver(BaseSolver):
         "n_epochs": [50],
         "lr": [1e-5],
         "horizon": [1],
-        "window": [True],
         "window_size": [256],
         "stride": [1],
-        "percentile": [97],
+        "cutoff": [None],
+    }
+    test_config = {
+        "n_epochs": 1,
+        "window_size": 16,
     }
 
-    def set_objective(self, X_train, y_test, X_test):
+    def set_objective(self, X_train, X_test):
 
         self.device = torch.device(
             "cuda" if torch.cuda.is_available() else "cpu"
         )
 
         self.X_train = X_train
-        self.X_test, self.y_test = X_test, y_test
+        self.X_test = X_test
 
         self.model = TransformerModel(
             n_features=X_train.shape[1],
@@ -57,30 +63,22 @@ def set_objective(self, X_train, y_test, X_test):
             self.optimizer, mode='min', factor=0.5, patience=5
         )
 
-        # Using only windowed data, parameter used only for consistency
-        if self.window:
-            if self.X_train is not None:
-                self.Xw_train = np.lib.stride_tricks.sliding_window_view(
-                    X_train,
-                    window_shape=self.window_size+self.horizon,
-                    axis=0
-                ).transpose(0, 2, 1)
-
-            if self.X_test is not None:
-                self.Xw_test = np.lib.stride_tricks.sliding_window_view(
-                    X_test,
-                    window_shape=self.window_size+self.horizon,
-                    axis=0
-                ).transpose(0, 2, 1)
-
-            if self.y_test is not None:
-                self.yw_test = np.lib.stride_tricks.sliding_window_view(
-                    self.y_test, window_shape=self.window_size, axis=0
-                )[::self.stride]
-
-                self.yw_test = torch.tensor(
-                    self.yw_test, dtype=torch.float32
-                )
+        self.Xw_train = make_windowed_dataset(
+            X_train,
+            window_size=self.window_size+self.horizon,
+            stride=self.stride
+        )
+        self.Xw_test = make_windowed_dataset(
+            X_test,
+            window_size=self.window_size+self.horizon,
+            stride=self.stride
+        )
+        self.train_loader = DataLoader(
+            self.Xw_train, batch_size=self.batch_size, shuffle=True,
+        )
+        self.test_loader = DataLoader(
+            self.Xw_test, batch_size=self.batch_size, shuffle=False,
+        )
 
     def run(self, _):
         self.model.to(self.device)
@@ -96,13 +94,10 @@ def run(self, _):
         for epoch in ti:
             self.model.train()
             total_loss = 0
-            for i in range(0, len(self.Xw_train), self.batch_size):
-                x = torch.tensor(
-                    self.Xw_train[i:i+self.batch_size, :self.window_size, :],
-                    dtype=torch.float32).to(self.device)
-                y = torch.tensor(
-                    self.Xw_train[i:i+self.batch_size, -self.horizon:, :],
-                    dtype=torch.float32).to(self.device)
+            for x, in self.train_loader:
+                x = x.to(self.device)
+                y = x[:, -self.horizon:]
+                x = x[:, :-self.horizon]
 
                 self.optimizer.zero_grad()
                 output = self.model(x)
@@ -117,7 +112,9 @@ def run(self, _):
                 total_loss += loss.item()
 
                 avg_loss = total_loss / (len(self.Xw_train) // self.batch_size)
-                ti.set_description(f"Epoch {epoch} (loss={avg_loss:.5e})")
+                ti.set_description(
+                    f"Epoch {epoch} (loss={avg_loss:.5e})"
+                )
 
                 # Learning rate scheduling
                 self.scheduler.step(avg_loss)
@@ -126,7 +123,6 @@ def run(self, _):
                 if avg_loss < best_loss:
                     best_loss = avg_loss
                     no_improve = 0
-                    torch.save(self.model.state_dict(), 'best_model.pth')
                 else:
                     no_improve += 1
                     if no_improve == patience:
@@ -134,52 +130,48 @@ def run(self, _):
 
         # Test loop
         self.model.eval()
-        batch_size = 1024
         all_predictions = []
 
         with torch.no_grad():
-            for i in range(0, len(self.Xw_test), batch_size):
-                batch = torch.tensor(
-                    self.Xw_test[i:i+batch_size, :self.window_size, :],
-                    dtype=torch.float32
-                ).to(self.device)
-
-                batch_predictions = self.model(batch)
-
-                if batch_predictions.is_cuda:
-                    batch_predictions = batch_predictions.cpu().numpy()
-                else:
-                    batch_predictions = batch_predictions.numpy()
-
-                all_predictions.append(batch_predictions)
+            for x, in self.test_loader:
+                batch = x[:, :self.window_size].to(self.device)
+                with torch.no_grad():
+                    batch_predictions = self.model(batch)
+                all_predictions.append(batch_predictions.cpu().numpy())
 
         xw_hat = np.concatenate(all_predictions, axis=0)
 
         # Continue with the rest of your code for reconstructing predictions
         x_hat = np.zeros_like(self.X_test) - 1
-        x_hat[self.window_size:self.window_size+self.horizon] = xw_hat[0]
-        x_hat[self.window_size+self.horizon:] = mean_overlaping_pred(
-            xw_hat, 1)
-
-        # Calculating the percentile value for the threshold
-        percentile_value = np.percentile(
-            np.abs(self.X_test[self.window_size:] - x_hat[self.window_size:]),
-            self.percentile
+        x_hat[..., self.window_size:] = reconstruct_from_windows(
+            xw_hat, stride=self.stride, batch=len(self.X_test),
+            n_features=self.X_test.shape[1]
         )
 
-        # Thresholding
-        predictions = np.zeros_like(x_hat)-1
-        predictions[self.window_size:] = np.where(
-            np.abs(self.X_test[self.window_size:] -
-                   x_hat[self.window_size:]) > percentile_value, 1, 0
+        reconstruction_err = np.abs(
+            self.X_test[..., self.window_size:] - x_hat[..., self.window_size:]
+        )
+        self.anomaly_scores = np.full(
+            self.X_test.shape[:1] + self.X_test.shape[2:],
+            np.nan,
+            dtype=float,
+        )
+        self.anomaly_scores[..., self.window_size:] = np.max(
+            reconstruction_err, axis=1
         )
 
-        self.predictions = np.max(predictions, axis=1)
+        self.anomaly_predictions = cutoff_scores(
+            self.anomaly_scores,
+            cutoff=self.cutoff,
+        )
 
-    def skip(self, X_train, X_test, y_test):
-        if X_train.shape[0] < self.window_size + self.horizon:
+    def skip(self, X_train, X_test):
+        if X_train.shape[-1] < self.window_size + self.horizon:
             return True, "No enough training samples"
         return False, None
 
     def get_result(self):
-        return dict(y_hat=self.predictions)
+        result = dict(anomaly_scores=self.anomaly_scores)
+        if self.anomaly_predictions is not None:
+            result["anomaly_predictions"] = self.anomaly_predictions
+        return result
diff --git a/test_config.py b/test_config.py
index 92e34d5..3606a74 100644
--- a/test_config.py
+++ b/test_config.py
@@ -1,21 +1,41 @@
 import sys  # noqa: F401
+from importlib.util import find_spec
 
 import pytest  # noqa: F401
 
 from benchopt.utils.sys_info import get_cuda_version
 
 
-def check_test_solver_install(solver_class):
+OPTIONAL_BACKEND_INSTALL_XFAILS = {
+    "dagmm": "DAGMM depends on the optional salesforce-merlion package.",
+    "mp": "MP depends on the optional TSB-AD package.",
+    "rosecdl": "RoseCDL depends on an optional GitHub package.",
+    "tsb-chronos": "TSB-Chronos depends on the optional TSB-AD backend.",
+    "tsb-timesfm": "TSB-TimesFM depends on TSB-AD and timesfm.",
+    "tsb-timesnet": "TSB-TimesNet depends on the optional TSB-AD backend.",
+}
+
+
+def check_test_solver_install(benchmark, solver_class):
     """Hook called in `test_solver_install`.
 
     If one solver needs to be skip/xfailed on some
     particular architecture, call pytest.xfail when
     detecting the situation.
     """
-    if solver_class.name.lower() == "dif":
+    solver_name = solver_class.name.lower()
+
+    if solver_name in OPTIONAL_BACKEND_INSTALL_XFAILS:
+        pytest.xfail(OPTIONAL_BACKEND_INSTALL_XFAILS[solver_name])
+
+    if solver_name == "dif":
         if get_cuda_version() is None:
             pytest.xfail("Deep IsolationForest needs a working GPU hardware.")
 
+    if solver_name == "anomalybert":
+        pytest.xfail("AnomalyBERT needs to be installed locally from repo"
+                     " at https://github.com/Jhryu30/AnomalyBERT.git")
+
     # if solver_class.name.lower() == "lstm":
     #     if get_cuda_version() is None:
     #         pytest.xfail("LSTM needs a working GPU hardware.")
@@ -23,3 +43,22 @@ def check_test_solver_install(solver_class):
     # if solver_class.name.lower() == "transformer":
     #     if get_cuda_version() is None:
     #         pytest.xfail("Transformer needs a working GPU hardware.")
+
+
+def check_test_solver_run(benchmark, solver_class):
+    """Hook called in `test_solver_run`."""
+    if solver_class.name.lower() == "tsb-timesfm":
+        if find_spec("timesfm") is None:
+            pytest.xfail(
+                "TSB-TimesFM needs the optional timesfm package."
+            )
+
+
+def check_test_dataset_get_data(benchmark, dataset_class):
+    if dataset_class.name.lower() in [
+        "daphnet", "dodgers", "ecg", "genesis", "ghl",
+        "iops", "kdd21", "mgab", "mitdb", "nab",
+        "occupancy", "opportunity", "sensorscope", "smd",
+        "svdb", "yahoo"
+    ]:
+        pytest.xfail(f"{dataset_class.name} dataset is not downloaded.")
diff --git a/tests/test_mean_overlaping_pred.py b/tests/test_mean_overlaping_pred.py
new file mode 100644
index 0000000..4189fa4
--- /dev/null
+++ b/tests/test_mean_overlaping_pred.py
@@ -0,0 +1,47 @@
+import numpy as np
+
+from benchmark_utils import mean_overlaping_pred
+
+
+def test_length_horizon_one_stride_one():
+    # 5 windows, horizon=1, stride=1 → reconstructed signal length is 5
+    preds = np.arange(5).reshape(5, 1, 1).astype(float)
+    out = mean_overlaping_pred(preds, stride=1)
+    assert out.shape == (5, 1)
+    assert np.allclose(out.ravel(), np.arange(5))
+
+
+def test_length_horizon_gt_one():
+    # 4 windows, H=3, stride=1 → (4-1)*1 + 3 = 6 positions
+    preds = np.ones((4, 3, 2))
+    out = mean_overlaping_pred(preds, stride=1)
+    assert out.shape == (6, 2)
+    # every position covered, averaged value is 1.0
+    assert np.allclose(out, 1.0)
+
+
+def test_overlap_averages_correctly():
+    # H=2, stride=1, 3 windows. Index 1 is covered by windows 0 and 1,
+    # index 2 by windows 1 and 2.
+    preds = np.array(
+        [[[1.0], [2.0]],
+         [[3.0], [4.0]],
+         [[5.0], [6.0]]]
+    )
+    out = mean_overlaping_pred(preds, stride=1)
+    # positions: 0 -> 1, 1 -> mean(2, 3) = 2.5, 2 -> mean(4, 5) = 4.5, 3 -> 6
+    assert out.shape == (4, 1)
+    assert np.allclose(out.ravel(), [1.0, 2.5, 4.5, 6.0])
+
+
+def test_stride_gt_one_no_overlap():
+    # H=2, stride=2 → windows tile end-to-end
+    preds = np.array(
+        [[[1.0], [2.0]],
+         [[3.0], [4.0]],
+         [[5.0], [6.0]]]
+    )
+    out = mean_overlaping_pred(preds, stride=2)
+    # (3-1)*2 + 2 = 6 positions, no overlap
+    assert out.shape == (6, 1)
+    assert np.allclose(out.ravel(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
diff --git a/tests/test_objective.py b/tests/test_objective.py
new file mode 100644
index 0000000..6a50afc
--- /dev/null
+++ b/tests/test_objective.py
@@ -0,0 +1,110 @@
+import numpy as np
+import pytest
+
+from objective import Objective
+
+
+def make_objective(score_metrics=("auc_pr", "auc_roc"),
+                   prediction_metrics=None):
+    objective = Objective()
+    objective.score_metrics = score_metrics
+    objective.prediction_metrics = prediction_metrics
+    objective.set_data(
+        X_train=np.empty((1, 1, 6)),
+        y_test=np.array([0, 0, 1, 0, 1, 0]),
+        X_test=np.empty((1, 1, 6)),
+    )
+    return objective
+
+
+def test_default_evaluation_uses_score_metrics_only():
+    objective = make_objective()
+    scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2])
+
+    result = objective.evaluate_result(anomaly_scores=scores)
+
+    assert result["auc_pr"] == pytest.approx(1.0)
+    assert result["auc_roc"] == pytest.approx(1.0)
+    assert result["value"] == pytest.approx(0.0)
+    assert "precision" not in result
+
+
+def test_score_and_prediction_metrics_use_canonical_keys():
+    objective = make_objective(
+        score_metrics=("auc_pr",),
+        prediction_metrics=("precision",),
+    )
+    scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2])
+    predictions = np.array([0, 0, 1, 0, 1, 0])
+
+    result = objective.evaluate_result(
+        anomaly_scores=scores,
+        anomaly_predictions=predictions,
+    )
+
+    assert result["auc_pr"] == pytest.approx(1.0)
+    assert result["precision"] == pytest.approx(1.0)
+
+
+def test_prediction_metrics_are_opt_in():
+    objective = make_objective(
+        prediction_metrics=("precision", "recall", "f1", "zoloss"),
+    )
+    scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2])
+    predictions = np.array([0, 0, 1, 0, 1, 0])
+
+    result = objective.evaluate_result(
+        anomaly_scores=scores,
+        anomaly_predictions=predictions,
+    )
+
+    assert result["precision"] == pytest.approx(1.0)
+    assert result["recall"] == pytest.approx(1.0)
+    assert result["f1"] == pytest.approx(1.0)
+    assert result["zoloss"] == pytest.approx(0.0)
+
+
+def test_prediction_metrics_require_prediction_array():
+    objective = make_objective(prediction_metrics=("precision",))
+    scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2])
+
+    with pytest.raises(ValueError, match="anomaly_predictions"):
+        objective.evaluate_result(anomaly_scores=scores)
+
+
+def test_nan_score_padding_is_masked():
+    objective = make_objective()
+    scores = np.array([np.nan, 0.2, 0.9, 0.1, 0.8, 0.2])
+
+    result = objective.evaluate_result(anomaly_scores=scores)
+
+    assert result["auc_pr"] == pytest.approx(1.0)
+    assert result["auc_roc"] == pytest.approx(1.0)
+
+
+def test_prediction_padding_is_masked():
+    objective = make_objective(
+        score_metrics=None,
+        prediction_metrics=("precision", "recall", "f1"),
+    )
+    predictions = np.array([-1, 0, 1, 0, 1, 0])
+
+    result = objective.evaluate_result(anomaly_predictions=predictions)
+
+    assert result["precision"] == pytest.approx(1.0)
+    assert result["recall"] == pytest.approx(1.0)
+    assert result["f1"] == pytest.approx(1.0)
+    assert result["value"] == pytest.approx(0.0)
+
+
+def test_prediction_only_metrics_without_primary_value_fallback_to_zero():
+    objective = make_objective(
+        score_metrics=None,
+        prediction_metrics=("precision",),
+    )
+    predictions = np.array([0, 0, 1, 0, 1, 0])
+
+    result = objective.evaluate_result(anomaly_predictions=predictions)
+
+    assert result["precision"] == pytest.approx(1.0)
+    assert result["value"] == pytest.approx(0.0)
diff --git a/tests/test_predictions.py b/tests/test_predictions.py
new file mode 100644
index 0000000..1e8f9b7
--- /dev/null
+++ b/tests/test_predictions.py
@@ -0,0 +1,33 @@
+import numpy as np
+import pytest
+
+from benchmark_utils.predictions import cutoff_scores
+
+
+def test_cutoff_scores_returns_none_without_cutoff():
+    scores = np.array([0.1, 0.8, 0.2])
+
+    assert cutoff_scores(scores) is None
+
+
+def test_cutoff_scores_uses_top_score_fraction():
+    scores = np.array([0.1, 0.8, 0.2, 0.9])
+
+    predictions = cutoff_scores(scores, cutoff=0.25)
+
+    np.testing.assert_array_equal(predictions, np.array([0, 0, 0, 1]))
+
+
+def test_cutoff_scores_preserves_nan_padding_as_ignore_label():
+    scores = np.array([np.nan, 0.1, 0.8, 0.2, 0.9])
+
+    predictions = cutoff_scores(scores, cutoff=0.25)
+
+    np.testing.assert_array_equal(predictions, np.array([-1, 0, 0, 0, 1]))
+
+
+def test_cutoff_scores_rejects_invalid_cutoff():
+    scores = np.array([0.1, 0.8, 0.2])
+
+    with pytest.raises(ValueError, match="must be in"):
+        cutoff_scores(scores, cutoff=1)