From bb6d8d81be2f883146dcb6a182effc48f2bd9b57 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 10:06:54 +0300 Subject: [PATCH 01/50] Add ECG, MGAB, and MITDB datasets from the tsb-uad benchmark --- datasets/ecg.py | 104 ++++++++++++++++++++++++++++++++++++++ datasets/mgab.py | 116 ++++++++++++++++++++++++++++++++++++++++++ datasets/mitdb.py | 126 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 346 insertions(+) create mode 100644 datasets/ecg.py create mode 100644 datasets/mgab.py create mode 100644 datasets/mitdb.py diff --git a/datasets/ecg.py b/datasets/ecg.py new file mode 100644 index 0000000..20ea4fc --- /dev/null +++ b/datasets/ecg.py @@ -0,0 +1,104 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("ECG") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) - the time series data + - y_true: numpy array of shape (num_records, num_samples) - the labels + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files + record_files = list(db_path.glob("*.out")) + record_ids = [f.stem for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + record_file = db_path / f"MBA_ECG14046_data_{record_id}.out" + if record_file.exists(): + # Load the record data + record_data = pd.read_csv(record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record {record_id}") + else: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append(np.pad(data, (0, max_length - len(data)), mode='constant', constant_values=data[-1])) + padded_labels.append(np.pad(labels, (0, max_length - len(labels)), mode='constant', constant_values=0)) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + + +class Dataset(BaseDataset): + name = "ECG" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the MITDB dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + size = 5000 + X_train = X_train[:, :size] + X_test = X_test[:, :size] + y_test = y_test[:, :size] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) \ No newline at end of file diff --git a/datasets/mgab.py b/datasets/mgab.py new file mode 100644 index 0000000..e4004df --- /dev/null +++ b/datasets/mgab.py @@ -0,0 +1,116 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("MGAB") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) - the time series data + - y_true: numpy array of shape (num_records, num_samples) - the labels + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files + record_files = list(db_path.glob("*.test.out")) + record_ids = [f.name.split(".")[0] for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + record_file = db_path / f"{record_id}.test.out" + if record_file.exists(): + # Load the record data + record_data = pd.read_csv(record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record {record_id}") + else: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "MGAB" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the MITDB dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) \ No newline at end of file diff --git a/datasets/mitdb.py b/datasets/mitdb.py new file mode 100644 index 0000000..66e9515 --- /dev/null +++ b/datasets/mitdb.py @@ -0,0 +1,126 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("MITDB") + + +def load_mitdb_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) - the time series data + - y_true: numpy array of shape (num_records, num_samples) - the labels + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with the format like 100.test.csv@1.out + record_files = list(db_path.glob("*.out")) + record_ids = [f.name for f in record_files] + + print(f"Loading records: {record_ids}") + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Find file starting with record_id and ending with .out + record_files = list(db_path.glob(f"{record_id}*.out")) + if record_files: + if len(record_files) > 1: + print( + f"Multiple files found for record ID {record_id}, using the first one: {record_files[0]}" + ) + record_file = record_files[0] + # Load the record data + record_data = pd.read_csv(record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + print(f"Loaded record {record_id} with shape {record_data.shape}") + if record_data.shape[1] >= 2: + print(f"Record {record_id} has sufficient columns") + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record {record_id}") + else: + print(f"Record file not found for ID: {record_id}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "MITDB" + + parameters = { + "recordings_id": [["100", "201"], ["100"]], + "debug": [False], + } + + def get_data(self): + """Load the MITDB dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_mitdb_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, : int(X.shape[1] * 0.1)] + + if self.debug: + print("Debug mode: limiting data to 1000 samples") + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + print( + f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_test shape: {y_test.shape}" + ) + return dict(X_train=X_train, y_test=y_test, X_test=X_test) From 95deadb7c3cbbfc3b790b9f45a4d73df3c595d0c Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 10:10:45 +0300 Subject: [PATCH 02/50] FIX reshape y_test and y_hat in evaluate_result method; update get_objective to actually return y_test --- objective.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/objective.py b/objective.py index 3d34e42..7af1f4c 100644 --- a/objective.py +++ b/objective.py @@ -38,8 +38,8 @@ def set_data(self, X_train, y_test, X_test): def evaluate_result(self, y_hat): """Evaluate the result provided by the solver.""" to_discard = (y_hat == -1).sum() - self.y_test = self.y_test[to_discard:] - y_hat = y_hat[to_discard:] + self.y_test = self.y_test[to_discard:].reshape(-1) + y_hat = y_hat[to_discard:].reshape(-1) result = {} detection_ranges = [1, 3, 5, 10, 20] @@ -98,5 +98,5 @@ def evaluate_result(self, y_hat): def get_objective(self): return dict( - X_train=self.X_train, y_test=None, X_test=self.X_test + X_train=self.X_train, y_test=self.y_test, X_test=self.X_test ) From 3e53d190a6769ee9955aab535f0420daab3309d3 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 10:16:33 +0300 Subject: [PATCH 03/50] ADD implement Matrix Profile solver --- solvers/matrixprofile.py | 66 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 solvers/matrixprofile.py diff --git a/solvers/matrixprofile.py b/solvers/matrixprofile.py new file mode 100644 index 0000000..94f0af5 --- /dev/null +++ b/solvers/matrixprofile.py @@ -0,0 +1,66 @@ +from benchopt import BaseSolver, safe_import_context +from sklearn.preprocessing import MinMaxScaler + +with safe_import_context() as import_ctx: + from TSB_UAD.models.matrix_profile import MatrixProfile + from TSB_UAD.utils.slidingWindows import find_length + import math + import numpy as np + + +class Solver(BaseSolver): + name = "MP" + + install_cmd = "conda" + requirements = ["pip:tsb-uad"] + + parameters = { + "window_size": [10, "auto"], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + self.X_train = X_train + self.X_test, self.y_test = X_test, y_test + + if self.window_size == "auto": + self.window_size = find_length(X_train) + self.clf = MatrixProfile( + window=self.window_size, + ) + + def run(self, _): + print("Running Matrix Profile solver...") + # Special solver, fitting on X_test + self.clf.fit(self.X_test.reshape(-1)) + self.scores = self.clf.decision_scores_ + score = ( + MinMaxScaler(feature_range=(0, 1)) + .fit_transform(self.scores.reshape(-1, 1)) + .ravel() + ) + + print("MP Fitted") + + self.score = np.array( + [score[0]] * math.ceil((self.window_size - 1) / 2) + + list(score) + + [score[-1]] * ((self.window_size - 1) // 2) + ) + + print("MP Scored") + print(f"Score shape: {score.shape}") + + def skip(self, X_train, y_test, X_test): + """Check if the solver can be skipped.""" + if find_length(X_train) == 0 and self.window_size == "auto": + return True, "Window size is 0" + return False, None + + def get_result(self): + """Return the result of the solver.""" + # Binarizing the scores to 0 and 1 + # TEMPORARY SOLUTION + self.final_score = np.where(self.score > 0.75, 1, 0) + return dict(y_hat=self.final_score) From 4952aca9c090541c6a89ed2410118879609b875f Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 10:16:47 +0300 Subject: [PATCH 04/50] ADD implement TSB-OCSVM --- solvers/tsb_ocsvm.py | 75 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 solvers/tsb_ocsvm.py diff --git a/solvers/tsb_ocsvm.py b/solvers/tsb_ocsvm.py new file mode 100644 index 0000000..b06371e --- /dev/null +++ b/solvers/tsb_ocsvm.py @@ -0,0 +1,75 @@ +from benchopt import BaseSolver, safe_import_context +from sklearn.preprocessing import MinMaxScaler + +with safe_import_context() as import_ctx: + from TSB_UAD.models.ocsvm import OCSVM + from TSB_UAD.models.feature import Window + from TSB_UAD.utils.slidingWindows import find_length + import math + import numpy as np + + +class Solver(BaseSolver): + name = "TSB-OCVSM" + + install_cmd = "conda" + requirements = ["pip:tsb-uad"] + + parameters = { + "window_size": [10, "auto"], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + if self.window_size == "auto": + self.window_size = find_length(X_train) + + X_train = X_train.reshape(-1) + X_test = X_test.reshape(-1) + + X_train = Window(window=self.window_size).convert(X_train).to_numpy() + X_test = Window(window=self.window_size).convert(X_test).to_numpy() + + self.X_train = MinMaxScaler(feature_range=(0, 1)).fit_transform(X_train.T).T + self.X_test = MinMaxScaler(feature_range=(0, 1)).fit_transform(X_test.T).T + + self.y_test = y_test.reshape(-1) + + self.clf = OCSVM(nu=0.05, max_iter=200) + + def run(self, _): + print("Running OCSVM solver...") + # Special solver, fitting on X_test + self.clf.fit(self.X_train, self.X_test) + score = self.clf.decision_scores_ + + print("OCSVM Fitted") + + score = np.array( + [score[0]] * math.ceil((self.window_size - 1) / 2) + + list(score) + + [score[-1]] * ((self.window_size - 1) // 2) + ) + + self.score = ( + MinMaxScaler(feature_range=(0, 1)) + .fit_transform(score.reshape(-1, 1)) + .ravel() + ) + + print("MP Scored") + print(f"Score shape: {score.shape}") + + def skip(self, X_train, y_test, X_test): + """Check if the solver can be skipped.""" + if find_length(X_train) == 0 and self.window_size == "auto": + return True, "Window size is 0" + return False, None + + def get_result(self): + """Return the result of the solver.""" + # Binarizing the scores to 0 and 1 + # TEMPORARY SOLUTION + self.final_score = np.where(self.score > 0.75, 1, 0) + return dict(y_hat=self.final_score) From 3a0ab4cac95a7b876ebe314e79d54d459ec0af01 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 10:53:13 +0300 Subject: [PATCH 05/50] ADD Autoencoder for univariate data (inspired from tsb-uad) --- benchmark_utils/models.py | 220 ++++++++++++++++++++++++++++++++++++++ solvers/autoencoder.py | 70 ++++++++++++ 2 files changed, 290 insertions(+) create mode 100644 solvers/autoencoder.py diff --git a/benchmark_utils/models.py b/benchmark_utils/models.py index 7bcf100..e423f67 100644 --- a/benchmark_utils/models.py +++ b/benchmark_utils/models.py @@ -1,4 +1,10 @@ from torch import nn +from sklearn.preprocessing import MinMaxScaler +import torch +import torch.optim as optim +from torch.utils.data import DataLoader, Dataset +import numpy as np +from tqdm import tqdm class ARModel(nn.Module): @@ -122,3 +128,217 @@ def forward(self, x): x, (_, _) = self.decoder(x) return x + + +class SlidingWindowDataset(Dataset): + def __init__(self, data, window_size): + self.data = data + self.window_size = window_size + + def __len__(self): + return len(self.data) - self.window_size + 1 + + def __getitem__(self, idx): + window = self.data[idx:idx + self.window_size] + return window # Input and target are the same for autoencoder + + +class Autoencoder(nn.Module): + def __init__(self, input_size=32, hidden_size=32, latent_size=16, sliding_window=10): + super(Autoencoder, self).__init__() + + self.sliding_window = sliding_window + self.decision_scores_ = None + + # Encoder + self.encoder = nn.Sequential( + nn.Linear(input_size, hidden_size), + nn.ReLU(), + nn.BatchNorm1d(hidden_size), + nn.Linear(hidden_size, latent_size), + nn.ReLU(), + nn.BatchNorm1d(latent_size), + ) + + # Decoder + self.decoder = nn.Sequential( + nn.Linear(latent_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, input_size), + nn.ReLU(), + ) + + def forward(self, x): + # Flatten input if needed + x = x.view(x.size(0), -1) + + # Encode + encoded = self.encoder(x) + + # Decode + decoded = self.decoder(encoded) + + return decoded + + def encode(self, x): + x = x.view(x.size(0), -1) + return self.encoder(x) + + def _create_sliding_windows(self, X): + """Create sliding windows from input data""" + if isinstance(X, np.ndarray): + X = torch.from_numpy(X).float() + + # If X is 1D, reshape to 2D + if X.dim() == 1: + X = X.unsqueeze(1) + + windows = [] + for i in range(len(X) - self.sliding_window + 1): + window = X[i:i + self.sliding_window].flatten() + windows.append(window) + + return torch.stack(windows) + + def fit(self, X, num_epochs=50, learning_rate=1e-3, device="cuda", batch_size=32): + """ + Train the autoencoder on the provided data. + + Args: + X: Input data as tensor or numpy array shape (n_samples, n_features) + num_epochs: Number of training epochs + learning_rate: Learning rate for optimizer + device: Device to train on ('cuda' or 'cpu') + batch_size: Batch size for training + + Returns: + List of training losses per epoch + """ + # Convert to tensor if numpy array + if isinstance(X, np.ndarray): + X = torch.from_numpy(X).float() + + # Ensure X is 2D + if X.dim() == 1: + X = X.unsqueeze(1) + if X.dim() == 3: + # (n_samples, n_timesteps, n_features) + X = X.view(-1, 1) + + # Create sliding windows + windowed_data = self._create_sliding_windows(X) + + # Create dataset and dataloader + dataset = SlidingWindowDataset(windowed_data, window_size=1) # window_size=1 since we already created windows + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True) + + self.to(device) + criterion = nn.MSELoss() + optimizer = optim.Adam(self.parameters(), lr=learning_rate) + + self.train() + losses = [] + + # Progress bar for epochs + epoch_pbar = tqdm(range(num_epochs), desc="Training", unit="epoch") + + for epoch in epoch_pbar: + epoch_loss = 0.0 + + # Progress bar for batches + batch_pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False) + + for batch_idx, (data) in enumerate(batch_pbar): + data = data.to(device) + + # Forward pass + output = self(data) + loss = criterion(output, data) + + # Backward pass + optimizer.zero_grad() + loss.backward() + optimizer.step() + + epoch_loss += loss.item() + + # Update batch progress bar + batch_pbar.set_postfix({"Batch Loss": f"{loss.item():.4f}"}) + + avg_loss = epoch_loss / len(dataloader) + losses.append(avg_loss) + + # Update epoch progress bar + epoch_pbar.set_postfix({"Avg Loss": f"{avg_loss:.4f}"}) + + return losses + + def predict(self, X_test, X_dirty=None, device="cuda"): + """ + Predict anomaly scores for time series data. + + Args: + X_test: Test data for reconstruction + X_dirty: Original dirty data (if None, uses X_test) + device: Device to run inference on + + Returns: + Reconstructed data and sets decision_scores_ attribute + """ + self.eval() + self.to(device) + + # Create sliding windows for test data + if isinstance(X_test, np.ndarray): + X_test = torch.from_numpy(X_test).float() + + windowed_test = self._create_sliding_windows(X_test) + windowed_test = windowed_test.to(device) + + with torch.no_grad(): + test_predict = self(windowed_test).cpu().numpy() + + # Calculate MAE loss + test_mae_loss = np.mean(np.abs(test_predict - windowed_test.cpu().numpy()), axis=1) + + # Normalize MAE loss + nor_test_mae_loss = MinMaxScaler().fit_transform(test_mae_loss.reshape(-1, 1)).ravel() + + # Use X_dirty if provided, otherwise use original X_test + if X_dirty is None: + X_dirty = X_test.cpu().numpy() if isinstance(X_test, torch.Tensor) else X_test + + # Initialize score array + score = np.zeros(len(X_dirty)) + + # Fill the score array with sliding window approach + score[self.sliding_window // 2:self.sliding_window // 2 + len(test_mae_loss)] = nor_test_mae_loss + score[:self.sliding_window // 2] = nor_test_mae_loss[0] + score[self.sliding_window // 2 + len(test_mae_loss):] = nor_test_mae_loss[-1] + + # Store decision scores + self.decision_scores_ = score + + return test_predict + + def encode_data(self, x, device="cuda"): + """ + Encode input data to latent representation. + + Args: + x: Input tensor or numpy array + device: Device to run inference on + + Returns: + Encoded data as numpy array + """ + self.eval() + self.to(device) + + # Convert to tensor if numpy array + if isinstance(x, np.ndarray): + x = torch.from_numpy(x).float() + x = x.to(device) + with torch.no_grad(): + encoded = self.encode(x) + return encoded.cpu().numpy() diff --git a/solvers/autoencoder.py b/solvers/autoencoder.py new file mode 100644 index 0000000..3e8d000 --- /dev/null +++ b/solvers/autoencoder.py @@ -0,0 +1,70 @@ +from benchopt import BaseSolver, safe_import_context +from sklearn.preprocessing import MinMaxScaler + +with safe_import_context() as import_ctx: + from benchmark_utils.models import Autoencoder + from TSB_UAD.utils.slidingWindows import find_length + import numpy as np + + +class Solver(BaseSolver): + name = "AE" + + install_cmd = "conda" + requirements = ["pip:tsb-uad"] + + parameters = { + "window_size": [10, "auto"], + "num_epochs": [100], + "batch_size": [128], + "learning_rate": [1e-3], + "hidden_size": [64], + "latent_size": [32], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + if self.window_size == "auto": + self.window_size = find_length(X_train) + + self.X_train = X_train.reshape(-1) + self.X_test = X_test.reshape(-1) + self.y_test = y_test + + self.clf = Autoencoder( + input_size=self.window_size, + sliding_window=self.window_size, + latent_size=self.latent_size, + hidden_size=self.hidden_size, + ) + + def run(self, _): + self.clf.fit( + self.X_train, + num_epochs=self.num_epochs, + batch_size=self.batch_size, + learning_rate=self.learning_rate, + ) + + self.clf.predict(self.X_test.reshape(-1, 1)) + score = self.clf.decision_scores_ + + self.score = ( + MinMaxScaler(feature_range=(0, 1)) + .fit_transform(score.reshape(-1, 1)) + .ravel() + ) + + def skip(self, X_train, y_test, X_test): + """Check if the solver can be skipped.""" + if find_length(X_train) == 0 and self.window_size == "auto": + return True, "Window size is 0" + return False, None + + def get_result(self): + """Return the result of the solver.""" + # Binarizing the scores to 0 and 1 + # TEMPORARY SOLUTION + self.final_score = np.where(self.score > 0.75, 1, 0) + return dict(y_hat=self.final_score) From 2cdf79acabd38b7fb5c8a2188e3a1b332a656dbe Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 11:18:53 +0300 Subject: [PATCH 06/50] ADD RoseCDL solver --- solvers/rosecdl.py | 77 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 solvers/rosecdl.py diff --git a/solvers/rosecdl.py b/solvers/rosecdl.py new file mode 100644 index 0000000..f33e86d --- /dev/null +++ b/solvers/rosecdl.py @@ -0,0 +1,77 @@ +from benchopt import safe_import_context, BaseSolver + +with safe_import_context() as import_ctx: + from rosecdl.rosecdl import RoseCDL + import torch + + +class Solver(BaseSolver): + name = "RoseCDL" + + install_cmd = "conda" + requirements = ["pip:rosecdl"] + + parameters = { + "n_components": [1], + "n_channels": [1], + "kernel_size": [64], + "lmbd": [0.8], + "scale_lmbd": [False], + "epochs": [5, 50], + "max_batch": [None], + "mini_batch_size": [600], + "sample_window": [10_000], + "optimizer": ["adam"], + "n_iterations": [10, 90], + "window": [False], + "outliers_kwargs": [ + { + "method": "mad", + "alpha": 3.5, + "moving_average": None, + "union_channels": True, + "opening_window": True, + }, + ], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu" + ) + + # We receive data in shape (n_samples, n_features) + # We want to reshape it to (n_recordings, n_features, n_samples) + X_train = X_train.reshape(1, X_train.shape[1], X_train.shape[0]) + X_test = X_test.reshape(1, X_test.shape[1], X_test.shape[0]) + self.y_test = y_test + + self.X_train = torch.tensor( + X_train, dtype=torch.float32, device=self.device) + self.X_test = X_test + + self.clf = RoseCDL( + n_components=self.n_components, + n_channels=self.n_channels, + kernel_size=self.kernel_size, + lmbd=self.lmbd, + scale_lmbd=self.scale_lmbd, + epochs=self.epochs, + max_batch=self.max_batch, + mini_batch_size=self.mini_batch_size, + sample_window=self.sample_window, + optimizer=self.optimizer, + n_iterations=self.n_iterations, + window=self.window, + device=self.device, + outliers_kwargs=self.outliers_kwargs, + ) + + def run(self, _): + self.clf.fit(self.X_train) + self.y_pred = self.clf.get_outlier_mask(self.X_test) + + def get_result(self): + return dict(y_hat=self.y_pred) From f6f55ead5b6f6dc3aa36b0d9f924bab060ff60f1 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 12:35:44 +0300 Subject: [PATCH 07/50] ADD DAPHNET dataset --- datasets/daphnet.py | 124 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 datasets/daphnet.py diff --git a/datasets/daphnet.py b/datasets/daphnet.py new file mode 100644 index 0000000..9e4ce3e --- /dev/null +++ b/datasets/daphnet.py @@ -0,0 +1,124 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("DAPHNET") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with .test.csv@X.out pattern + record_files = list(db_path.glob("*.test.csv@*.out")) + record_ids = [f.name for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Find all files matching the pattern for the given record_id + record_files = list(db_path.glob(f"{record_id}.test.csv@*.out")) + + if not record_files: + print(f"No record files found for ID: {record_id}") + continue + + for record_file in record_files: + print(f"Loading record file: {record_file}") + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print( + f"Insufficient columns for record file {record_file.name}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "DAPHNET" + + parameters = { + "recordings_id": [["S01R02E0"]], + "debug": [False], + } + + def get_data(self): + """Load the DAPHNET dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) From 4bd99f5e7657e3c729f6bd6db76e2824548cb136 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 12:41:13 +0300 Subject: [PATCH 08/50] ADD Dodgers dataset --- datasets/dodgers.py | 122 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 datasets/dodgers.py diff --git a/datasets/dodgers.py b/datasets/dodgers.py new file mode 100644 index 0000000..0418e89 --- /dev/null +++ b/datasets/dodgers.py @@ -0,0 +1,122 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("DODGERS") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with freeway-traffic pattern + record_files = list(db_path.glob("*-freeway-traffic.test.out")) + record_ids = [f.name for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Handle direct filename or construct pattern + if record_id.endswith('-freeway-traffic.test.out'): + record_file = db_path / record_id + else: + record_file = db_path / f"{record_id}-freeway-traffic.test.out" + if record_file.exists(): + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record {record_id}") + else: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "DODGERS" + + parameters = { + "recordings_id": [["101"]], + "debug": [False], + } + + def get_data(self): + """Load the DODGERS dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) From 8d266971c0d303ae5f8b5af2d18e6285da6d620c Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 14:42:51 +0300 Subject: [PATCH 09/50] ADD Genesis dataset --- datasets/genesis.py | 123 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 datasets/genesis.py diff --git a/datasets/genesis.py b/datasets/genesis.py new file mode 100644 index 0000000..696f266 --- /dev/null +++ b/datasets/genesis.py @@ -0,0 +1,123 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("GENESIS") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with genesis pattern + record_files = list(db_path.glob("genesis-*.out")) + record_ids = [f.name for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Handle direct filename or construct genesis pattern + if record_id.startswith('genesis-') and record_id.endswith('.out'): + record_file = db_path / record_id + else: + record_file = db_path / \ + f"genesis-anomalies.test.csv@{record_id}.out" + if record_file.exists(): + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record {record_id}") + else: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "GENESIS" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the GENESIS dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) From 95c9a99ec27fd39c2209c7868b2397851d4ba3d7 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 14:46:50 +0300 Subject: [PATCH 10/50] ADD GHL dataset --- datasets/ghl.py | 127 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 datasets/ghl.py diff --git a/datasets/ghl.py b/datasets/ghl.py new file mode 100644 index 0000000..074862d --- /dev/null +++ b/datasets/ghl.py @@ -0,0 +1,127 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("GHL") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with GHL pattern + record_files = list(db_path.glob( + "*_Lev_fault_Temp_corr_*.test.csv@*.out")) + record_ids = [f.name for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Handle direct filename or construct pattern + if '_Lev_fault_Temp_corr_' in record_id and record_id.endswith('.out'): + record_file = db_path / record_id + else: + # Try to find matching files with pattern + matching_files = list(db_path.glob( + f"*{record_id}*_Lev_fault_Temp_corr_*.out")) + record_file = matching_files[0] if matching_files else db_path / \ + f"{record_id}.out" + if record_file.exists(): + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record {record_id}") + else: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "GHL" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the GHL dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) From 01ed75b9357d755094af7fbb9b1121b77b7de1a0 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 15:06:35 +0300 Subject: [PATCH 11/50] ADD Occupancy dataset --- datasets/occupancy.py | 135 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 datasets/occupancy.py diff --git a/datasets/occupancy.py b/datasets/occupancy.py new file mode 100644 index 0000000..1a6f6fd --- /dev/null +++ b/datasets/occupancy.py @@ -0,0 +1,135 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("OCCUPANCY") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load for testing. + + Returns: + tuple: (X_train, X_test, y_test) where: + - X_train: numpy array of shape (num_records, num_samples) + - X_test: numpy array of shape (num_records, num_samples) + - y_test: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + # Load training data + train_files = sorted(list(db_path.glob("room-occupancy.train.csv@*.out"))) + print(train_files) + if not train_files: + raise FileNotFoundError("No training files found.") + train_data_list = [ + pd.read_csv(f, header=None).dropna().to_numpy()[:, 0].astype(float) + for f in train_files + ] + # Concatenate all training series into a single array + X_train = np.concatenate(train_data_list) + + # Load testing data + if record_ids is None: + record_ids = sorted( + list(set( + f.name.split('.')[0].split('-')[-1] + for f in db_path.glob("room-occupancy-*.test.csv@*.out") + )) + ) + + test_data_list = [] + labels_list = [] + for record_id in record_ids: + test_files = sorted( + list(db_path.glob(f"room-occupancy-{record_id}.test.csv@*.out")) + ) + if not test_files: + print(f"No test files found for record_id {record_id}") + continue + + for test_file in test_files: + record_data = pd.read_csv( + test_file, header=None).dropna().to_numpy() + if record_data.shape[1] >= 2: + test_data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record file {test_file.name}") + + if not test_data_list: + raise ValueError("No valid test data found") + + # Find maximum length for padding test data + max_length = max(len(data) for data in test_data_list) + + # Pad all test sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(test_data_list, labels_list): + pad_width = max_length - len(data) + if pad_width > 0: + padded_data.append( + np.pad( + data, ( + 0, + pad_width), + mode="constant", + constant_values=data[-1] + ) + ) + padded_labels.append( + np.pad( + labels, (0, pad_width), mode="constant", constant_values=0 + ) + ) + else: + padded_data.append(data) + padded_labels.append(labels) + + X_test = np.array(padded_data) + y_test = np.array(padded_labels) + + # Reshape X_train to be 2D + X_train = X_train.reshape(1, -1) + + return X_train, X_test, y_test + + +class Dataset(BaseDataset): + name = "OCCUPANCY" + + parameters = { + "recordings_id": [None], + "debug": [False], + } + + def get_data(self): + """Load the OCCUPANCY dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X_train, X_test, y_test = load_data(PATH, self.recordings_id) + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) From dfa9fedbb6b14c34f3f811e6feab5f1f7fdf8597 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 15:11:32 +0300 Subject: [PATCH 12/50] ADD SensorScope dataset --- datasets/sensorscope.py | 123 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 datasets/sensorscope.py diff --git a/datasets/sensorscope.py b/datasets/sensorscope.py new file mode 100644 index 0000000..aa00fde --- /dev/null +++ b/datasets/sensorscope.py @@ -0,0 +1,123 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("SENSORSCOPE") + PATH = "/data/parietal/store2/data/tsb-uad/TSB-UAD-Public/SensorScope/" + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with stb pattern + record_files = list(db_path.glob("stb-*.test.out")) + record_ids = [f.name for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Handle direct filename or construct pattern + if record_id.startswith('stb-') and record_id.endswith('.test.out'): + record_file = db_path / record_id + else: + record_file = db_path / f"stb-{record_id}.test.out" + if record_file.exists(): + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record {record_id}") + else: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "SENSORSCOPE" + + parameters = { + "recordings_id": [["10", "11"]], + "debug": [False], + } + + def get_data(self): + """Load the SENSORSCOPE dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) From e05fd97de0dfba82dc8efdb611830527e4d11227 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 15:11:49 +0300 Subject: [PATCH 13/50] ADD SensorScope dataset --- datasets/sensorscope.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datasets/sensorscope.py b/datasets/sensorscope.py index aa00fde..64b4cab 100644 --- a/datasets/sensorscope.py +++ b/datasets/sensorscope.py @@ -6,7 +6,6 @@ import pandas as pd PATH = config.get_data_path("SENSORSCOPE") - PATH = "/data/parietal/store2/data/tsb-uad/TSB-UAD-Public/SensorScope/" def load_data(db_path, record_ids=None): From 4118e060edde62f9c063dd65fd5219a61311817d Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 15:23:25 +0300 Subject: [PATCH 14/50] Fix formatting --- benchmark_utils/models.py | 45 +++++++++++++++++++++++++++++---------- datasets/ecg.py | 27 ++++++++++++++++------- datasets/mgab.py | 12 ++++++----- datasets/mitdb.py | 18 ++++++++++------ 4 files changed, 71 insertions(+), 31 deletions(-) diff --git a/benchmark_utils/models.py b/benchmark_utils/models.py index e423f67..1432e97 100644 --- a/benchmark_utils/models.py +++ b/benchmark_utils/models.py @@ -144,7 +144,13 @@ def __getitem__(self, idx): class Autoencoder(nn.Module): - def __init__(self, input_size=32, hidden_size=32, latent_size=16, sliding_window=10): + def __init__( + self, + input_size=32, + hidden_size=32, + latent_size=16, + sliding_window=10 + ): super(Autoencoder, self).__init__() self.sliding_window = sliding_window @@ -200,12 +206,19 @@ def _create_sliding_windows(self, X): return torch.stack(windows) - def fit(self, X, num_epochs=50, learning_rate=1e-3, device="cuda", batch_size=32): + def fit( + self, + X, + num_epochs=50, + learning_rate=1e-3, + device="cuda", + batch_size=32 + ): """ Train the autoencoder on the provided data. Args: - X: Input data as tensor or numpy array shape (n_samples, n_features) + X: Input data tensor or numpy array shape (n_samples, n_features) num_epochs: Number of training epochs learning_rate: Learning rate for optimizer device: Device to train on ('cuda' or 'cpu') @@ -229,8 +242,10 @@ def fit(self, X, num_epochs=50, learning_rate=1e-3, device="cuda", batch_size=32 windowed_data = self._create_sliding_windows(X) # Create dataset and dataloader - dataset = SlidingWindowDataset(windowed_data, window_size=1) # window_size=1 since we already created windows - dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True) + # window_size=1 since we already created windows + dataset = SlidingWindowDataset(windowed_data, window_size=1) + dataloader = DataLoader( + dataset, batch_size=batch_size, shuffle=True, drop_last=True) self.to(device) criterion = nn.MSELoss() @@ -246,7 +261,8 @@ def fit(self, X, num_epochs=50, learning_rate=1e-3, device="cuda", batch_size=32 epoch_loss = 0.0 # Progress bar for batches - batch_pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False) + batch_pbar = tqdm( + dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False) for batch_idx, (data) in enumerate(batch_pbar): data = data.to(device) @@ -299,22 +315,29 @@ def predict(self, X_test, X_dirty=None, device="cuda"): test_predict = self(windowed_test).cpu().numpy() # Calculate MAE loss - test_mae_loss = np.mean(np.abs(test_predict - windowed_test.cpu().numpy()), axis=1) + test_mae_loss = np.mean( + np.abs(test_predict - windowed_test.cpu().numpy()), axis=1) # Normalize MAE loss - nor_test_mae_loss = MinMaxScaler().fit_transform(test_mae_loss.reshape(-1, 1)).ravel() + nor_test_mae_loss = MinMaxScaler().fit_transform( + test_mae_loss.reshape(-1, 1)).ravel() # Use X_dirty if provided, otherwise use original X_test if X_dirty is None: - X_dirty = X_test.cpu().numpy() if isinstance(X_test, torch.Tensor) else X_test + if isinstance(X_test, torch.Tensor): + X_dirty = X_test.cpu().numpy() + else: + X_dirty = X_test # Initialize score array score = np.zeros(len(X_dirty)) # Fill the score array with sliding window approach - score[self.sliding_window // 2:self.sliding_window // 2 + len(test_mae_loss)] = nor_test_mae_loss + score[self.sliding_window // 2:self.sliding_window // + 2 + len(test_mae_loss)] = nor_test_mae_loss score[:self.sliding_window // 2] = nor_test_mae_loss[0] - score[self.sliding_window // 2 + len(test_mae_loss):] = nor_test_mae_loss[-1] + score[self.sliding_window // 2 + + len(test_mae_loss):] = nor_test_mae_loss[-1] # Store decision scores self.decision_scores_ = score diff --git a/datasets/ecg.py b/datasets/ecg.py index 20ea4fc..30bb2b1 100644 --- a/datasets/ecg.py +++ b/datasets/ecg.py @@ -14,12 +14,13 @@ def load_data(db_path, record_ids=None): Args: db_path: Path to the database directory - record_ids: List of record IDs to load. If None, loads all available records. + record_ids: List of record IDs to load. + If None, loads all available records. Returns: tuple: (X, y_true) where: - - X: numpy array of shape (num_records, num_samples) - the time series data - - y_true: numpy array of shape (num_records, num_samples) - the labels + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) """ db_path = Path(db_path) @@ -34,7 +35,8 @@ def load_data(db_path, record_ids=None): record_file = db_path / f"MBA_ECG14046_data_{record_id}.out" if record_file.exists(): # Load the record data - record_data = pd.read_csv(record_file, header=None).dropna().to_numpy() + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() # Assuming first column is the data, second column is labels if record_data.shape[1] >= 2: data_list.append(record_data[:, 0].astype(float)) @@ -56,8 +58,18 @@ def load_data(db_path, record_ids=None): for data, labels in zip(data_list, labels_list): if len(data) < max_length: # Pad with last value for data and 0 for labels - padded_data.append(np.pad(data, (0, max_length - len(data)), mode='constant', constant_values=data[-1])) - padded_labels.append(np.pad(labels, (0, max_length - len(labels)), mode='constant', constant_values=0)) + padded_data.append(np.pad( + data, + (0, max_length - len(data)), + mode='constant', + constant_values=data[-1]) + ) + padded_labels.append(np.pad( + labels, + (0, max_length - len(labels)), + mode='constant', + constant_values=0), + ) else: padded_data.append(data[:max_length]) padded_labels.append(labels[:max_length]) @@ -65,7 +77,6 @@ def load_data(db_path, record_ids=None): return np.array(padded_data), np.array(padded_labels) - class Dataset(BaseDataset): name = "ECG" @@ -101,4 +112,4 @@ def get_data(self): X_train=X_train, y_test=y_test, X_test=X_test - ) \ No newline at end of file + ) diff --git a/datasets/mgab.py b/datasets/mgab.py index e4004df..cfe610c 100644 --- a/datasets/mgab.py +++ b/datasets/mgab.py @@ -14,12 +14,13 @@ def load_data(db_path, record_ids=None): Args: db_path: Path to the database directory - record_ids: List of record IDs to load. If None, loads all available records. + record_ids: List of record IDs to load. + If None, loads all available records. Returns: tuple: (X, y_true) where: - - X: numpy array of shape (num_records, num_samples) - the time series data - - y_true: numpy array of shape (num_records, num_samples) - the labels + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) """ db_path = Path(db_path) @@ -34,7 +35,8 @@ def load_data(db_path, record_ids=None): record_file = db_path / f"{record_id}.test.out" if record_file.exists(): # Load the record data - record_data = pd.read_csv(record_file, header=None).dropna().to_numpy() + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() # Assuming first column is the data, second column is labels if record_data.shape[1] >= 2: data_list.append(record_data[:, 0].astype(float)) @@ -113,4 +115,4 @@ def get_data(self): X_train=X_train, y_test=y_test, X_test=X_test - ) \ No newline at end of file + ) diff --git a/datasets/mitdb.py b/datasets/mitdb.py index 66e9515..e9b03bc 100644 --- a/datasets/mitdb.py +++ b/datasets/mitdb.py @@ -14,17 +14,18 @@ def load_mitdb_data(db_path, record_ids=None): Args: db_path: Path to the database directory - record_ids: List of record IDs to load. If None, loads all available records. + record_ids: List of record IDs to load. + If None, loads all available records. Returns: tuple: (X, y_true) where: - - X: numpy array of shape (num_records, num_samples) - the time series data - - y_true: numpy array of shape (num_records, num_samples) - the labels + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) """ db_path = Path(db_path) if record_ids is None: - # Get all available record files with the format like 100.test.csv@1.out + # Get all available record files with format like 100.test.csv@1.out record_files = list(db_path.glob("*.out")) record_ids = [f.name for f in record_files] @@ -38,11 +39,13 @@ def load_mitdb_data(db_path, record_ids=None): if record_files: if len(record_files) > 1: print( - f"Multiple files found for record ID {record_id}, using the first one: {record_files[0]}" + f"Multiple files found for record ID {record_id}, " + f"using the first one: {record_files[0]}" ) record_file = record_files[0] # Load the record data - record_data = pd.read_csv(record_file, header=None).dropna().to_numpy() + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() # Assuming first column is the data, second column is labels print(f"Loaded record {record_id} with shape {record_data.shape}") if record_data.shape[1] >= 2: @@ -121,6 +124,7 @@ def get_data(self): y_test = y_test.reshape(-1, 1) print( - f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_test shape: {y_test.shape}" + f"X_train shape: {X_train.shape}, " + f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}" ) return dict(X_train=X_train, y_test=y_test, X_test=X_test) From 30491fec50c9da668259d4740a4b057a16ed0550 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 15:24:39 +0300 Subject: [PATCH 15/50] Fix formatting --- solvers/tsb_ocsvm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/solvers/tsb_ocsvm.py b/solvers/tsb_ocsvm.py index b06371e..7a76396 100644 --- a/solvers/tsb_ocsvm.py +++ b/solvers/tsb_ocsvm.py @@ -31,8 +31,10 @@ def set_objective(self, X_train, y_test, X_test): X_train = Window(window=self.window_size).convert(X_train).to_numpy() X_test = Window(window=self.window_size).convert(X_test).to_numpy() - self.X_train = MinMaxScaler(feature_range=(0, 1)).fit_transform(X_train.T).T - self.X_test = MinMaxScaler(feature_range=(0, 1)).fit_transform(X_test.T).T + self.X_train = MinMaxScaler( + feature_range=(0, 1)).fit_transform(X_train.T).T + self.X_test = MinMaxScaler( + feature_range=(0, 1)).fit_transform(X_test.T).T self.y_test = y_test.reshape(-1) From ae5c1c262b0faa93e62ac6c18aa92dd9c0849427 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Fri, 1 Aug 2025 16:12:25 +0300 Subject: [PATCH 16/50] New datasets --- datasets/iops.py | 136 +++++++++++++++++++++++++++++++++++++++ datasets/kdd21.py | 121 +++++++++++++++++++++++++++++++++++ datasets/nab.py | 120 +++++++++++++++++++++++++++++++++++ datasets/opportunity.py | 123 ++++++++++++++++++++++++++++++++++++ datasets/smd.py | 129 +++++++++++++++++++++++++++++++++++++ datasets/svdb.py | 127 +++++++++++++++++++++++++++++++++++++ datasets/yahoo.py | 137 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 893 insertions(+) create mode 100644 datasets/iops.py create mode 100644 datasets/kdd21.py create mode 100644 datasets/nab.py create mode 100644 datasets/opportunity.py create mode 100644 datasets/smd.py create mode 100644 datasets/svdb.py create mode 100644 datasets/yahoo.py diff --git a/datasets/iops.py b/datasets/iops.py new file mode 100644 index 0000000..d15603d --- /dev/null +++ b/datasets/iops.py @@ -0,0 +1,136 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("IOPS") + PATH = "/data/parietal/store2/data/tsb-uad/TSB-UAD-Public/IOPS/" + + +def load_data(db_path): + """ + Load train and test data from the database path. + + Args: + db_path: Path to the database directory + + Returns: + tuple: (X_train, X_test, y_test) where: + - X_train: nd.array of shape (num_records, num_samples) + - X_test: nd.array of shape (num_records, num_samples) + - y_test: nd.array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + # Get all train and test files + train_files = list(db_path.glob("KPI-*.train.out")) + test_files = list(db_path.glob("KPI-*.test.out")) + + if not train_files or not test_files: + raise ValueError("No train or test files found") + + # Load train data + train_data_list = [] + for train_file in train_files: + record_data = pd.read_csv(train_file, header=None).dropna().to_numpy() + if record_data.shape[1] >= 1: + train_data_list.append(record_data[:, 0].astype(float)) + else: + print(f"Insufficient columns for train file {train_file}") + + # Load test data and labels + test_data_list = [] + test_labels_list = [] + for test_file in test_files: + record_data = pd.read_csv(test_file, header=None).dropna().to_numpy() + if record_data.shape[1] >= 2: + test_data_list.append(record_data[:, 0].astype(float)) + test_labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for test file {test_file}") + + if not train_data_list or not test_data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_train_length = max(len(data) for data in train_data_list) + max_test_length = max(len(data) for data in test_data_list) + + # Pad train sequences + padded_train_data = [] + for data in train_data_list: + if len(data) < max_train_length: + padded_train_data.append( + np.pad( + data, + (0, max_train_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + else: + padded_train_data.append(data[:max_train_length]) + + # Pad test sequences and labels + padded_test_data = [] + padded_test_labels = [] + for data, labels in zip(test_data_list, test_labels_list): + if len(data) < max_test_length: + padded_test_data.append( + np.pad( + data, + (0, max_test_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_test_labels.append( + np.pad( + labels, + (0, max_test_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_test_data.append(data[:max_test_length]) + padded_test_labels.append(labels[:max_test_length]) + + return ( + np.array(padded_train_data), + np.array(padded_test_data), + np.array(padded_test_labels) + ) + + +class Dataset(BaseDataset): + name = "IOPS" + + parameters = { + "debug": [False], + } + + def get_data(self): + """Load the IOPS dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X_train, X_test, y_test = load_data(PATH) + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/kdd21.py b/datasets/kdd21.py new file mode 100644 index 0000000..5fc690e --- /dev/null +++ b/datasets/kdd21.py @@ -0,0 +1,121 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("KDD21") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + if record_ids is None: + # Get all available record files + record_files = list(db_path.glob("*.out")) + record_ids = [f.name.split('_')[0] for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Convert record_id to 3-digit format + formatted_id = str(record_id).zfill(3) + # Find file that starts with the formatted record_id + matching_files = list(db_path.glob(f"{formatted_id}_*.out")) + if matching_files: + record_file = matching_files[0] # Take the first matching file + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record {record_id}") + else: + print(f"Record file not found for ID: {record_id}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "KDD21" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the KDD21 dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/nab.py b/datasets/nab.py new file mode 100644 index 0000000..bba2f90 --- /dev/null +++ b/datasets/nab.py @@ -0,0 +1,120 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("NAB") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files + record_files = list(db_path.glob("NAB_data_*.out")) + record_ids = [f.name.split('_')[2] for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + record_files = list(db_path.glob(f"NAB_data_{record_id}_*.out")) + if record_files: + # Take the first matching file + record_file = record_files[0] + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record {record_id}") + else: + print(f"Record file not found for: {record_id}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "NAB" + + parameters = { + "recordings_id": [["art0"], ["art1"], ["CloudWatch"]], + "debug": [False], + } + + def get_data(self): + """Load the NAB dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/opportunity.py b/datasets/opportunity.py new file mode 100644 index 0000000..604c4b2 --- /dev/null +++ b/datasets/opportunity.py @@ -0,0 +1,123 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("OPPORTUNITY") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with S*-ADL*.test.csv@*.out pattern + record_files = list(db_path.glob("S*-ADL*.test.csv@*.out")) + # Extract record_id from filename + record_ids = [f.name.split('-')[0][1:] for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Find files matching the pattern S{record_id}-ADL*.test.csv@*.out + pattern = f"S{record_id}-ADL*.test.csv@*.out" + matching_files = list(db_path.glob(pattern)) + + if matching_files: + record_file = matching_files[0] # Take first match + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record {record_id}") + else: + print(f"Record file not found for pattern: {pattern}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "OPPORTUNITY" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the OPPORTUNITY dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/smd.py b/datasets/smd.py new file mode 100644 index 0000000..8dacaea --- /dev/null +++ b/datasets/smd.py @@ -0,0 +1,129 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("SMD") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files matching the pattern + record_files = list(db_path.glob("machine-*-*.test.csv*")) + # Extract record IDs from filenames + record_ids = [] + for f in record_files: + # Extract from machine-{record_id}-*.test.csv + parts = f.stem.split('-') + if len(parts) >= 3: + record_ids.append(parts[1]) + record_ids = list(set(record_ids)) # Remove duplicates + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Find files matching the pattern + pattern = f"machine-{record_id}-*.test.csv*" + record_files = list(db_path.glob(pattern)) + + for record_file in record_files: + if record_file.exists(): + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record {record_id}") + else: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "SMD" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the SMD dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/svdb.py b/datasets/svdb.py new file mode 100644 index 0000000..bd679f9 --- /dev/null +++ b/datasets/svdb.py @@ -0,0 +1,127 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("SVDB") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + record_files = list(db_path.glob("*.test.csv@*.out")) + record_ids = [f.name for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Handle case where record_id already includes the pattern + if record_id.endswith('.test.csv@*.out'): + pattern = record_id + else: + pattern = f"{record_id}.test.csv@*.out" + + # Find all matching files for this record_id + matching_files = list(db_path.glob(pattern)) + + if not matching_files: + print(f"No files found for record {record_id}") + continue + + for record_file in matching_files: + if record_file.exists(): + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for file {record_file}") + else: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + max_length = max(len(data) for data in data_list) + + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Padding with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "SVDB" + + parameters = { + "recordings_id": [["801"]], + "debug": [False], + } + + def get_data(self): + """Load the SVDB dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/yahoo.py b/datasets/yahoo.py new file mode 100644 index 0000000..5aff5de --- /dev/null +++ b/datasets/yahoo.py @@ -0,0 +1,137 @@ +from benchopt import BaseDataset, safe_import_context, config + +with safe_import_context() as import_ctx: + from pathlib import Path + import numpy as np + import pandas as pd + + PATH = config.get_data_path("YAHOO") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + record_files = list(db_path.glob("*.data.out")) + record_ids = [f.name for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Handle case where record_id already includes the pattern + if record_id.endswith('.data.out'): + pattern = record_id + else: + # Create pattern based on the A{record_id} format + patterns = [ + f"Yahoo_A{record_id}real_*_data.out", + f"Yahoo_A{record_id}synthetic_*_data.out", + f"YahooA{record_id}Benchmark-TS*_data.out" + ] + + # Find all matching files for this record_id + matching_files = [] + if record_id.endswith('.data.out'): + matching_files = list(db_path.glob(pattern)) + else: + for pattern in patterns: + matching_files.extend(list(db_path.glob(pattern))) + + if not matching_files: + print(f"No files found for record {record_id}") + continue + + for record_file in matching_files: + if record_file.exists(): + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # First column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for file {record_file}") + else: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + max_length = max(len(data) for data in data_list) + + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Padding with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "YAHOO" + + parameters = { + "recordings_id": [["1"]], + "debug": [False], + } + + def get_data(self): + """Load the YAHOO dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_samples, n_features) + X_train = X_train.reshape(-1, 1) + X_test = X_test.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) From 4ff164e6e201280af0e8c2881cc744556fe22701 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Mon, 15 Sep 2025 11:15:22 +0200 Subject: [PATCH 17/50] Refactor datasets and new solvers. --- datasets/ecg.py | 21 +++++++++++------- datasets/simulated.py | 12 ++++++++--- objective.py | 18 +++++++++++++--- solvers/AR.py | 29 ++++++++++++++++--------- solvers/rosecdl.py | 33 +++++++++++++++++----------- solvers/tsb_chronos.py | 48 +++++++++++++++++++++++++++++++++++++++++ solvers/tsb_timesfm.py | 36 +++++++++++++++++++++++++++++++ solvers/tsb_timesnet.py | 45 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 206 insertions(+), 36 deletions(-) create mode 100644 solvers/tsb_chronos.py create mode 100644 solvers/tsb_timesfm.py create mode 100644 solvers/tsb_timesnet.py diff --git a/datasets/ecg.py b/datasets/ecg.py index 30bb2b1..743d5e2 100644 --- a/datasets/ecg.py +++ b/datasets/ecg.py @@ -5,10 +5,11 @@ import numpy as np import pandas as pd - PATH = config.get_data_path("ECG") + # PATH = config.get_data_path("ECG") + PATH = "/data/parietal/store2/data/tsb-uad/TSB-UAD-Public/ECG" -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. @@ -16,6 +17,7 @@ def load_data(db_path, record_ids=None): db_path: Path to the database directory record_ids: List of record IDs to load. If None, loads all available records. + verbose: If True, print loading progress information. Returns: tuple: (X, y_true) where: @@ -42,9 +44,11 @@ def load_data(db_path, record_ids=None): data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for record {record_id}") + if verbose: + print(f"Insufficient columns for record {record_id}") else: - print(f"Record file not found: {record_file}") + if verbose: + print(f"Record file not found: {record_file}") if not data_list: raise ValueError("No valid data found") @@ -103,10 +107,11 @@ def get_data(self): X_test = X_test[:, :size] y_test = y_test[:, :size] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) return dict( X_train=X_train, diff --git a/datasets/simulated.py b/datasets/simulated.py index 7f48524..ddf94fe 100644 --- a/datasets/simulated.py +++ b/datasets/simulated.py @@ -12,10 +12,10 @@ class Dataset(BaseDataset): requirements = ["scikit-learn"] parameters = { - "n_samples": [10000], - "n_features": [5], + "n_samples": [10_000], + "n_features": [1], "noise": [0.1], - "n_anomaly": [90], + "n_anomaly": [900], } test_parameters = { @@ -57,4 +57,10 @@ def get_data(self): * 10 ) + # Reshaping data to (n_recordings, n_features, n_samples) + # For simulated data, treat as single recording + X_train = X_train.T.reshape(1, self.n_features, -1) + X_test = X_test.T.reshape(1, self.n_features, -1) + y_test = y_test.reshape(1, -1) + return dict(X_train=X_train, y_test=y_test, X_test=X_test) diff --git a/objective.py b/objective.py index 7af1f4c..b2cc69b 100644 --- a/objective.py +++ b/objective.py @@ -15,6 +15,7 @@ from sklearn.metrics import ( precision_score, recall_score, f1_score, zero_one_loss ) + from TSB_AD.evaluation.metrics import get_metrics class Objective(BaseObjective): @@ -35,11 +36,17 @@ def set_data(self, X_train, y_test, X_test): self.X_train = X_train self.X_test, self.y_test = X_test, y_test - def evaluate_result(self, y_hat): + def evaluate_result(self, y_hat, raw_anomaly_score=None): """Evaluate the result provided by the solver.""" + print("y_hat shape", y_hat.shape) + print("self.y_test shape", self.y_test.shape) + to_discard = (y_hat == -1).sum() - self.y_test = self.y_test[to_discard:].reshape(-1) - y_hat = y_hat[to_discard:].reshape(-1) + self.y_test = self.y_test.reshape(-1)[to_discard:] + y_hat = y_hat.reshape(-1)[to_discard:] + + print("y_hat shape after discard", y_hat.shape) + print("self.y_test shape after discard", self.y_test.shape) result = {} detection_ranges = [1, 3, 5, 10, 20] @@ -94,6 +101,11 @@ def evaluate_result(self, y_hat): "value": zoloss # having zoloss twice for the API }) + print("Computing TSB metrics") + if raw_anomaly_score is not None: + tsb_metrics = get_metrics(raw_anomaly_score, self.y_test, slidingWindow=1, version="opt_mem") + result.update(tsb_metrics) + return result def get_objective(self): diff --git a/solvers/AR.py b/solvers/AR.py index ff2d547..e524841 100644 --- a/solvers/AR.py +++ b/solvers/AR.py @@ -23,7 +23,7 @@ class Solver(BaseSolver): "n_epochs": [50], "lr": [1e-5], "weight_decay": [1e-7], - "window_size": [256], + "window_size": [100], "horizon": [1], "percentile": [99.4], } @@ -34,26 +34,35 @@ def set_objective(self, X_train, y_test, X_test): "cuda" if torch.cuda.is_available() else "cpu" ) - self.X_train = X_train # (n_samples, n_features) - self.X_test, self.y_test = X_test, y_test # (n_samples, n_features) - self.n_features = X_train.shape[1] + # Receiving shapes of (n_recordings, n_features, n_samples) + + _, n_features, _ = X_train.shape + + self.X_train = X_train.reshape(-1, n_features) # (n_samples, n_features) + self.X_test = X_test.reshape(-1, n_features) # (n_samples, n_features) + self.y_test = y_test.reshape(-1) # (n_samples,) self.model = ARModel( - self.n_features, + n_features, self.window_size, self.horizon ) self.optimizer = optim.Adam( self.model.parameters(), - lr=self.lr, + lr=float(self.lr), # weight_decay=self.weight_decay ) self.criterion = nn.MSELoss() + print("IN AR") + print("X_train shape", self.X_train.shape) + print("X_test shape", self.X_test.shape) + print("y_test shape", self.y_test.shape) + if self.X_train is not None: # (n_windows, window_size+horizon, n_features) self.Xw_train = np.lib.stride_tricks.sliding_window_view( - X_train, + self.X_train, window_shape=self.window_size+self.horizon, axis=0 ).transpose(0, 2, 1) @@ -61,7 +70,7 @@ def set_objective(self, X_train, y_test, X_test): if self.X_test is not None: # (n_windows, window_size+horizon, n_features) self.Xw_test = np.lib.stride_tricks.sliding_window_view( - X_test, + self.X_test, window_shape=self.window_size+self.horizon, axis=0 ).transpose(0, 2, 1) @@ -147,9 +156,9 @@ def run(self, _): # Skipping the solver call if a condition is met def skip(self, X_train, X_test, y_test): - if X_train.shape[0] < self.window_size + self.horizon: + if X_train.shape[0]*X_train.shape[2] < self.window_size + self.horizon: return True, "No enough training samples" - if X_test.shape[0] < self.window_size + self.horizon: + if X_test.shape[0]*X_test.shape[2] < self.window_size + self.horizon: return True, "No enough testing samples" return False, None diff --git a/solvers/rosecdl.py b/solvers/rosecdl.py index f33e86d..333b1a4 100644 --- a/solvers/rosecdl.py +++ b/solvers/rosecdl.py @@ -1,3 +1,4 @@ +from re import X from benchopt import safe_import_context, BaseSolver with safe_import_context() as import_ctx: @@ -9,20 +10,19 @@ class Solver(BaseSolver): name = "RoseCDL" install_cmd = "conda" - requirements = ["pip:rosecdl"] + requirements = ["pip:rosecdl", "pip:torch"] parameters = { "n_components": [1], - "n_channels": [1], "kernel_size": [64], "lmbd": [0.8], "scale_lmbd": [False], - "epochs": [5, 50], + "epochs": [50], "max_batch": [None], "mini_batch_size": [600], - "sample_window": [10_000], + "sample_window": [1_000], "optimizer": ["adam"], - "n_iterations": [10, 90], + "n_iterations": [90], "window": [False], "outliers_kwargs": [ { @@ -42,19 +42,15 @@ def set_objective(self, X_train, y_test, X_test): "cuda" if torch.cuda.is_available() else "cpu" ) - # We receive data in shape (n_samples, n_features) - # We want to reshape it to (n_recordings, n_features, n_samples) - X_train = X_train.reshape(1, X_train.shape[1], X_train.shape[0]) - X_test = X_test.reshape(1, X_test.shape[1], X_test.shape[0]) + # We receive data in shape (n_recordings, n_features, n_samples) self.y_test = y_test - self.X_train = torch.tensor( X_train, dtype=torch.float32, device=self.device) self.X_test = X_test self.clf = RoseCDL( n_components=self.n_components, - n_channels=self.n_channels, + n_channels=X_train.shape[1], kernel_size=self.kernel_size, lmbd=self.lmbd, scale_lmbd=self.scale_lmbd, @@ -73,5 +69,18 @@ def run(self, _): self.clf.fit(self.X_train) self.y_pred = self.clf.get_outlier_mask(self.X_test) + xh, zh = self.clf.csc( + torch.tensor(self.X_test, dtype=torch.float32, device=self.device) + ) + err = self.clf.loss_fn.compute_patch_error( + X_hat=xh, + z_hat=zh, + X=torch.tensor(self.X_test, dtype=torch.float32, + device=self.device), + ) + err = err.cpu().detach().numpy() + # Aggregate errors over channels + self.err = err.sum(axis=1).reshape(-1) + def get_result(self): - return dict(y_hat=self.y_pred) + return dict(y_hat=self.y_pred, raw_anomaly_score=self.err) diff --git a/solvers/tsb_chronos.py b/solvers/tsb_chronos.py new file mode 100644 index 0000000..a4cc51f --- /dev/null +++ b/solvers/tsb_chronos.py @@ -0,0 +1,48 @@ +from benchopt import BaseSolver, safe_import_context + +with safe_import_context() as import_ctx: + from TSB_AD.models.Chronos import Chronos + import numpy as np + + +class Solver(BaseSolver): + name = "TSB-Chronos" + + install_cmd = "conda" + requirements = ["pip:tsb-ad"] + + parameters = { + "win_size": [1000], + "prediction_length": [1], + "model_size": ['base'], + "batch_size": [32], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + _, n_features, _ = X_train.shape + self.data = np.append(X_train, X_test, axis=2) + self.data = self.data.reshape(-1, n_features) + self.X_test = X_test.reshape(-1, n_features) + + self.clf = Chronos( + win_size=self.win_size, + input_c=n_features, + prediction_length=self.prediction_length, + model_size=self.model_size, + batch_size=self.batch_size, + ) + + def run(self, _): + print("Running Chronos solver...") + self.clf.fit(self.data) + self.score = self.clf.decision_scores_[-len(self.X_test):] + print("Chronos Fitted") + + # Map scores to predictions + threshold = np.percentile(self.score, (1 - 0.1) * 100) + self.y_hat = (self.score > threshold).astype(int) + + def get_result(self): + return dict(y_hat=self.y_hat, raw_anomaly_score=self.score) diff --git a/solvers/tsb_timesfm.py b/solvers/tsb_timesfm.py new file mode 100644 index 0000000..77f03fc --- /dev/null +++ b/solvers/tsb_timesfm.py @@ -0,0 +1,36 @@ +from benchopt import BaseSolver, safe_import_context + +with safe_import_context() as import_ctx: + from TSB_AD.model_wrapper import run_TimesFM + import numpy as np + + +class Solver(BaseSolver): + name = "TSB-TimesFM" + + install_cmd = "conda" + requirements = ["pip:tsb-ad"] + + parameters = { + "win_size": [96], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + _, n_features, _ = X_train.shape + self.data = np.append(X_train, X_test, axis=2) + self.data = self.data.reshape(-1, n_features) + self.X_test = X_test.reshape(-1, n_features) + + def run(self, _): + self.y_hat = run_TimesFM( + data=self.data, + win_size=self.win_size, + ) + self.raw_anomaly_score = self.y_hat[-len(self.X_test):] + + def get_result(self): + threshold = np.percentile(self.raw_anomaly_score, 90) + self.y_hat = (self.raw_anomaly_score > threshold).astype(int) + return dict(y_hat=self.y_hat, raw_anomaly_score=self.raw_anomaly_score) diff --git a/solvers/tsb_timesnet.py b/solvers/tsb_timesnet.py new file mode 100644 index 0000000..ef4e253 --- /dev/null +++ b/solvers/tsb_timesnet.py @@ -0,0 +1,45 @@ +from benchopt import BaseSolver, safe_import_context + +with safe_import_context() as import_ctx: + from TSB_AD.models.TimesNet import TimesNet + + +class Solver(BaseSolver): + name = "TSB-TimesNet" + + install_cmd = "conda" + requirements = ["pip:tsb-ad"] + + parameters = { + "window_size": [96], + "lr": [1e-4], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + _, n_features, _ = X_train.shape + self.X_train = X_train.reshape(-1, n_features) + self.X_test = X_test.reshape(-1, n_features) + + self.clf = TimesNet( + win_size=self.window_size, + enc_in=n_features, + epochs=10, + batch_size=128, + lr=self.lr, + patience=3, + features="M", + lradj="type1", + validation_size=0.2, + ) + + def run(self, _): + self.clf.fit(self.X_train) + self.raw_anomaly_score = self.clf.decision_function(self.X_test) + + print("TimesNet done") + + def get_result(self): + self.y_hat = (self.raw_anomaly_score > 0).astype(int) + return dict(y_hat=self.y_hat, raw_anomaly_score=self.raw_anomaly_score) From fef8251e72921932b3a689db9a8262672dc485c9 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Mon, 22 Sep 2025 09:36:56 +0200 Subject: [PATCH 18/50] moved legacy solvers to solvers/legacy --- solvers/legacy/abod.py | 99 ++++++++++++++++++++++++++ solvers/legacy/cblof.py | 97 +++++++++++++++++++++++++ solvers/legacy/dif.py | 93 ++++++++++++++++++++++++ solvers/legacy/isolation-forest.py | 110 +++++++++++++++++++++++++++++ solvers/legacy/lof.py | 98 +++++++++++++++++++++++++ solvers/legacy/ocsvm.py | 88 +++++++++++++++++++++++ 6 files changed, 585 insertions(+) create mode 100644 solvers/legacy/abod.py create mode 100644 solvers/legacy/cblof.py create mode 100644 solvers/legacy/dif.py create mode 100644 solvers/legacy/isolation-forest.py create mode 100644 solvers/legacy/lof.py create mode 100644 solvers/legacy/ocsvm.py diff --git a/solvers/legacy/abod.py b/solvers/legacy/abod.py new file mode 100644 index 0000000..6ff02ae --- /dev/null +++ b/solvers/legacy/abod.py @@ -0,0 +1,99 @@ +# ABOD solver + +from benchopt import BaseSolver +from benchopt import safe_import_context + +with safe_import_context() as import_ctx: + from pyod.models.abod import ABOD + import numpy as np + + +class Solver(BaseSolver): + name = "ABOD" # Angle-Based Outlier Detection + + install_cmd = "conda" + requirements = ["pip:pyod"] + + parameters = { + "contamination": [5e-4, 0.1, 0.2, 0.3], + "n_neighbors": [5, 10, 15, 20, 30], + "window": [True], + "window_size": [20], + "stride": [1], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + self.X_train = X_train + self.X_test, self.y_test = X_test, y_test + self.clf = ABOD( + n_neighbors=self.n_neighbors, + contamination=self.contamination, + method="fast" + ) + + def run(self, _): + # Using only windowed data, parameter used only for consistency + if self.window: + + # Transofrming the data into rolling windowed data + if self.X_train is not None: + self.Xw_train = np.lib.stride_tricks.sliding_window_view( + self.X_train, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + if self.X_test is not None: + self.Xw_test = np.lib.stride_tricks.sliding_window_view( + self.X_test, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + if self.y_test is not None: + self.yw_test = np.lib.stride_tricks.sliding_window_view( + self.y_test, window_shape=self.window_size, axis=0 + )[::self.stride] + + # Flattening the data for the model + flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) + flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) + + self.clf.fit(flatrain) + + raw_y_hat = self.clf.predict(flatest) + raw_anomaly_score = self.clf.decision_function(flatest) + + # The results we get has a shape of + result_shape = ( + (self.X_train.shape[0] - self.window_size) // self.stride + ) + 1 + + # Mapping the binary output from {-1, 1} to {1, 0} + # For consistency with the other solvers + self.raw_y_hat = np.array(raw_y_hat) + self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) + + # Adding -1 for the non predicted samples + # The first window_size samples are not predicted by the model + self.raw_y_hat = np.append( + np.full(self.X_train.shape[0] - + result_shape, -1), self.raw_y_hat + ) + + # Anomaly scores (Not used but allows finer thresholding) + self.raw_anomaly_score = np.array(raw_anomaly_score) + self.raw_anomaly_score = np.append( + np.full(result_shape, -1), self.raw_anomaly_score + ) + + # Function used to skip a solver call when n_neighbors >= window_size + def skip(self, X_train, X_test, y_test): + if self.n_neighbors >= self.window_size: + return True, "Number of neighbors greater than number of samples." + return False, None + + def get_result(self): + # Anomaly : 1 + # Inlier : 0 + # To ignore : -1 + self.y_hat = self.raw_y_hat + return dict(y_hat=self.y_hat) diff --git a/solvers/legacy/cblof.py b/solvers/legacy/cblof.py new file mode 100644 index 0000000..3e44432 --- /dev/null +++ b/solvers/legacy/cblof.py @@ -0,0 +1,97 @@ +# Cluster Based Local Outlier Factor (CBLOF) solver + +from benchopt import BaseSolver +from benchopt import safe_import_context + +with safe_import_context() as import_ctx: + from pyod.models.cblof import CBLOF + import numpy as np + + +class Solver(BaseSolver): + name = "CBLOF" + + install_cmd = "conda" + requirements = ["pip:pyod"] + + parameters = { + "contamination": [5e-4, 0.01, 0.02, 0.03, 0.04], + "window": [True], + "n_clusters": [10], + "window_size": [20], + "stride": [1], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + self.X_train = X_train + self.X_test, self.y_test = X_test, y_test + self.clf = CBLOF( + contamination=self.contamination, + n_clusters=self.n_clusters + ) + + def run(self, _): + # Using only windowed data, parameter used only for consistency + if self.window: + + # We need to transform the data to have a rolling window + if self.X_train is not None: + self.Xw_train = np.lib.stride_tricks.sliding_window_view( + self.X_train, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + if self.X_test is not None: + self.Xw_test = np.lib.stride_tricks.sliding_window_view( + self.X_test, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + if self.y_test is not None: + self.yw_test = np.lib.stride_tricks.sliding_window_view( + self.y_test, window_shape=self.window_size, axis=0 + )[::self.stride] + + # Flattening the data for the model + flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) + flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) + + self.clf.fit(flatrain) + raw_y_hat = self.clf.predict(flatest) + raw_anomaly_score = self.clf.decision_function(flatest) + + # The results we get has a shape of + result_shape = ( + (self.X_train.shape[0] - self.window_size) // self.stride + ) + 1 + + # Mapping the binary output from {-1, 1} to {1, 0} + # For consistency with the other solvers + self.raw_y_hat = np.array(raw_y_hat) + self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) + + # Adding -1 for the non predicted samples + # The first window_size samples are not predicted by the model + self.raw_y_hat = np.append( + np.full(self.X_train.shape[0] - + result_shape, -1), self.raw_y_hat + ) + + # Anomaly scores (Not used but allows finer thresholding) + self.raw_anomaly_score = np.array(raw_anomaly_score) + self.raw_anomaly_score = np.append( + np.full(result_shape, -1), self.raw_anomaly_score + ) + + # Skipping the solver call if a condition is met + def skip(self, X_train, X_test, y_test): + if X_train.shape[0] < self.window_size: + return True, "No enough samples to create a window" + return False, None + + def get_result(self): + # Anomaly : 1 + # Inlier : 0 + # To ignore : -1 + self.y_hat = self.raw_y_hat + return dict(y_hat=self.y_hat) diff --git a/solvers/legacy/dif.py b/solvers/legacy/dif.py new file mode 100644 index 0000000..6aeef8e --- /dev/null +++ b/solvers/legacy/dif.py @@ -0,0 +1,93 @@ +# Deep Isolation Forest +from benchopt import BaseSolver +from benchopt import safe_import_context + +with safe_import_context() as import_ctx: + from pyod.models.dif import DIF + import numpy as np + + +class Solver(BaseSolver): + name = "DIF" + + install_cmd = "conda" + requirements = ["pip:pyod"] + + parameters = { + "contamination": [0.05, 0.1, 0.2], + "window": [True], + "window_size": [20], + "stride": [1], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + self.X_train = X_train + self.X_test, self.y_test = X_test, y_test + # Device is automatically selected by the model + # if device=None + self.clf = DIF(contamination=self.contamination, device=None) + + def run(self, _): + # Using only windowed data, parameter used only for consistency + if self.window: + + # Transofrming the data into rolling windowed data + if self.X_train is not None: + self.Xw_train = np.lib.stride_tricks.sliding_window_view( + self.X_train, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + if self.X_test is not None: + self.Xw_test = np.lib.stride_tricks.sliding_window_view( + self.X_test, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + if self.y_test is not None: + self.yw_test = np.lib.stride_tricks.sliding_window_view( + self.y_test, window_shape=self.window_size, axis=0 + )[::self.stride] + + # Flattening the data for the model + flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) + flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) + + self.clf.fit(flatrain) + raw_y_hat = self.clf.predict(flatest) + raw_anomaly_score = self.clf.decision_function(flatest) + + # The results we get has a shape of + result_shape = ( + (self.X_train.shape[0] - self.window_size) // self.stride + ) + 1 + + # Mapping the binary output from {-1, 1} to {1, 0} + # For consistency with the other solvers + self.raw_y_hat = np.array(raw_y_hat) + self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) + + # Adding -1 for the non predicted samples + # The first window_size samples are not predicted by the model + self.raw_y_hat = np.append( + np.full(self.X_train.shape[0] - + result_shape, -1), self.raw_y_hat + ) + + # Anomaly scores (Not used but allows finer thresholding) + self.raw_anomaly_score = np.array(raw_anomaly_score) + self.raw_anomaly_score = np.append( + np.full(result_shape, -1), self.raw_anomaly_score + ) + + def skip(self, X_train, X_test, y_test): + if X_train.shape[0] < self.window_size: + return True, "Not enough samples to create a window" + return False, None + + def get_result(self): + # Anomaly : 1 + # Inlier : 0 + # To ignore : -1 + self.y_hat = self.raw_y_hat + return dict(y_hat=self.y_hat) diff --git a/solvers/legacy/isolation-forest.py b/solvers/legacy/isolation-forest.py new file mode 100644 index 0000000..58910d0 --- /dev/null +++ b/solvers/legacy/isolation-forest.py @@ -0,0 +1,110 @@ +# Isolation Forest solver + +from benchopt import BaseSolver +from benchopt import safe_import_context + +with safe_import_context() as import_ctx: + from sklearn.ensemble import IsolationForest + import numpy as np + + +class Solver(BaseSolver): + name = "IsolationForest" + + install_cmd = "conda" + requirements = ["scikit-learn"] + + parameters = { + "contamination": [5e-4, 5e-3, 5e-2, 0.1, 0.2, 0.4, 0.5], + "window": [True], + "window_size": [60, 120, 180], + "stride": [1], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + self.X_train = X_train + self.X_test, self.y_test = X_test, y_test + n_recordings, n_features, n_samples = self.X_train.shape + self.clf = IsolationForest(contamination=self.contamination) + + def run(self, _): + if self.window: + # We need to transform the data to have a rolling window + if self.X_train is not None: + # Apply sliding window along the time dimension (axis=2) + n_recordings, n_features, n_samples = self.X_train.shape + self.Xw_train = np.lib.stride_tricks.sliding_window_view( + self.X_train, window_shape=self.window_size, axis=2 + )[:, :, ::self.stride].transpose(0, 1, 3, 2) + + if self.X_test is not None: + n_recordings, n_features, n_samples = self.X_test.shape + self.Xw_test = np.lib.stride_tricks.sliding_window_view( + self.X_test, window_shape=self.window_size, axis=2 + )[:, :, ::self.stride].transpose(0, 1, 3, 2) + + if self.y_test is not None: + n_recordings, _, n_samples = self.y_test.shape + self.yw_test = np.lib.stride_tricks.sliding_window_view( + self.y_test, window_shape=self.window_size, axis=2 + )[:, :, ::self.stride] + + # Flatten for sklearn + flatrain = self.Xw_train.reshape( + self.Xw_train.shape[0] * self.Xw_train.shape[1], -1) + flatest = self.Xw_test.reshape( + self.Xw_test.shape[0] * self.Xw_test.shape[1], -1) + + self.clf.fit(flatrain) + raw_y_hat = self.clf.predict(flatest) + raw_anomaly_score = self.clf.decision_function(flatest) + + # The results we get has a shape of + n_recordings, n_features, n_windows, _ = self.Xw_test.shape + + # Mapping the binary output from {-1, 1} to {1, 0} + # For consistency with the other solvers + self.raw_y_hat = np.array(raw_y_hat) + self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) + + # Reshape back to original structure + self.raw_y_hat = self.raw_y_hat.reshape( + n_recordings, n_features, n_windows) + + # Anomaly scores (Not used but allows finer thresholding) + self.raw_anomaly_score = np.array(raw_anomaly_score) + self.raw_anomaly_score = self.raw_anomaly_score.reshape( + n_recordings, n_features, n_windows) + else: + # No windowing case + # Flatten the data for sklearn + n_recordings, n_features, n_samples = self.X_train.shape + X_train_flat = self.X_train.reshape(-1, n_features) + X_test_flat = self.X_test.reshape(-1, n_features) + + self.clf.fit(X_train_flat) + self.raw_y_hat = self.clf.predict(X_test_flat) + self.raw_anomaly_score = self.clf.decision_function(X_test_flat) + + # Reshape back to (n_recordings, n_samples) for single feature case + # For now, assume we take the first feature or average across features + self.raw_y_hat = self.raw_y_hat.reshape(n_recordings, n_samples) + self.raw_anomaly_score = self.raw_anomaly_score.reshape( + n_recordings, n_samples) + + def skip(self, X_train, X_test, y_test): + # Skip if dataset size is smaller than window size + _, _, n_samples = X_train.shape + if n_samples < self.window_size: + return True, "Window size is larger than dataset size. Skipping." + return False, None + + def get_result(self): + # Anomaly : 1 + # Inlier : 0 + # To ignore : -1 + # For now, take the first recording + self.y_hat = self.raw_y_hat[0] if self.raw_y_hat.ndim > 1 else self.raw_y_hat + return dict(y_hat=self.y_hat) diff --git a/solvers/legacy/lof.py b/solvers/legacy/lof.py new file mode 100644 index 0000000..1ce2058 --- /dev/null +++ b/solvers/legacy/lof.py @@ -0,0 +1,98 @@ +# Local Outlier Factor + +from benchopt import BaseSolver +from benchopt import safe_import_context + +with safe_import_context() as import_ctx: + from sklearn.neighbors import LocalOutlierFactor + import numpy as np + + +class Solver(BaseSolver): + name = "LocalOutlierFactor" + + install_cmd = "conda" + requirements = ["scikit-learn"] + + parameters = { + "contamination": [0.1, 0.2, 0.3], + "n_neighbors": [5, 10, 20, 25, 40], + "window": [True], + "window_size": [20], + "stride": [1], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + self.X_train = X_train + self.X_test, self.y_test = X_test, y_test + self.clf = LocalOutlierFactor( + novelty=True, + n_neighbors=self.n_neighbors, + contamination=self.contamination, + ) + + def run(self, _): + if self.window: + # We need to transform the data to have a rolling window + if self.X_train is not None: + self.Xw_train = np.lib.stride_tricks.sliding_window_view( + self.X_train, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + if self.X_test is not None: + self.Xw_test = np.lib.stride_tricks.sliding_window_view( + self.X_test, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + if self.y_test is not None: + self.yw_test = np.lib.stride_tricks.sliding_window_view( + self.y_test, window_shape=self.window_size, axis=0 + )[::self.stride] + + flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) + flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) + + self.clf.fit(flatrain) + raw_y_hat = self.clf.predict(flatest) + raw_anomaly_score = self.clf.decision_function(flatest) + + # The results we get has a shape of + result_shape = ( + (self.X_train.shape[0] - self.window_size) // self.stride + ) + 1 + + # Mapping the binary output from {-1, 1} to {1, 0} + # For consistency with the other solvers + self.raw_y_hat = np.array(raw_y_hat) + self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) + + # Adding -1 for the non predicted samples + # The first window_size samples are not predicted by the model + self.raw_y_hat = np.append( + np.full(self.X_train.shape[0] - + result_shape, -1), self.raw_y_hat + ) + + # Anomaly scores (Not used but allows finer thresholding) + self.raw_anomaly_score = np.array(raw_anomaly_score) + self.raw_anomaly_score = np.append( + np.full(result_shape, -1), self.raw_anomaly_score + ) + + def skip(self, X_train, y_test, X_test): + if self.n_neighbors > self.window_size: + return True, "Number of neighbors greater than number of samples." + if self.n_neighbors > X_train.shape[0]: + return True, "Number of neighbors greater than number of samples." + if X_train.shape[0] < self.window_size: + return True, "No enough samples to create a window" + return False, None + + def get_result(self): + # Anomaly : 1 + # Inlier : 0 + # To ignore : -1 + self.y_hat = self.raw_y_hat + return dict(y_hat=self.y_hat) diff --git a/solvers/legacy/ocsvm.py b/solvers/legacy/ocsvm.py new file mode 100644 index 0000000..268e57c --- /dev/null +++ b/solvers/legacy/ocsvm.py @@ -0,0 +1,88 @@ +from benchopt import BaseSolver, safe_import_context + +with safe_import_context() as import_ctx: + from sklearn.svm import OneClassSVM + import numpy as np + + +class Solver(BaseSolver): + name = "OCSVM" + + install_cmd = "conda" + requirements = ["scikit-learn"] + + parameters = { + "nu": [0.001, 0.01, 0.05], + "gamma": [1e-5, 1e-2], + "kernel": ["rbf"], + "window": [True], + "window_size": [128], + "stride": [1], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + self.X_train = X_train + self.X_test, self.y_test = X_test, y_test + self.clf = OneClassSVM( + nu=self.nu, + kernel=self.kernel, + gamma=self.gamma, + ) + + if self.window: + if self.X_train is not None: + self.Xw_train = np.lib.stride_tricks.sliding_window_view( + self.X_train, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + if self.X_test is not None: + self.Xw_test = np.lib.stride_tricks.sliding_window_view( + self.X_test, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + if self.y_test is not None: + self.yw_test = np.lib.stride_tricks.sliding_window_view( + self.y_test, window_shape=self.window_size, axis=0 + )[::self.stride] + + self.flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) + self.flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) + + def run(self, _): + if self.window: + self.clf.fit(self.flatrain) + raw_y_hat = self.clf.predict(self.flatest) + raw_anomaly_score = self.clf.decision_function(self.flatest) + + # The results we get has a shape of + result_shape = ( + (self.X_train.shape[0] - self.window_size) // self.stride + ) + 1 + + # Mapping the binary output from {-1, 1} to {1, 0} + # For consistency with the other solvers + self.raw_y_hat = np.array(raw_y_hat) + + # Adding -1 for the non predicted samples + # The first window_size samples are not predicted by the model + self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) + self.raw_y_hat = np.append( + np.full(self.X_train.shape[0] - + result_shape, -1), self.raw_y_hat + ) + + # Anomaly scores (Not used but allows finer thresholding) + self.raw_anomaly_score = np.array(raw_anomaly_score) + self.raw_anomaly_score = np.append( + np.full(result_shape, -1), self.raw_anomaly_score + ) + + def skip(self, X_train, X_test, y_test): + if X_train.shape[0] < self.window_size: + return True, "Window size is larger than dataset size." + return False, None + + def get_result(self): + return dict(y_hat=self.raw_y_hat) From ff9e80a16c999bc51f84cbda05c19a032401566a Mon Sep 17 00:00:00 2001 From: jadyehya Date: Mon, 22 Sep 2025 09:39:42 +0200 Subject: [PATCH 19/50] delete duplicate solvers --- solvers/abod.py | 99 ------------------------------------- solvers/cblof.py | 97 ------------------------------------ solvers/dif.py | 93 ---------------------------------- solvers/isolation-forest.py | 90 --------------------------------- solvers/lof.py | 98 ------------------------------------ solvers/ocsvm.py | 88 --------------------------------- solvers/sktime_lof.py | 56 --------------------- 7 files changed, 621 deletions(-) delete mode 100644 solvers/abod.py delete mode 100644 solvers/cblof.py delete mode 100644 solvers/dif.py delete mode 100644 solvers/isolation-forest.py delete mode 100644 solvers/lof.py delete mode 100644 solvers/ocsvm.py delete mode 100644 solvers/sktime_lof.py diff --git a/solvers/abod.py b/solvers/abod.py deleted file mode 100644 index 6ff02ae..0000000 --- a/solvers/abod.py +++ /dev/null @@ -1,99 +0,0 @@ -# ABOD solver - -from benchopt import BaseSolver -from benchopt import safe_import_context - -with safe_import_context() as import_ctx: - from pyod.models.abod import ABOD - import numpy as np - - -class Solver(BaseSolver): - name = "ABOD" # Angle-Based Outlier Detection - - install_cmd = "conda" - requirements = ["pip:pyod"] - - parameters = { - "contamination": [5e-4, 0.1, 0.2, 0.3], - "n_neighbors": [5, 10, 15, 20, 30], - "window": [True], - "window_size": [20], - "stride": [1], - } - - sampling_strategy = "run_once" - - def set_objective(self, X_train, y_test, X_test): - self.X_train = X_train - self.X_test, self.y_test = X_test, y_test - self.clf = ABOD( - n_neighbors=self.n_neighbors, - contamination=self.contamination, - method="fast" - ) - - def run(self, _): - # Using only windowed data, parameter used only for consistency - if self.window: - - # Transofrming the data into rolling windowed data - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - self.X_train, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - self.X_test, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - # Flattening the data for the model - flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) - flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) - - self.clf.fit(flatrain) - - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat - ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score - ) - - # Function used to skip a solver call when n_neighbors >= window_size - def skip(self, X_train, X_test, y_test): - if self.n_neighbors >= self.window_size: - return True, "Number of neighbors greater than number of samples." - return False, None - - def get_result(self): - # Anomaly : 1 - # Inlier : 0 - # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) diff --git a/solvers/cblof.py b/solvers/cblof.py deleted file mode 100644 index 3e44432..0000000 --- a/solvers/cblof.py +++ /dev/null @@ -1,97 +0,0 @@ -# Cluster Based Local Outlier Factor (CBLOF) solver - -from benchopt import BaseSolver -from benchopt import safe_import_context - -with safe_import_context() as import_ctx: - from pyod.models.cblof import CBLOF - import numpy as np - - -class Solver(BaseSolver): - name = "CBLOF" - - install_cmd = "conda" - requirements = ["pip:pyod"] - - parameters = { - "contamination": [5e-4, 0.01, 0.02, 0.03, 0.04], - "window": [True], - "n_clusters": [10], - "window_size": [20], - "stride": [1], - } - - sampling_strategy = "run_once" - - def set_objective(self, X_train, y_test, X_test): - self.X_train = X_train - self.X_test, self.y_test = X_test, y_test - self.clf = CBLOF( - contamination=self.contamination, - n_clusters=self.n_clusters - ) - - def run(self, _): - # Using only windowed data, parameter used only for consistency - if self.window: - - # We need to transform the data to have a rolling window - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - self.X_train, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - self.X_test, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - # Flattening the data for the model - flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) - flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) - - self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat - ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score - ) - - # Skipping the solver call if a condition is met - def skip(self, X_train, X_test, y_test): - if X_train.shape[0] < self.window_size: - return True, "No enough samples to create a window" - return False, None - - def get_result(self): - # Anomaly : 1 - # Inlier : 0 - # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) diff --git a/solvers/dif.py b/solvers/dif.py deleted file mode 100644 index 6aeef8e..0000000 --- a/solvers/dif.py +++ /dev/null @@ -1,93 +0,0 @@ -# Deep Isolation Forest -from benchopt import BaseSolver -from benchopt import safe_import_context - -with safe_import_context() as import_ctx: - from pyod.models.dif import DIF - import numpy as np - - -class Solver(BaseSolver): - name = "DIF" - - install_cmd = "conda" - requirements = ["pip:pyod"] - - parameters = { - "contamination": [0.05, 0.1, 0.2], - "window": [True], - "window_size": [20], - "stride": [1], - } - - sampling_strategy = "run_once" - - def set_objective(self, X_train, y_test, X_test): - self.X_train = X_train - self.X_test, self.y_test = X_test, y_test - # Device is automatically selected by the model - # if device=None - self.clf = DIF(contamination=self.contamination, device=None) - - def run(self, _): - # Using only windowed data, parameter used only for consistency - if self.window: - - # Transofrming the data into rolling windowed data - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - self.X_train, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - self.X_test, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - # Flattening the data for the model - flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) - flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) - - self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat - ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score - ) - - def skip(self, X_train, X_test, y_test): - if X_train.shape[0] < self.window_size: - return True, "Not enough samples to create a window" - return False, None - - def get_result(self): - # Anomaly : 1 - # Inlier : 0 - # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) diff --git a/solvers/isolation-forest.py b/solvers/isolation-forest.py deleted file mode 100644 index dac03e3..0000000 --- a/solvers/isolation-forest.py +++ /dev/null @@ -1,90 +0,0 @@ -# Isolation Forest solver - -from benchopt import BaseSolver -from benchopt import safe_import_context - -with safe_import_context() as import_ctx: - from sklearn.ensemble import IsolationForest - import numpy as np - - -class Solver(BaseSolver): - name = "IsolationForest" - - install_cmd = "conda" - requirements = ["scikit-learn"] - - parameters = { - "contamination": [5e-4, 5e-3, 5e-2, 0.1, 0.2, 0.4, 0.5], - "window": [True], - "window_size": [60, 120, 180], - "stride": [1], - } - - sampling_strategy = "run_once" - - def set_objective(self, X_train, y_test, X_test): - self.X_train = X_train - self.X_test, self.y_test = X_test, y_test - self.clf = IsolationForest(contamination=self.contamination) - - def run(self, _): - if self.window: - # We need to transform the data to have a rolling window - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - self.X_train, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - self.X_test, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) - flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) - - self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat - ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score - ) - - def skip(self, X_train, X_test, y_test): - # Skip if dataset size is smaller than window size - if X_train.shape[0] < self.window_size: - return True, "Window size is larger than dataset size. Skipping." - return False, None - - def get_result(self): - # Anomaly : 1 - # Inlier : 0 - # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) diff --git a/solvers/lof.py b/solvers/lof.py deleted file mode 100644 index 1ce2058..0000000 --- a/solvers/lof.py +++ /dev/null @@ -1,98 +0,0 @@ -# Local Outlier Factor - -from benchopt import BaseSolver -from benchopt import safe_import_context - -with safe_import_context() as import_ctx: - from sklearn.neighbors import LocalOutlierFactor - import numpy as np - - -class Solver(BaseSolver): - name = "LocalOutlierFactor" - - install_cmd = "conda" - requirements = ["scikit-learn"] - - parameters = { - "contamination": [0.1, 0.2, 0.3], - "n_neighbors": [5, 10, 20, 25, 40], - "window": [True], - "window_size": [20], - "stride": [1], - } - - sampling_strategy = "run_once" - - def set_objective(self, X_train, y_test, X_test): - self.X_train = X_train - self.X_test, self.y_test = X_test, y_test - self.clf = LocalOutlierFactor( - novelty=True, - n_neighbors=self.n_neighbors, - contamination=self.contamination, - ) - - def run(self, _): - if self.window: - # We need to transform the data to have a rolling window - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - self.X_train, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - self.X_test, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) - flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) - - self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat - ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score - ) - - def skip(self, X_train, y_test, X_test): - if self.n_neighbors > self.window_size: - return True, "Number of neighbors greater than number of samples." - if self.n_neighbors > X_train.shape[0]: - return True, "Number of neighbors greater than number of samples." - if X_train.shape[0] < self.window_size: - return True, "No enough samples to create a window" - return False, None - - def get_result(self): - # Anomaly : 1 - # Inlier : 0 - # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) diff --git a/solvers/ocsvm.py b/solvers/ocsvm.py deleted file mode 100644 index 268e57c..0000000 --- a/solvers/ocsvm.py +++ /dev/null @@ -1,88 +0,0 @@ -from benchopt import BaseSolver, safe_import_context - -with safe_import_context() as import_ctx: - from sklearn.svm import OneClassSVM - import numpy as np - - -class Solver(BaseSolver): - name = "OCSVM" - - install_cmd = "conda" - requirements = ["scikit-learn"] - - parameters = { - "nu": [0.001, 0.01, 0.05], - "gamma": [1e-5, 1e-2], - "kernel": ["rbf"], - "window": [True], - "window_size": [128], - "stride": [1], - } - - sampling_strategy = "run_once" - - def set_objective(self, X_train, y_test, X_test): - self.X_train = X_train - self.X_test, self.y_test = X_test, y_test - self.clf = OneClassSVM( - nu=self.nu, - kernel=self.kernel, - gamma=self.gamma, - ) - - if self.window: - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - self.X_train, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - self.X_test, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - self.flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) - self.flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) - - def run(self, _): - if self.window: - self.clf.fit(self.flatrain) - raw_y_hat = self.clf.predict(self.flatest) - raw_anomaly_score = self.clf.decision_function(self.flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat - ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score - ) - - def skip(self, X_train, X_test, y_test): - if X_train.shape[0] < self.window_size: - return True, "Window size is larger than dataset size." - return False, None - - def get_result(self): - return dict(y_hat=self.raw_y_hat) diff --git a/solvers/sktime_lof.py b/solvers/sktime_lof.py deleted file mode 100644 index 31e1c94..0000000 --- a/solvers/sktime_lof.py +++ /dev/null @@ -1,56 +0,0 @@ -from benchopt import BaseSolver -from benchopt import safe_import_context - -with safe_import_context() as import_ctx: - from sktime.annotation.lof import SubLOF - import pandas as pd - import numpy as np - - -class Solver(BaseSolver): - name = "SubLOF" - - install_cmd = "conda" - requirements = ["sktime", "pandas"] - - parameters = { - "n_neighbors": [5, 10, 20, 25, 40], - "window_size": [20, 64, 128], - "leaf_size": [30, 40], - "contamination": ["auto", 0.1, 0.2, 0.3], - } - - sampling_strategy = "run_once" - - def set_objective(self, X_train, y_test, X_test): - self.X_train = pd.DataFrame(X_train) - self.X_test, self.y_test = pd.DataFrame(X_test), y_test - self.clf = SubLOF( - n_neighbors=self.n_neighbors, - window_size=self.window_size, - leaf_size=self.leaf_size, - contamination=self.contamination, - n_jobs=-1, - novelty=True, - ) - - def run(self, _): - self.clf.fit(self.X_train) - self.raw_y_hat = self.clf.predict(self.X_test) - # self.raw_anomaly_score = self.clf.predict_score(self.X_test) - - def skip(self, X_train, y_test, X_test): - if self.n_neighbors > self.window_size: - return True, "Number of neighbors greater than window size" - if self.n_neighbors > X_train.shape[0]: - return True, "Number of neighbors greater than number of samples" - if self.leaf_size > X_train.shape[0]: - return True, "Leaf size greater than number of samples" - if self.window_size > X_train.shape[0]: - return True, "Window size greater than number of samples" - return False, None - - def get_result(self): - self.y_hat = np.zeros(self.X_test.shape[0]) - self.y_hat[self.raw_y_hat] = 1 - return dict(y_hat=self.y_hat) From 92269052583d9e50ebe011ac2d128c70e013fda6 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Thu, 4 Dec 2025 16:22:08 -0800 Subject: [PATCH 20/50] RFC dataset loading and reshaping for anomaly detection --- datasets/daphnet.py | 50 ++++++-- datasets/dodgers.py | 23 ++-- datasets/ecg.py | 25 +++- datasets/genesis.py | 18 +-- datasets/ghl.py | 18 +-- datasets/iops.py | 18 +-- datasets/kdd21.py | 19 +-- datasets/mgab.py | 18 +-- datasets/mitdb.py | 59 ++++----- datasets/msl.py | 7 ++ datasets/nab.py | 18 +-- datasets/occupancy.py | 22 ++-- datasets/opportunity.py | 18 +-- datasets/pattern.py | 66 +++++++++++ datasets/sensorscope.py | 18 +-- datasets/simulated.py | 8 +- datasets/smap.py | 7 ++ datasets/smd.py | 10 +- datasets/svdb.py | 84 ++++++++----- datasets/swat.py | 7 ++ datasets/trend.py | 83 +++++++++++++ datasets/wadi.py | 7 ++ datasets/yahoo.py | 20 ++-- objective.py | 29 ++++- solvers/anomalybert.py | 250 +++++++++++++++++++++++++++++++++++++++ solvers/autoencoder.py | 21 ++-- solvers/dagmm.py | 95 +++++++++++++++ solvers/matrixprofile.py | 44 +++---- solvers/rosecdl.py | 188 +++++++++++++++++++++++++++-- solvers/tsb_chronos.py | 10 +- 30 files changed, 1048 insertions(+), 212 deletions(-) create mode 100644 datasets/pattern.py create mode 100644 datasets/trend.py create mode 100644 solvers/anomalybert.py create mode 100644 solvers/dagmm.py diff --git a/datasets/daphnet.py b/datasets/daphnet.py index 9e4ce3e..25f4285 100644 --- a/datasets/daphnet.py +++ b/datasets/daphnet.py @@ -4,11 +4,12 @@ from pathlib import Path import numpy as np import pandas as pd + import matplotlib.pyplot as plt PATH = config.get_data_path("DAPHNET") -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False, number=-1): """ Load data from the database path for specified record IDs. @@ -16,6 +17,7 @@ def load_data(db_path, record_ids=None): db_path: Path to the database directory record_ids: List of record IDs to load. If None, loads all available records. + verbose: If True, print loading progress information. Returns: tuple: (X, y_true) where: @@ -24,10 +26,16 @@ def load_data(db_path, record_ids=None): """ db_path = Path(db_path) + if record_ids is not None and number > 0: + print("Warning: 'number' parameter is ignored when 'record_ids' is provided.") + if record_ids is None: # Get all available record files with .test.csv@X.out pattern record_files = list(db_path.glob("*.test.csv@*.out")) - record_ids = [f.name for f in record_files] + record_ids = [f.name.split(".")[0] for f in record_files] + if number > 0: + record_ids = record_ids[:number] + data_list = [] labels_list = [] @@ -36,11 +44,13 @@ def load_data(db_path, record_ids=None): record_files = list(db_path.glob(f"{record_id}.test.csv@*.out")) if not record_files: - print(f"No record files found for ID: {record_id}") + if verbose: + print(f"No record files found for ID: {record_id}") continue for record_file in record_files: - print(f"Loading record file: {record_file}") + if verbose: + print(f"Loading record file: {record_file}") # Load the record data record_data = pd.read_csv( record_file, header=None).dropna().to_numpy() @@ -49,8 +59,9 @@ def load_data(db_path, record_ids=None): data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print( - f"Insufficient columns for record file {record_file.name}") + if verbose: + print( + f"Insufficient columns for record file {record_file.name}") if not data_list: raise ValueError("No valid data found") @@ -91,7 +102,9 @@ class Dataset(BaseDataset): name = "DAPHNET" parameters = { - "recordings_id": [["S01R02E0"]], + # "recordings_id": [["S01R02E0"]], + "recordings_id": [None], # [["S01R02E0"]], + "number": [-1], "debug": [False], } @@ -100,7 +113,9 @@ def get_data(self): # X shape (n_recordings, n_samples) # y shape (n_recordings, n_samples) - X, y_true = load_data(PATH, self.recordings_id) + if self.recordings_id in (["all"], "all"): + self.recordings_id = None + X, y_true = load_data(PATH, self.recordings_id, number=self.number) X_test = X.copy() y_test = y_true.copy() @@ -112,10 +127,21 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + plt.figure(figsize=(6, 3)) + plt.plot(X_train[0, 0, :500], linewidth=1.2) + plt.plot(range(297, 305), X_train[0, 0, 297:305], color="orange", linewidth=3) + plt.title("Daphnet dataset") + plt.tight_layout() + plt.savefig("daphnet_example.png") + plt.close() + + print("PLOT SAVED") return dict( X_train=X_train, diff --git a/datasets/dodgers.py b/datasets/dodgers.py index 0418e89..c3c6e02 100644 --- a/datasets/dodgers.py +++ b/datasets/dodgers.py @@ -8,7 +8,7 @@ PATH = config.get_data_path("DODGERS") -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. @@ -16,6 +16,7 @@ def load_data(db_path, record_ids=None): db_path: Path to the database directory record_ids: List of record IDs to load. If None, loads all available records. + verbose: If True, print loading progress information. Returns: tuple: (X, y_true) where: @@ -46,9 +47,11 @@ def load_data(db_path, record_ids=None): data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for record {record_id}") + if verbose: + print(f"Insufficient columns for record {record_id}") else: - print(f"Record file not found: {record_file}") + if verbose: + print(f"Record file not found: {record_file}") if not data_list: raise ValueError("No valid data found") @@ -89,7 +92,8 @@ class Dataset(BaseDataset): name = "DODGERS" parameters = { - "recordings_id": [["101"]], + # "recordings_id": [["101"]], + "recordings_id": [None], "debug": [False], } @@ -98,6 +102,8 @@ def get_data(self): # X shape (n_recordings, n_samples) # y shape (n_recordings, n_samples) + if self.recordings_id in (["all"], "all"): + self.recordings_id = None X, y_true = load_data(PATH, self.recordings_id) X_test = X.copy() @@ -110,10 +116,11 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) return dict( X_train=X_train, diff --git a/datasets/ecg.py b/datasets/ecg.py index 743d5e2..38e147c 100644 --- a/datasets/ecg.py +++ b/datasets/ecg.py @@ -5,11 +5,10 @@ import numpy as np import pandas as pd - # PATH = config.get_data_path("ECG") - PATH = "/data/parietal/store2/data/tsb-uad/TSB-UAD-Public/ECG" + PATH = config.get_data_path("ECG") -def load_data(db_path, record_ids=None, verbose=False): +def load_data(db_path, record_ids=None, verbose=False, number=-1): """ Load data from the database path for specified record IDs. @@ -26,15 +25,27 @@ def load_data(db_path, record_ids=None, verbose=False): """ db_path = Path(db_path) + if record_ids is not None and number > 0: + print("Warning: 'number' parameter is ignored when 'record_ids' is provided.") + if record_ids is None: # Get all available record files record_files = list(db_path.glob("*.out")) record_ids = [f.stem for f in record_files] + if "MBA_ECG14046_data" in record_ids: + record_ids.remove("MBA_ECG14046_data") + if verbose: + print("Removed MBA_ECG14046_data from records due to issues") + + if number > 0: + record_ids = record_ids[:number] + print(record_ids) + data_list = [] labels_list = [] for record_id in record_ids: - record_file = db_path / f"MBA_ECG14046_data_{record_id}.out" + record_file = db_path / f"{record_id}.out" if record_file.exists(): # Load the record data record_data = pd.read_csv( @@ -87,14 +98,16 @@ class Dataset(BaseDataset): parameters = { "recordings_id": [["1", "2"]], "debug": [False], + "number": [-1], } def get_data(self): """Load the MITDB dataset.""" - # X shape (n_recordings, n_samples) # y shape (n_recordings, n_samples) - X, y_true = load_data(PATH, self.recordings_id) + if self.recordings_id in (["all"], "all"): + self.recordings_id = None + X, y_true = load_data(PATH, self.recordings_id, number=self.number) X_test = X.copy() y_test = y_true.copy() diff --git a/datasets/genesis.py b/datasets/genesis.py index 696f266..4e3f00d 100644 --- a/datasets/genesis.py +++ b/datasets/genesis.py @@ -8,7 +8,7 @@ PATH = config.get_data_path("GENESIS") -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. @@ -16,6 +16,7 @@ def load_data(db_path, record_ids=None): db_path: Path to the database directory record_ids: List of record IDs to load. If None, loads all available records. + verbose: If True, print loading progress information. Returns: tuple: (X, y_true) where: @@ -47,9 +48,11 @@ def load_data(db_path, record_ids=None): data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for record {record_id}") + if verbose: + print(f"Insufficient columns for record {record_id}") else: - print(f"Record file not found: {record_file}") + if verbose: + print(f"Record file not found: {record_file}") if not data_list: raise ValueError("No valid data found") @@ -111,10 +114,11 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) return dict( X_train=X_train, diff --git a/datasets/ghl.py b/datasets/ghl.py index 074862d..3da6f93 100644 --- a/datasets/ghl.py +++ b/datasets/ghl.py @@ -8,7 +8,7 @@ PATH = config.get_data_path("GHL") -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. @@ -16,6 +16,7 @@ def load_data(db_path, record_ids=None): db_path: Path to the database directory record_ids: List of record IDs to load. If None, loads all available records. + verbose: If True, print loading progress information. Returns: tuple: (X, y_true) where: @@ -51,9 +52,11 @@ def load_data(db_path, record_ids=None): data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for record {record_id}") + if verbose: + print(f"Insufficient columns for record {record_id}") else: - print(f"Record file not found: {record_file}") + if verbose: + print(f"Record file not found: {record_file}") if not data_list: raise ValueError("No valid data found") @@ -115,10 +118,11 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) return dict( X_train=X_train, diff --git a/datasets/iops.py b/datasets/iops.py index d15603d..7efcb1e 100644 --- a/datasets/iops.py +++ b/datasets/iops.py @@ -9,12 +9,13 @@ PATH = "/data/parietal/store2/data/tsb-uad/TSB-UAD-Public/IOPS/" -def load_data(db_path): +def load_data(db_path, verbose=False): """ Load train and test data from the database path. Args: db_path: Path to the database directory + verbose: If True, print loading progress information. Returns: tuple: (X_train, X_test, y_test) where: @@ -38,7 +39,8 @@ def load_data(db_path): if record_data.shape[1] >= 1: train_data_list.append(record_data[:, 0].astype(float)) else: - print(f"Insufficient columns for train file {train_file}") + if verbose: + print(f"Insufficient columns for train file {train_file}") # Load test data and labels test_data_list = [] @@ -49,7 +51,8 @@ def load_data(db_path): test_data_list.append(record_data[:, 0].astype(float)) test_labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for test file {test_file}") + if verbose: + print(f"Insufficient columns for test file {test_file}") if not train_data_list or not test_data_list: raise ValueError("No valid data found") @@ -124,10 +127,11 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) return dict( X_train=X_train, diff --git a/datasets/kdd21.py b/datasets/kdd21.py index 5fc690e..6e810f1 100644 --- a/datasets/kdd21.py +++ b/datasets/kdd21.py @@ -7,8 +7,7 @@ PATH = config.get_data_path("KDD21") - -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. @@ -16,6 +15,7 @@ def load_data(db_path, record_ids=None): db_path: Path to the database directory record_ids: List of record IDs to load. If None, loads all available records. + verbose: If True, print loading progress information. Returns: tuple: (X, y_true) where: @@ -45,9 +45,11 @@ def load_data(db_path, record_ids=None): data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for record {record_id}") + if verbose: + print(f"Insufficient columns for record {record_id}") else: - print(f"Record file not found for ID: {record_id}") + if verbose: + print(f"Record file not found for ID: {record_id}") if not data_list: raise ValueError("No valid data found") @@ -109,10 +111,11 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) return dict( X_train=X_train, diff --git a/datasets/mgab.py b/datasets/mgab.py index cfe610c..7006bbe 100644 --- a/datasets/mgab.py +++ b/datasets/mgab.py @@ -8,7 +8,7 @@ PATH = config.get_data_path("MGAB") -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. @@ -16,6 +16,7 @@ def load_data(db_path, record_ids=None): db_path: Path to the database directory record_ids: List of record IDs to load. If None, loads all available records. + verbose: If True, print loading progress information. Returns: tuple: (X, y_true) where: @@ -42,9 +43,11 @@ def load_data(db_path, record_ids=None): data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for record {record_id}") + if verbose: + print(f"Insufficient columns for record {record_id}") else: - print(f"Record file not found: {record_file}") + if verbose: + print(f"Record file not found: {record_file}") if not data_list: raise ValueError("No valid data found") @@ -95,6 +98,7 @@ def get_data(self): # X shape (n_recordings, n_samples) # y shape (n_recordings, n_samples) X, y_true = load_data(PATH, self.recordings_id) + n_recordings, _ = X.shape X_test = X.copy() y_test = y_true.copy() @@ -106,10 +110,10 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) return dict( X_train=X_train, diff --git a/datasets/mitdb.py b/datasets/mitdb.py index e9b03bc..c2637ba 100644 --- a/datasets/mitdb.py +++ b/datasets/mitdb.py @@ -8,7 +8,7 @@ PATH = config.get_data_path("MITDB") -def load_mitdb_data(db_path, record_ids=None): +def load_mitdb_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. @@ -27,9 +27,10 @@ def load_mitdb_data(db_path, record_ids=None): if record_ids is None: # Get all available record files with format like 100.test.csv@1.out record_files = list(db_path.glob("*.out")) - record_ids = [f.name for f in record_files] + record_ids = [str(f.name).split(".")[0] for f in record_files] - print(f"Loading records: {record_ids}") + if verbose: + print(f"Loading records: {record_ids}") data_list = [] labels_list = [] @@ -38,24 +39,30 @@ def load_mitdb_data(db_path, record_ids=None): record_files = list(db_path.glob(f"{record_id}*.out")) if record_files: if len(record_files) > 1: - print( - f"Multiple files found for record ID {record_id}, " - f"using the first one: {record_files[0]}" - ) + if verbose: + print( + f"Multiple files found for record ID {record_id}, " + f"using the first one: {record_files[0]}" + ) record_file = record_files[0] # Load the record data record_data = pd.read_csv( - record_file, header=None).dropna().to_numpy() + db_path / record_file, header=None).dropna().to_numpy() # Assuming first column is the data, second column is labels - print(f"Loaded record {record_id} with shape {record_data.shape}") + if verbose: + print( + f"Loaded record {record_id} with shape {record_data.shape}") if record_data.shape[1] >= 2: - print(f"Record {record_id} has sufficient columns") + if verbose: + print(f"Record {record_id} has sufficient columns") data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for record {record_id}") + if verbose: + print(f"Insufficient columns for record {record_id}") else: - print(f"Record file not found for ID: {record_id}") + if verbose: + print(f"Record file not found for ID: {db_path / record_id}") if not data_list: raise ValueError("No valid data found") @@ -96,7 +103,7 @@ class Dataset(BaseDataset): name = "MITDB" parameters = { - "recordings_id": [["100", "201"], ["100"]], + "recordings_id": [["100", "201", "109", "105", "111", "221"]], "debug": [False], } @@ -105,6 +112,8 @@ def get_data(self): # X shape (n_recordings, n_samples) # y shape (n_recordings, n_samples) + if self.recordings_id in (["all"], "all"): + self.recordings_id = None X, y_true = load_mitdb_data(PATH, self.recordings_id) X_test = X.copy() @@ -113,18 +122,14 @@ def get_data(self): X_train = X[:, : int(X.shape[1] * 0.1)] if self.debug: - print("Debug mode: limiting data to 1000 samples") - X_train = X_train[:, :1000] - X_test = X_test[:, :1000] - y_test = y_test[:, :1000] - - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) - - print( - f"X_train shape: {X_train.shape}, " - f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}" - ) + X_train = X_train[:, -2000:] + X_test = X_test[:, -2000:] + y_test = y_test[:, -2000:] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + return dict(X_train=X_train, y_test=y_test, X_test=X_test) diff --git a/datasets/msl.py b/datasets/msl.py index fe177ba..db73346 100644 --- a/datasets/msl.py +++ b/datasets/msl.py @@ -58,6 +58,13 @@ def get_data(self): X_test = X_test[:1000] y_test = y_test[:1000] + # Reshaping data to (n_recordings, n_features, n_samples) + # For MSL, treat as single recording + n_features = X_train.shape[1] + X_train = X_train.T.reshape(1, n_features, -1) + X_test = X_test.T.reshape(1, n_features, -1) + y_test = y_test.reshape(1, -1) + print(X_train.shape, X_test.shape, y_test.shape) return dict( diff --git a/datasets/nab.py b/datasets/nab.py index bba2f90..afa7612 100644 --- a/datasets/nab.py +++ b/datasets/nab.py @@ -8,7 +8,7 @@ PATH = config.get_data_path("NAB") -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. @@ -16,6 +16,7 @@ def load_data(db_path, record_ids=None): db_path: Path to the database directory record_ids: List of record IDs to load. If None, loads all available records. + verbose: If True, print loading progress information. Returns: tuple: (X, y_true) where: @@ -44,9 +45,11 @@ def load_data(db_path, record_ids=None): data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for record {record_id}") + if verbose: + print(f"Insufficient columns for record {record_id}") else: - print(f"Record file not found for: {record_id}") + if verbose: + print(f"Record file not found for: {record_id}") if not data_list: raise ValueError("No valid data found") @@ -108,10 +111,11 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) return dict( X_train=X_train, diff --git a/datasets/occupancy.py b/datasets/occupancy.py index 1a6f6fd..561aafd 100644 --- a/datasets/occupancy.py +++ b/datasets/occupancy.py @@ -8,13 +8,14 @@ PATH = config.get_data_path("OCCUPANCY") -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. Args: db_path: Path to the database directory record_ids: List of record IDs to load for testing. + verbose: If True, print loading progress information. Returns: tuple: (X_train, X_test, y_test) where: @@ -26,7 +27,8 @@ def load_data(db_path, record_ids=None): # Load training data train_files = sorted(list(db_path.glob("room-occupancy.train.csv@*.out"))) - print(train_files) + if verbose: + print(train_files) if not train_files: raise FileNotFoundError("No training files found.") train_data_list = [ @@ -52,7 +54,8 @@ def load_data(db_path, record_ids=None): list(db_path.glob(f"room-occupancy-{record_id}.test.csv@*.out")) ) if not test_files: - print(f"No test files found for record_id {record_id}") + if verbose: + print(f"No test files found for record_id {record_id}") continue for test_file in test_files: @@ -62,7 +65,9 @@ def load_data(db_path, record_ids=None): test_data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for record file {test_file.name}") + if verbose: + print( + f"Insufficient columns for record file {test_file.name}") if not test_data_list: raise ValueError("No valid test data found") @@ -123,10 +128,11 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) return dict( X_train=X_train, diff --git a/datasets/opportunity.py b/datasets/opportunity.py index 604c4b2..248d17e 100644 --- a/datasets/opportunity.py +++ b/datasets/opportunity.py @@ -8,7 +8,7 @@ PATH = config.get_data_path("OPPORTUNITY") -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. @@ -16,6 +16,7 @@ def load_data(db_path, record_ids=None): db_path: Path to the database directory record_ids: List of record IDs to load. If None, loads all available records. + verbose: If True, print loading progress information. Returns: tuple: (X, y_true) where: @@ -47,9 +48,11 @@ def load_data(db_path, record_ids=None): data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for record {record_id}") + if verbose: + print(f"Insufficient columns for record {record_id}") else: - print(f"Record file not found for pattern: {pattern}") + if verbose: + print(f"Record file not found for pattern: {pattern}") if not data_list: raise ValueError("No valid data found") @@ -111,10 +114,11 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) return dict( X_train=X_train, diff --git a/datasets/pattern.py b/datasets/pattern.py new file mode 100644 index 0000000..47c1dba --- /dev/null +++ b/datasets/pattern.py @@ -0,0 +1,66 @@ +from benchopt import BaseDataset, safe_import_context + +with safe_import_context() as import_ctx: + import numpy as np + from rosecdl.utils.utils_signal import generate_experiment + + +class Dataset(BaseDataset): + name = "Pattern" + + parameters = { + "n_samples": [10], + "n_times": [5000], + "debug": [False], + "random_state": [42], + "n_times_atom": [250], + } + + + def get_data(self): + if self.debug: + self.n_samples = 2 + self.n_times = 1000 + + size = self.n_times // 5000 + contamination_params = { + "n_atoms": 2, + "sparsity": 3, + "init_z": "constant", + "init_z_kwargs": {"value": 50}, + } + + simulation_params = { + "n_trials": self.n_samples * 2, + "n_channels": 2, + "n_times": self.n_times, + "n_atoms": 2, + "n_times_atom": self.n_times_atom, + "n_atoms_extra": 2, # extra atoms in the learned dictionary + "D_init": "random", + "window": True, + "contamination_params": contamination_params, + "init_d": "shapes", + "init_d_kwargs": {"shapes": ["sin", "gaussian"]}, + "init_z": "constant", + "init_z_kwargs": {"value": 1}, + "noise_std": 0.01, + "rng": self.random_state, + "sparsity": 20, + } + + X, _, _, _, info_contam = generate_experiment( + simulation_params=simulation_params, + return_info_contam=True, + ) + + X_train, X_test = X[: self.n_samples], X[self.n_samples :] + y_test = info_contam["outliers_mask"][self.n_samples :] + y_test = np.any(y_test, axis=1) + + + print(f"X_train shape: {X_train.shape}") + print(f"X_test shape: {X_test.shape}") + print(f"y_test shape: {y_test.shape}") + + return dict(X_train=X_train, y_test=y_test, X_test=X_test) diff --git a/datasets/sensorscope.py b/datasets/sensorscope.py index 64b4cab..1e5370d 100644 --- a/datasets/sensorscope.py +++ b/datasets/sensorscope.py @@ -8,7 +8,7 @@ PATH = config.get_data_path("SENSORSCOPE") -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. @@ -16,6 +16,7 @@ def load_data(db_path, record_ids=None): db_path: Path to the database directory record_ids: List of record IDs to load. If None, loads all available records. + verbose: If True, print loading progress information. Returns: tuple: (X, y_true) where: @@ -46,9 +47,11 @@ def load_data(db_path, record_ids=None): data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for record {record_id}") + if verbose: + print(f"Insufficient columns for record {record_id}") else: - print(f"Record file not found: {record_file}") + if verbose: + print(f"Record file not found: {record_file}") if not data_list: raise ValueError("No valid data found") @@ -110,10 +113,11 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) return dict( X_train=X_train, diff --git a/datasets/simulated.py b/datasets/simulated.py index ddf94fe..19b7ed2 100644 --- a/datasets/simulated.py +++ b/datasets/simulated.py @@ -12,10 +12,10 @@ class Dataset(BaseDataset): requirements = ["scikit-learn"] parameters = { - "n_samples": [10_000], - "n_features": [1], + "n_samples": [100_000], + "n_features": [6], "noise": [0.1], - "n_anomaly": [900], + "n_anomaly": [15_000], } test_parameters = { @@ -46,7 +46,7 @@ def get_data(self): # Adding anomalies y_test = np.zeros(self.n_samples) - for i in range(self.n_anomaly): + for _ in range(self.n_anomaly): idx = np.random.randint(self.n_samples) y_test[idx] = 1 diff --git a/datasets/smap.py b/datasets/smap.py index 86dd691..8d30ca9 100644 --- a/datasets/smap.py +++ b/datasets/smap.py @@ -63,6 +63,13 @@ def get_data(self): X_test = X_test[:1000] y_test = y_test[:1000] + # Reshaping data to (n_recordings, n_features, n_samples) + # For SMAP, treat as single recording + n_features = X_train.shape[1] + X_train = X_train.T.reshape(1, n_features, -1) + X_test = X_test.T.reshape(1, n_features, -1) + y_test = y_test.reshape(1, -1) + return dict( X_train=X_train, y_test=y_test, X_test=X_test ) diff --git a/datasets/smd.py b/datasets/smd.py index 8dacaea..8865fde 100644 --- a/datasets/smd.py +++ b/datasets/smd.py @@ -117,10 +117,12 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + # For SMD, treat as single recording + n_features = X_train.shape[1] + X_train = X_train.T.reshape(1, n_features, -1) + X_test = X_test.T.reshape(1, n_features, -1) + y_test = y_test.reshape(1, -1) return dict( X_train=X_train, diff --git a/datasets/svdb.py b/datasets/svdb.py index bd679f9..40c9083 100644 --- a/datasets/svdb.py +++ b/datasets/svdb.py @@ -4,11 +4,12 @@ from pathlib import Path import numpy as np import pandas as pd + import matplotlib.pyplot as plt PATH = config.get_data_path("SVDB") -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False, number=-1): """ Load data from the database path for specified record IDs. @@ -24,39 +25,46 @@ def load_data(db_path, record_ids=None): """ db_path = Path(db_path) + if record_ids is not None and number > 0: + print("Warning: 'number' parameter is ignored when 'record_ids' is provided.") + if record_ids is None: record_files = list(db_path.glob("*.test.csv@*.out")) - record_ids = [f.name for f in record_files] + record_ids = [f.name.split(".")[0] for f in record_files] + if number > 0: + record_ids = record_ids[:number] data_list = [] labels_list = [] for record_id in record_ids: # Handle case where record_id already includes the pattern - if record_id.endswith('.test.csv@*.out'): - pattern = record_id - else: - pattern = f"{record_id}.test.csv@*.out" - - # Find all matching files for this record_id - matching_files = list(db_path.glob(pattern)) - - if not matching_files: - print(f"No files found for record {record_id}") - continue - - for record_file in matching_files: - if record_file.exists(): - record_data = pd.read_csv( - record_file, header=None).dropna().to_numpy() - # Assuming first column is the data, second column is labels - if record_data.shape[1] >= 2: - data_list.append(record_data[:, 0].astype(float)) - labels_list.append(record_data[:, 1].astype(int)) - else: - print(f"Insufficient columns for file {record_file}") + record_files = list(db_path.glob(f"{record_id}*test.csv@*.out")) + if record_files: + if len(record_files) > 1: + if verbose: + print( + f"Multiple files found for record ID {record_id}, " + f"using the first one: {record_files[0]}" + ) + record_file = record_files[0] + # Load the record data + record_data = pd.read_csv( + db_path / record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if verbose: + print( + f"Loaded record {record_id} with shape {record_data.shape}") + if record_data.shape[1] >= 2: + if verbose: + print(f"Record {record_id} has sufficient columns") + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Record file not found: {record_file}") - + if verbose: + print(f"Insufficient columns for record {record_id}") + else: + if verbose: + print(f"Record file not found for ID: {db_path / record_id}") if not data_list: raise ValueError("No valid data found") @@ -95,6 +103,7 @@ class Dataset(BaseDataset): parameters = { "recordings_id": [["801"]], + "number": [-1], "debug": [False], } @@ -103,7 +112,9 @@ def get_data(self): # X shape (n_recordings, n_samples) # y shape (n_recordings, n_samples) - X, y_true = load_data(PATH, self.recordings_id) + if self.recordings_id in (["all"], "all"): + self.recordings_id = None + X, y_true = load_data(PATH, self.recordings_id, number=self.number) X_test = X.copy() y_test = y_true.copy() @@ -115,10 +126,21 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + plt.figure(figsize=(6, 3)) + plt.plot(X_train[0, 0, :500], linewidth=1.2) + plt.plot(range(350, 360), X_train[0, 0, 350:360], color="orange", linewidth=3) + plt.title("SVDB dataset") + plt.tight_layout() + plt.savefig("svdb_example.png") + plt.close() + + print("PLOT SAVED") return dict( X_train=X_train, diff --git a/datasets/swat.py b/datasets/swat.py index 5400d4f..ffd9758 100644 --- a/datasets/swat.py +++ b/datasets/swat.py @@ -45,6 +45,13 @@ def get_data(self): X_test = X_test[:1000] y_test = y_test[:1000] + # Reshaping data to (n_recordings, n_features, n_samples) + # For SWaT, treat as single recording + n_features = X_train.shape[1] + X_train = X_train.T.reshape(1, n_features, -1) + X_test = X_test.T.reshape(1, n_features, -1) + y_test = y_test.reshape(1, -1) + return dict( X_train=X_train, y_test=y_test, X_test=X_test ) diff --git a/datasets/trend.py b/datasets/trend.py new file mode 100644 index 0000000..2a9fa16 --- /dev/null +++ b/datasets/trend.py @@ -0,0 +1,83 @@ +from benchopt import BaseDataset, safe_import_context + +with safe_import_context() as import_ctx: + import numpy as np + from rosecdl.utils.utils_signal import generate_experiment + + +class Dataset(BaseDataset): + name = "Trend" + + parameters = { + "n_samples": [10], + "n_times": [5000], + "debug": [False], + "random_state": [42], + "n_times_atom": [250], + "trend_scale": [9], + "freq": [4], # frequency multiplier for the trend + } + + + def get_data(self): + if self.debug: + self.n_samples = 2 + self.n_times = 1000 + + size = self.n_times // 5000 + contamination_params = { + "n_atoms": 2, + "sparsity": 3, + "init_z": "constant", + "init_z_kwargs": {"value": 50}, + } + + simulation_params = { + "n_trials": self.n_samples * 2, + "n_channels": 2, + "n_times": self.n_times, + "n_atoms": 2, + "n_times_atom": self.n_times_atom, + "n_atoms_extra": 2, # extra atoms in the learned dictionary + "D_init": "random", + "window": True, + "contamination_params": contamination_params, + "init_d": "shapes", + "init_d_kwargs": {"shapes": ["sin", "gaussian"]}, + "init_z": "constant", + "init_z_kwargs": {"value": 1}, + "noise_std": 0.01, + "rng": self.random_state, + "sparsity": 20, + } + + X, _, _, _, info_contam = generate_experiment( + simulation_params=simulation_params, + return_info_contam=True, + ) + + # Add low frequency sinusoidal trend + t = np.linspace(0, self.freq * np.pi, self.n_times) + trend = self.trend_scale * np.sin(t) + X += trend[None, None, :] + + X_train, X_test = X[: self.n_samples], X[self.n_samples :] + y_test = info_contam["outliers_mask"][self.n_samples :] + y_test = np.any(y_test, axis=1) + + import matplotlib.pyplot as plt + # Plot example time series with trend + plt.figure(figsize=(10, 4)) + plt.plot(X_train[0, 0, :]) + plt.title('Example Time Series with Added Trend') + plt.xlabel('Time') + plt.ylabel('Value') + plt.legend() + plt.show() + + + print(f"X_train shape: {X_train.shape}") + print(f"X_test shape: {X_test.shape}") + print(f"y_test shape: {y_test.shape}") + + return dict(X_train=X_train, y_test=y_test, X_test=X_test) diff --git a/datasets/wadi.py b/datasets/wadi.py index d890ec6..c5c89be 100644 --- a/datasets/wadi.py +++ b/datasets/wadi.py @@ -66,6 +66,13 @@ def get_data(self): X_test = X_test[:1000] y_test = y_test[:1000] + # Reshaping data to (n_recordings, n_features, n_samples) + # For WADI, treat as single recording + n_features = X_train.shape[1] + X_train = X_train.T.reshape(1, n_features, -1) + X_test = X_test.T.reshape(1, n_features, -1) + y_test = y_test.reshape(1, 1, -1) + return dict( X_train=X_train, y_test=y_test, X_test=X_test ) diff --git a/datasets/yahoo.py b/datasets/yahoo.py index 5aff5de..adc3cb1 100644 --- a/datasets/yahoo.py +++ b/datasets/yahoo.py @@ -8,7 +8,7 @@ PATH = config.get_data_path("YAHOO") -def load_data(db_path, record_ids=None): +def load_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. @@ -51,7 +51,8 @@ def load_data(db_path, record_ids=None): matching_files.extend(list(db_path.glob(pattern))) if not matching_files: - print(f"No files found for record {record_id}") + if verbose: + print(f"No files found for record {record_id}") continue for record_file in matching_files: @@ -63,9 +64,11 @@ def load_data(db_path, record_ids=None): data_list.append(record_data[:, 0].astype(float)) labels_list.append(record_data[:, 1].astype(int)) else: - print(f"Insufficient columns for file {record_file}") + if verbose: + print(f"Insufficient columns for file {record_file}") else: - print(f"Record file not found: {record_file}") + if verbose: + print(f"Record file not found: {record_file}") if not data_list: raise ValueError("No valid data found") @@ -125,10 +128,11 @@ def get_data(self): X_test = X_test[:, :1000] y_test = y_test[:, :1000] - # Reshaping data to (n_samples, n_features) - X_train = X_train.reshape(-1, 1) - X_test = X_test.reshape(-1, 1) - y_test = y_test.reshape(-1, 1) + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) return dict( X_train=X_train, diff --git a/objective.py b/objective.py index b2cc69b..798b51d 100644 --- a/objective.py +++ b/objective.py @@ -13,9 +13,13 @@ with safe_import_context() as import_ctx: import numpy as np from sklearn.metrics import ( - precision_score, recall_score, f1_score, zero_one_loss + precision_score, + recall_score, + f1_score, + zero_one_loss, + roc_auc_score, + precision_recall_curve, ) - from TSB_AD.evaluation.metrics import get_metrics class Objective(BaseObjective): @@ -101,10 +105,23 @@ def evaluate_result(self, y_hat, raw_anomaly_score=None): "value": zoloss # having zoloss twice for the API }) - print("Computing TSB metrics") - if raw_anomaly_score is not None: - tsb_metrics = get_metrics(raw_anomaly_score, self.y_test, slidingWindow=1, version="opt_mem") - result.update(tsb_metrics) + # AUC-ROC and AUC-PR + auc_roc = roc_auc_score(self.y_test, raw_anomaly_score) + precision_curve, recall_curve, _ = precision_recall_curve(self.y_test, raw_anomaly_score) + auc_pr = -np.trapz(precision_curve, recall_curve) + + result["auc_roc"] = auc_roc + result["auc_pr"] = auc_pr + + # print("Computing TSB metrics") + # if raw_anomaly_score is not None: + # tsb_metrics = get_metrics(raw_anomaly_score, self.y_test, slidingWindow=1, version="opt_mem") + # result.update(tsb_metrics) + # end_time = perf_counter() + # print(f"TSB metrics computed in {end_time - start_time:.2f} seconds") + + for key, value in result.items(): + print(f"{key}: {value}") return result diff --git a/solvers/anomalybert.py b/solvers/anomalybert.py new file mode 100644 index 0000000..cc52896 --- /dev/null +++ b/solvers/anomalybert.py @@ -0,0 +1,250 @@ +from benchopt import BaseSolver, safe_import_context + +with safe_import_context() as import_ctx: + import sys + import os + import numpy as np + import torch + import torch.nn as nn + from torch.optim.lr_scheduler import CosineAnnealingLR + from tqdm import tqdm + + # Add AnomalyBERT to path + sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'AnomalyBERT')) + + from models.anomaly_transformer import get_anomaly_transformer + +class Solver(BaseSolver): + name = "AnomalyBERT" + sampling_strategy = "run_once" + + requirements = ["pip:timm","pip:torch", "pip:numpy", "pip:tqdm"] + + parameters = { + "patch_size": [1], + "d_embed": [512], + "n_layer": [6], + "batch_size": [128], + "lr": [0.0001], + "max_steps": [5000], + "n_patches": [512], + "seed": [548920], + "device": ["cuda:1"], + "window_sliding": [16], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + # X_train shape: (n_series, n_features, n_samples) + # We assume single series or concatenate them + if X_train.ndim == 3: + # Reshape to (n_samples, n_features) + # Assuming (1, n_features, n_samples) -> (n_samples, n_features) + # Or (n_series, n_features, n_samples) -> (n_series * n_samples, n_features) + # But we need to be careful about time continuity if we concatenate. + # For now, let's assume standard benchopt format which seems to be (1, n_features, n_samples) + self.X_train = np.transpose(X_train, (0, 2, 1)).reshape(-1, X_train.shape[1]).astype(np.float32) + self.X_test = np.transpose(X_test, (0, 2, 1)).reshape(-1, X_test.shape[1]).astype(np.float32) + else: + self.X_train = X_train.astype(np.float32) + self.X_test = X_test.astype(np.float32) + + def run(self, _): + torch.manual_seed(self.seed) + np.random.seed(self.seed) + + device = torch.device(self.device if torch.cuda.is_available() else 'cpu') + + train_data = self.X_train + d_data = train_data.shape[1] + + # Configuration + patch_size = self.patch_size + n_patches = self.n_patches # This corresponds to n_features (max_seq_len) in AnomalyBERT + data_seq_len = n_patches * patch_size + + if len(train_data) <= data_seq_len: + raise ValueError(f"Data length {len(train_data)} is smaller than sequence length {data_seq_len}") # Model + self.model = get_anomaly_transformer( + input_d_data=d_data, + output_d_data=1, # BCE loss + patch_size=patch_size, + d_embed=self.d_embed, + hidden_dim_rate=4., + max_seq_len=n_patches, + positional_encoding=None, + relative_position_embedding=True, + transformer_n_layer=self.n_layer, + transformer_n_head=8, + dropout=0.1 + ).to(device) + + # Optimizer + optimizer = torch.optim.AdamW(params=self.model.parameters(), lr=self.lr, weight_decay=1e-4) + scheduler = CosineAnnealingLR(optimizer, T_max=self.max_steps, eta_min=self.lr*0.01) + + train_loss_fn = nn.BCELoss().to(device) + sigmoid = nn.Sigmoid().to(device) + + # Data Augmentation Parameters + replacing_rate = (0.015, 0.15) + + replacing_table = list(np.random.randint(int(data_seq_len*replacing_rate[0]), int(data_seq_len*replacing_rate[1]), size=10000)) + replacing_table_index = 0 + replacing_table_length = 10000 + + soft_replacing_prob = 1 - 0.5 + uniform_replacing_prob = soft_replacing_prob - 0.15 + peak_noising_prob = uniform_replacing_prob - 0.15 + + replacing_weight = 0.7 + + def replacing_weights(interval_len): + warmup_len = interval_len // 10 + return np.concatenate((np.linspace(0, replacing_weight, num=warmup_len), + np.full(interval_len-2*warmup_len, replacing_weight), + np.linspace(replacing_weight, 0, num=warmup_len)), axis=None) + + valid_index_list = np.arange(len(train_data) - data_seq_len) + numerical_column = np.arange(d_data) # Assume all numerical + + # Training Loop + self.model.train() + for i in tqdm(range(self.max_steps)): + first_index = np.random.choice(valid_index_list, size=self.batch_size) + x = [] + for j in first_index: + x.append(torch.Tensor(train_data[j:j+data_seq_len].copy()).to(device)) + + # Replace data logic + current_index = replacing_table_index + replacing_table_index += self.batch_size + + if replacing_table_index > replacing_table_length: + replacing_lengths = replacing_table[current_index:] + replacing_table[:replacing_table_index-replacing_table_length] + replacing_table_index -= replacing_table_length + else: + replacing_lengths = replacing_table[current_index:replacing_table_index] + if replacing_table_index == replacing_table_length: + replacing_table_index = 0 + + replacing_lengths = np.array(replacing_lengths) + # replacing_index = np.random.randint(0, (len(train_data)-replacing_lengths+1)[:, np.newaxis], size=(self.batch_size, d_data)) + # Simplified replacing index to just pick random start points in train_data + # Note: train.py uses replacing_data which defaults to train_data + + target_index = np.random.randint(0, data_seq_len-replacing_lengths+1) + + replacing_type = np.random.uniform(0., 1., size=(self.batch_size,)) + replacing_dim_numerical = np.random.uniform(0., 1., size=(self.batch_size, d_data)) + replacing_dim_numerical = replacing_dim_numerical - np.maximum(replacing_dim_numerical.min(axis=1, keepdims=True), 0.3) <= 0.001 + + x_anomaly = torch.zeros(self.batch_size, data_seq_len, device=device) + + for j, tar, leng, typ, dim_num in zip(range(self.batch_size), target_index, replacing_lengths, replacing_type, replacing_dim_numerical): + if leng > 0: + _x = x[j].clone().transpose(0, 1) # (d_data, seq_len) + + # External interval replacing + if typ > soft_replacing_prob: + col_num = numerical_column[dim_num] + if len(col_num) > 0: + # Pick random interval from train_data + rep_start = np.random.randint(0, len(train_data) - leng) + random_interval = train_data[rep_start:rep_start+leng, col_num].copy() + + # Random flip + if np.random.rand() > 0.5: # Horizontal + random_interval = random_interval[::-1].copy() + if np.random.rand() > 0.5: # Vertical + random_interval = 1 - random_interval # Assuming normalized data? + + _x_temp = torch.from_numpy(random_interval).to(device).transpose(0, 1) # (n_cols, leng) + + weights = torch.from_numpy(replacing_weights(leng)).float().unsqueeze(0).to(device) + _x[col_num, tar:tar+leng] = _x_temp * weights + _x[col_num, tar:tar+leng] * (1 - weights) + + x_anomaly[j, tar:tar+leng] = 1 + x[j] = _x.transpose(0, 1) + + # Uniform replacing + elif typ > uniform_replacing_prob: + col_num = numerical_column[dim_num] + if len(col_num) > 0: + _x[col_num, tar:tar+leng] = torch.rand(len(col_num), leng, device=device) + x_anomaly[j, tar:tar+leng] = 1 + x[j] = _x.transpose(0, 1) + + # Peak noising + elif typ > peak_noising_prob: + col_num = numerical_column[dim_num] + if len(col_num) > 0: + peak_index = np.random.randint(0, leng) + peak_value = (_x[col_num, tar+peak_index] < 0.5).float().to(device) + peak_value = peak_value + (0.1 * (1 - 2 * peak_value)) * torch.rand(len(col_num), device=device) + _x[col_num, tar+peak_index] = peak_value + + tar_first = np.maximum(0, tar + peak_index - patch_size) + tar_last = tar + peak_index + patch_size + 1 + x_anomaly[j, tar_first:tar_last] = 1 + x[j] = _x.transpose(0, 1) + + z = torch.stack(x) + y = self.model(z) + y = y.squeeze(-1) + loss = train_loss_fn(sigmoid(y), x_anomaly) + + optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(self.model.parameters(), 1.0) + optimizer.step() + scheduler.step() + + def get_result(self): + device = torch.device(self.device if torch.cuda.is_available() else 'cpu') + self.model.eval() + + test_data = self.X_test + window_size = self.n_patches * self.patch_size + window_sliding = self.window_sliding # Default from estimate.py + batch_size = self.batch_size + + # We will just slide over the test data + + n_samples = len(test_data) + output_values = torch.zeros(n_samples, device=device) + n_overlap = torch.zeros(n_samples, device=device) + + sigmoid = nn.Sigmoid().to(device) + + with torch.no_grad(): + # Pad test data if needed or just handle boundaries + # estimate.py handles divisions. We'll assume one continuous sequence. + + # We need to batch the sliding windows + indices = list(range(0, n_samples - window_size + 1, window_sliding)) + + for i in range(0, len(indices), batch_size): + batch_indices = indices[i:i+batch_size] + x_batch = [] + for idx in batch_indices: + x_batch.append(test_data[idx:idx+window_size]) + + if not x_batch: + continue + + x_batch = torch.Tensor(np.stack(x_batch)).to(device) + y_batch = sigmoid(self.model(x_batch)).squeeze(-1) # (batch, window_size) + + for j, idx in enumerate(batch_indices): + output_values[idx:idx+window_size] += y_batch[j] + n_overlap[idx:idx+window_size] += 1 + + n_overlap[n_overlap == 0] = 1 + scores = (output_values / n_overlap).cpu().numpy() + + threshold = np.percentile(scores, (1 - 0.1) * 100) + y_hat = (scores > threshold).astype(int) + + return dict(y_hat=y_hat, raw_anomaly_score=scores) \ No newline at end of file diff --git a/solvers/autoencoder.py b/solvers/autoencoder.py index 3e8d000..2de1ff2 100644 --- a/solvers/autoencoder.py +++ b/solvers/autoencoder.py @@ -3,7 +3,7 @@ with safe_import_context() as import_ctx: from benchmark_utils.models import Autoencoder - from TSB_UAD.utils.slidingWindows import find_length + from TSB_AD.utils.slidingWindows import find_length import numpy as np @@ -11,12 +11,12 @@ class Solver(BaseSolver): name = "AE" install_cmd = "conda" - requirements = ["pip:tsb-uad"] + requirements = ["pip:tsb-uad", "scikit-learn"] parameters = { "window_size": [10, "auto"], "num_epochs": [100], - "batch_size": [128], + "batch_size": [1024], "learning_rate": [1e-3], "hidden_size": [64], "latent_size": [32], @@ -28,12 +28,15 @@ def set_objective(self, X_train, y_test, X_test): if self.window_size == "auto": self.window_size = find_length(X_train) - self.X_train = X_train.reshape(-1) - self.X_test = X_test.reshape(-1) - self.y_test = y_test + # Data received has shape (n_recordings, n_features, n_samples) + n_features = X_train.shape[1] + self.X_train = X_train.reshape(-1, n_features) + self.X_test = X_test.reshape(-1, n_features) + self.y_test = y_test.reshape(-1) + # For multivariate data, input_size = window_size * n_features self.clf = Autoencoder( - input_size=self.window_size, + input_size=self.window_size * n_features, sliding_window=self.window_size, latent_size=self.latent_size, hidden_size=self.hidden_size, @@ -44,10 +47,10 @@ def run(self, _): self.X_train, num_epochs=self.num_epochs, batch_size=self.batch_size, - learning_rate=self.learning_rate, + learning_rate=float(self.learning_rate), ) - self.clf.predict(self.X_test.reshape(-1, 1)) + self.clf.predict(self.X_test) score = self.clf.decision_scores_ self.score = ( diff --git a/solvers/dagmm.py b/solvers/dagmm.py new file mode 100644 index 0000000..6ea8a41 --- /dev/null +++ b/solvers/dagmm.py @@ -0,0 +1,95 @@ +from benchopt import BaseSolver, safe_import_context + +with safe_import_context() as import_ctx: + import numpy as np + import pandas as pd + from merlion.models.anomaly.dagmm import DAGMM, DAGMMConfig + from merlion.utils.time_series import TimeSeries + from sklearn.preprocessing import MinMaxScaler + + +class Solver(BaseSolver): + name = "DAGMM" + + install_cmd = "conda" + requirements = ["pip:salesforce-merlion", "pip:scikit-learn"] + + parameters = { + "gmm_k": [3], + "hidden_size": [256], + "sequence_len": [10], + "num_epochs": [10], + "lr": [1e-3], + "batch_size": [8192], + "lambda_energy": [0.1], + "lambda_cov": [0.005], + "device": ["cuda:3"] + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, y_test, X_test): + # Data received has shape (n_recordings, n_features, n_samples) + # We want (n_samples_total, n_features) + # So we transpose to (n_recordings, n_samples, n_features) and then reshape + + print(X_train.shape, X_test.shape, y_test.shape) + print(X_train.dtype, X_test.dtype, y_test.dtype) + print("Nan in X_train:", np.isnan(X_train).any(), np.isnan(X_train).sum()) + print("Nan in X_test:", np.isnan(X_test).any(), np.isnan(X_test).sum()) + print("Nan in y_test:", np.isnan(y_test).any(), np.isnan(y_test).sum()) + + n_features = X_train.shape[1] + self.X_train = X_train.transpose(0, 2, 1).reshape(-1, n_features) + self.X_test = X_test.transpose(0, 2, 1).reshape(-1, n_features) + self.y_test = y_test.reshape(-1) + # Convert to Merlion TimeSeries + # We use a default index since we don't have timestamps + train_df = pd.DataFrame(self.X_train) + test_df = pd.DataFrame(self.X_test) + + print("Dataframe OK") + + # Merlion expects a time index or it will generate one + self.train_data = TimeSeries.from_pd(train_df) + self.test_data = TimeSeries.from_pd(test_df) + + print("TimeSeries OK") + + # Configure DAGMM + config = DAGMMConfig( + gmm_k=self.gmm_k, + hidden_size=self.hidden_size, + sequence_len=self.sequence_len, + num_epochs=self.num_epochs, + lr=self.lr, + batch_size=self.batch_size, + lambda_energy=self.lambda_energy, + lambda_cov=self.lambda_cov, + device=self.device + ) + + self.model = DAGMM(config) + + + def run(self, _): + # Train + self.model.train(self.train_data) + + # Predict + # get_anomaly_score returns a TimeSeries of scores + scores_ts = self.model.get_anomaly_score(self.test_data) + self.scores = scores_ts.to_pd().values.flatten() + + def get_result(self): + # Normalize scores to 0-1 range for thresholding + scaler = MinMaxScaler(feature_range=(0, 1)) + scores_norm = scaler.fit_transform(self.scores.reshape(-1, 1)).ravel() + + # Simple thresholding + y_hat = np.where(scores_norm > 0.75, 1, 0) + + return dict( + y_hat=y_hat, + raw_anomaly_score=self.scores + ) diff --git a/solvers/matrixprofile.py b/solvers/matrixprofile.py index 94f0af5..d3a4ca4 100644 --- a/solvers/matrixprofile.py +++ b/solvers/matrixprofile.py @@ -2,9 +2,8 @@ from sklearn.preprocessing import MinMaxScaler with safe_import_context() as import_ctx: - from TSB_UAD.models.matrix_profile import MatrixProfile - from TSB_UAD.utils.slidingWindows import find_length - import math + from TSB_AD.models.MatrixProfile import MatrixProfile + from TSB_AD.utils.slidingWindows import find_length import numpy as np @@ -12,20 +11,31 @@ class Solver(BaseSolver): name = "MP" install_cmd = "conda" - requirements = ["pip:tsb-uad"] + requirements = ["pip:tsb-ad", "scikit-learn"] parameters = { - "window_size": [10, "auto"], + "window_size": [128, "auto"], } sampling_strategy = "run_once" def set_objective(self, X_train, y_test, X_test): + # Shapes received: (n_recordings, n_features, n_samples) self.X_train = X_train self.X_test, self.y_test = X_test, y_test + n_features = X_train.shape[1] + + self.X_train = self.X_train.reshape(-1, n_features) + self.X_test = self.X_test.reshape(-1, n_features) + if self.window_size == "auto": - self.window_size = find_length(X_train) + self.window_size = int(find_length(X_train.reshape(-1))) + + print("=====================") + print(f"window_size: {self.window_size}") + print("=====================") + self.clf = MatrixProfile( window=self.window_size, ) @@ -34,33 +44,27 @@ def run(self, _): print("Running Matrix Profile solver...") # Special solver, fitting on X_test self.clf.fit(self.X_test.reshape(-1)) + print("MP Fitted") self.scores = self.clf.decision_scores_ - score = ( + self.score = ( MinMaxScaler(feature_range=(0, 1)) .fit_transform(self.scores.reshape(-1, 1)) .ravel() ) - - print("MP Fitted") - - self.score = np.array( - [score[0]] * math.ceil((self.window_size - 1) / 2) - + list(score) - + [score[-1]] * ((self.window_size - 1) // 2) - ) - print("MP Scored") - print(f"Score shape: {score.shape}") + print(f"Score shape: {self.score.shape}") def skip(self, X_train, y_test, X_test): """Check if the solver can be skipped.""" - if find_length(X_train) == 0 and self.window_size == "auto": + if find_length(X_train.reshape(-1)) == 0 and self.window_size == "auto": return True, "Window size is 0" + if X_train.shape[1] != 1: + return True, "Matrix Profile only supports univariate data" return False, None def get_result(self): """Return the result of the solver.""" # Binarizing the scores to 0 and 1 # TEMPORARY SOLUTION - self.final_score = np.where(self.score > 0.75, 1, 0) - return dict(y_hat=self.final_score) + self.final_score = np.where(self.score > 0.90, 1, 0) + return dict(y_hat=self.final_score, raw_anomaly_score=self.score) diff --git a/solvers/rosecdl.py b/solvers/rosecdl.py index 333b1a4..a051406 100644 --- a/solvers/rosecdl.py +++ b/solvers/rosecdl.py @@ -1,9 +1,12 @@ -from re import X from benchopt import safe_import_context, BaseSolver with safe_import_context() as import_ctx: from rosecdl.rosecdl import RoseCDL + from TSB_AD.utils.slidingWindows import find_length import torch + import numpy as np + import matplotlib.pyplot as plt + from datetime import datetime class Solver(BaseSolver): @@ -14,14 +17,14 @@ class Solver(BaseSolver): parameters = { "n_components": [1], - "kernel_size": [64], + "kernel_size": ["auto"], "lmbd": [0.8], "scale_lmbd": [False], - "epochs": [50], + "epochs": [70], "max_batch": [None], "mini_batch_size": [600], "sample_window": [1_000], - "optimizer": ["adam"], + "optimizer": ["linesearch"], "n_iterations": [90], "window": [False], "outliers_kwargs": [ @@ -33,21 +36,26 @@ class Solver(BaseSolver): "opening_window": True, }, ], + "plot": [False], } sampling_strategy = "run_once" def set_objective(self, X_train, y_test, X_test): - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu" - ) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # We receive data in shape (n_recordings, n_features, n_samples) self.y_test = y_test - self.X_train = torch.tensor( - X_train, dtype=torch.float32, device=self.device) + self.X_train = torch.tensor(X_train, dtype=torch.float32, device=self.device) self.X_test = X_test + if self.kernel_size == "auto": + self.kernel_size = int(find_length(X_train.reshape(-1))) + + print("=====================") + print(f"kernel_size: {self.kernel_size}") + print("=====================") + self.clf = RoseCDL( n_components=self.n_components, n_channels=X_train.shape[1], @@ -67,6 +75,7 @@ def set_objective(self, X_train, y_test, X_test): def run(self, _): self.clf.fit(self.X_train) + del self.X_train # Free GPU memory for X_train after fitting self.y_pred = self.clf.get_outlier_mask(self.X_test) xh, zh = self.clf.csc( @@ -75,12 +84,169 @@ def run(self, _): err = self.clf.loss_fn.compute_patch_error( X_hat=xh, z_hat=zh, - X=torch.tensor(self.X_test, dtype=torch.float32, - device=self.device), + X=torch.tensor(self.X_test, dtype=torch.float32, device=self.device), ) err = err.cpu().detach().numpy() # Aggregate errors over channels self.err = err.sum(axis=1).reshape(-1) + del self.clf # Free GPU memory for the model + torch.cuda.empty_cache() # Release cached GPU memory + + def _plot_anomalies(self): + y_test_flat = self.y_test.flatten() + y_pred_flat = self.y_pred.flatten() + + dataset_name = str(self._objective._dataset).split("[")[0] + + true_positives_indices = np.where((y_test_flat == 1) & (y_pred_flat == 1))[0] + false_negatives_indices = np.where((y_test_flat == 1) & (y_pred_flat == 0))[0] + + if isinstance(self.X_test, torch.Tensor): + X_test_numpy = self.X_test.cpu().numpy() + else: + X_test_numpy = self.X_test + + # Select the first recording and squeeze to get (n_samples,) + X_test_squeezed = X_test_numpy[0].squeeze() + + from tueplots import bundles + + plt.rcParams.update(bundles.aistats2025()) + + # use no tex + plt.rcParams.update({"text.usetex": False}) + + plot_window_size = 1000 + min_overlap_ratio = 0.1 + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + max_plots = 5 + + # Plot up to 5 true positive examples with minimum overlap + if true_positives_indices.size > 0: + tp_plots_count = 0 + plotted_tp_anomalies = set() + for tp_idx in true_positives_indices: + if tp_plots_count >= max_plots: + break + + # Find the bounds of the true anomaly containing this tp_idx + true_start = tp_idx + while true_start > 0 and y_test_flat[true_start - 1] == 1: + true_start -= 1 + true_end = tp_idx + while ( + true_end < len(y_test_flat) - 1 and y_test_flat[true_end + 1] == 1 + ): + true_end += 1 + + if (true_start, true_end) in plotted_tp_anomalies: + continue + + true_len = true_end - true_start + 1 + # Calculate overlap + overlap_indices = np.where( + (y_test_flat[true_start : true_end + 1] == 1) + & (y_pred_flat[true_start : true_end + 1] == 1) + )[0] + overlap_len = len(overlap_indices) + + if true_len > 0 and (overlap_len / true_len) >= min_overlap_ratio: + plotted_tp_anomalies.add((true_start, true_end)) + start = max(0, tp_idx - plot_window_size // 2) + end = min(X_test_squeezed.shape[0], tp_idx + plot_window_size // 2) + + plt.figure(figsize=(3.25, 2)) + plt.plot(np.arange(start, end), X_test_squeezed[start:end]) + plt.title( + f"RoseCDL Successful Detection\n{dataset_name} dataset" + ) + plt.xlabel("Time") + plt.ylabel("Value") + + true_anomaly_indices = np.where(y_test_flat[start:end] == 1)[0] + if true_anomaly_indices.size > 0: + plt.axvspan( + start + true_anomaly_indices[0], + start + true_anomaly_indices[-1], + color="yellow", + alpha=0.5, + label="True Anomaly", + ) + + pred_anomaly_indices = np.where(y_pred_flat[start:end] == 1)[0] + if pred_anomaly_indices.size > 0: + plt.axvspan( + start + pred_anomaly_indices[0], + start + pred_anomaly_indices[-1], + color="red", + alpha=0.3, + label="Predicted Anomaly", + ) + + plt.legend() + plt.savefig( + f"anomaly_examples/{dataset_name.lower()}/rosecdl_successful_detection_{dataset_name.lower()}_{timestamp}_{tp_plots_count}.pdf", + format="pdf", + ) + plt.close() + tp_plots_count += 1 + + if tp_plots_count == 0: + print("Could not find a true positive with sufficient overlap to plot.") + + # Plot up to 5 false negative examples + if false_negatives_indices.size > 0: + fn_plots_count = 0 + plotted_fn_anomalies = set() + for fn_idx in false_negatives_indices: + if fn_plots_count >= max_plots: + break + + # Find the bounds of the true anomaly containing this fn_idx + true_start = fn_idx + while true_start > 0 and y_test_flat[true_start - 1] == 1: + true_start -= 1 + true_end = fn_idx + while ( + true_end < len(y_test_flat) - 1 and y_test_flat[true_end + 1] == 1 + ): + true_end += 1 + + if (true_start, true_end) in plotted_fn_anomalies: + continue + + plotted_fn_anomalies.add((true_start, true_end)) + start = max(0, fn_idx - plot_window_size // 2) + end = min(X_test_squeezed.shape[0], fn_idx + plot_window_size // 2) + + plt.figure(figsize=(3.25, 2)) + plt.plot(np.arange(start, end), X_test_squeezed[start:end]) + plt.title( + f"RoseCDL Failed Detection\n{dataset_name} dataset" + ) + plt.xlabel("Time") + plt.ylabel("Value") + + true_anomaly_indices = np.where(y_test_flat[start:end] == 1)[0] + if true_anomaly_indices.size > 0: + plt.axvspan( + start + true_anomaly_indices[0], + start + true_anomaly_indices[-1], + color="yellow", + alpha=0.5, + label="True Anomaly (missed)", + ) + + plt.legend() + plt.savefig( + f"anomaly_examples/{dataset_name.lower()}/rosecdl_failed_detection_{dataset_name.lower()}_{timestamp}_{fn_plots_count}.pdf", + format="pdf", + ) + plt.close() + fn_plots_count += 1 def get_result(self): + if self.plot: + self._plot_anomalies() + return dict(y_hat=self.y_pred, raw_anomaly_score=self.err) diff --git a/solvers/tsb_chronos.py b/solvers/tsb_chronos.py index a4cc51f..046d019 100644 --- a/solvers/tsb_chronos.py +++ b/solvers/tsb_chronos.py @@ -2,7 +2,9 @@ with safe_import_context() as import_ctx: from TSB_AD.models.Chronos import Chronos + from TSB_AD.utils.slidingWindows import find_length import numpy as np + import torch class Solver(BaseSolver): @@ -12,7 +14,7 @@ class Solver(BaseSolver): requirements = ["pip:tsb-ad"] parameters = { - "win_size": [1000], + "win_size": ["auto"], "prediction_length": [1], "model_size": ['base'], "batch_size": [32], @@ -26,6 +28,10 @@ def set_objective(self, X_train, y_test, X_test): self.data = self.data.reshape(-1, n_features) self.X_test = X_test.reshape(-1, n_features) + if self.win_size == "auto": + self.win_size = int(find_length(X_train.reshape(-1))) + + self.clf = Chronos( win_size=self.win_size, input_c=n_features, @@ -43,6 +49,8 @@ def run(self, _): # Map scores to predictions threshold = np.percentile(self.score, (1 - 0.1) * 100) self.y_hat = (self.score > threshold).astype(int) + del self.clf # Free memory for the model + torch.cuda.empty_cache() # Release cached GPU memory def get_result(self): return dict(y_hat=self.y_hat, raw_anomaly_score=self.score) From d3c3f24c0938093eb5e79159a831b74e08c39c4c Mon Sep 17 00:00:00 2001 From: jadyehya Date: Sat, 6 Dec 2025 11:50:51 -0800 Subject: [PATCH 21/50] solvers lint --- solvers/AR.py | 6 +- solvers/anomalybert.py | 149 +++++++++++++++++++++++++-------------- solvers/autoencoder.py | 3 + solvers/dagmm.py | 8 +-- solvers/matrixprofile.py | 3 +- solvers/tsb_chronos.py | 1 - solvers/tsb_timesfm.py | 4 +- solvers/tsb_timesnet.py | 6 +- solvers/vae.py | 10 +-- 9 files changed, 119 insertions(+), 71 deletions(-) diff --git a/solvers/AR.py b/solvers/AR.py index e524841..d8b6314 100644 --- a/solvers/AR.py +++ b/solvers/AR.py @@ -38,8 +38,10 @@ def set_objective(self, X_train, y_test, X_test): _, n_features, _ = X_train.shape - self.X_train = X_train.reshape(-1, n_features) # (n_samples, n_features) - self.X_test = X_test.reshape(-1, n_features) # (n_samples, n_features) + # (n_samples, n_features) + self.X_train = X_train.reshape(-1, n_features) + # (n_samples, n_features) + self.X_test = X_test.reshape(-1, n_features) self.y_test = y_test.reshape(-1) # (n_samples,) self.model = ARModel( diff --git a/solvers/anomalybert.py b/solvers/anomalybert.py index cc52896..165ee35 100644 --- a/solvers/anomalybert.py +++ b/solvers/anomalybert.py @@ -10,15 +10,17 @@ from tqdm import tqdm # Add AnomalyBERT to path - sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'AnomalyBERT')) + sys.path.append(os.path.join( + os.path.dirname(__file__), '..', 'AnomalyBERT')) from models.anomaly_transformer import get_anomaly_transformer + class Solver(BaseSolver): name = "AnomalyBERT" sampling_strategy = "run_once" - requirements = ["pip:timm","pip:torch", "pip:numpy", "pip:tqdm"] + requirements = ["pip:timm", "pip:torch", "pip:numpy", "pip:tqdm"] parameters = { "patch_size": [1], @@ -37,15 +39,15 @@ class Solver(BaseSolver): def set_objective(self, X_train, y_test, X_test): # X_train shape: (n_series, n_features, n_samples) - # We assume single series or concatenate them if X_train.ndim == 3: - # Reshape to (n_samples, n_features) - # Assuming (1, n_features, n_samples) -> (n_samples, n_features) - # Or (n_series, n_features, n_samples) -> (n_series * n_samples, n_features) - # But we need to be careful about time continuity if we concatenate. - # For now, let's assume standard benchopt format which seems to be (1, n_features, n_samples) - self.X_train = np.transpose(X_train, (0, 2, 1)).reshape(-1, X_train.shape[1]).astype(np.float32) - self.X_test = np.transpose(X_test, (0, 2, 1)).reshape(-1, X_test.shape[1]).astype(np.float32) + self.X_train = np.transpose( + X_train, (0, 2, 1)).reshape( + -1, X_train.shape[1] + ).astype(np.float32) + self.X_test = np.transpose( + X_test, (0, 2, 1)).reshape( + -1, X_test.shape[1] + ).astype(np.float32) else: self.X_train = X_train.astype(np.float32) self.X_test = X_test.astype(np.float32) @@ -54,21 +56,25 @@ def run(self, _): torch.manual_seed(self.seed) np.random.seed(self.seed) - device = torch.device(self.device if torch.cuda.is_available() else 'cpu') + device = torch.device( + self.device if torch.cuda.is_available() else 'cpu') train_data = self.X_train d_data = train_data.shape[1] # Configuration patch_size = self.patch_size - n_patches = self.n_patches # This corresponds to n_features (max_seq_len) in AnomalyBERT + # This corresponds to n_features (max_seq_len) in AnomalyBERT + n_patches = self.n_patches data_seq_len = n_patches * patch_size if len(train_data) <= data_seq_len: - raise ValueError(f"Data length {len(train_data)} is smaller than sequence length {data_seq_len}") # Model + raise ValueError( + f"Data length {len(train_data)} is smaller than " + f"sequence length {data_seq_len}") self.model = get_anomaly_transformer( input_d_data=d_data, - output_d_data=1, # BCE loss + output_d_data=1, # BCE loss patch_size=patch_size, d_embed=self.d_embed, hidden_dim_rate=4., @@ -81,8 +87,10 @@ def run(self, _): ).to(device) # Optimizer - optimizer = torch.optim.AdamW(params=self.model.parameters(), lr=self.lr, weight_decay=1e-4) - scheduler = CosineAnnealingLR(optimizer, T_max=self.max_steps, eta_min=self.lr*0.01) + optimizer = torch.optim.AdamW( + params=self.model.parameters(), lr=self.lr, weight_decay=1e-4) + scheduler = CosineAnnealingLR( + optimizer, T_max=self.max_steps, eta_min=self.lr*0.01) train_loss_fn = nn.BCELoss().to(device) sigmoid = nn.Sigmoid().to(device) @@ -90,7 +98,10 @@ def run(self, _): # Data Augmentation Parameters replacing_rate = (0.015, 0.15) - replacing_table = list(np.random.randint(int(data_seq_len*replacing_rate[0]), int(data_seq_len*replacing_rate[1]), size=10000)) + replacing_table = list(np.random.randint( + int(data_seq_len*replacing_rate[0]), + int(data_seq_len*replacing_rate[1]), + size=10000)) replacing_table_index = 0 replacing_table_length = 10000 @@ -102,68 +113,87 @@ def run(self, _): def replacing_weights(interval_len): warmup_len = interval_len // 10 - return np.concatenate((np.linspace(0, replacing_weight, num=warmup_len), - np.full(interval_len-2*warmup_len, replacing_weight), - np.linspace(replacing_weight, 0, num=warmup_len)), axis=None) + return np.concatenate(( + np.linspace(0, replacing_weight, num=warmup_len), + np.full(interval_len-2*warmup_len, replacing_weight), + np.linspace(replacing_weight, 0, num=warmup_len)), + axis=None) valid_index_list = np.arange(len(train_data) - data_seq_len) - numerical_column = np.arange(d_data) # Assume all numerical + numerical_column = np.arange(d_data) # Assume all numerical # Training Loop self.model.train() for i in tqdm(range(self.max_steps)): - first_index = np.random.choice(valid_index_list, size=self.batch_size) + first_index = np.random.choice( + valid_index_list, size=self.batch_size) x = [] for j in first_index: - x.append(torch.Tensor(train_data[j:j+data_seq_len].copy()).to(device)) + x.append(torch.Tensor( + train_data[j:j+data_seq_len].copy()).to(device)) # Replace data logic current_index = replacing_table_index replacing_table_index += self.batch_size if replacing_table_index > replacing_table_length: - replacing_lengths = replacing_table[current_index:] + replacing_table[:replacing_table_index-replacing_table_length] + replacing_lengths = replacing_table[current_index:] + \ + replacing_table[:replacing_table_index - + replacing_table_length] replacing_table_index -= replacing_table_length else: - replacing_lengths = replacing_table[current_index:replacing_table_index] + replacing_lengths = replacing_table[ + current_index:replacing_table_index + ] if replacing_table_index == replacing_table_length: replacing_table_index = 0 replacing_lengths = np.array(replacing_lengths) - # replacing_index = np.random.randint(0, (len(train_data)-replacing_lengths+1)[:, np.newaxis], size=(self.batch_size, d_data)) - # Simplified replacing index to just pick random start points in train_data - # Note: train.py uses replacing_data which defaults to train_data - target_index = np.random.randint(0, data_seq_len-replacing_lengths+1) + target_index = np.random.randint( + 0, data_seq_len-replacing_lengths+1) replacing_type = np.random.uniform(0., 1., size=(self.batch_size,)) - replacing_dim_numerical = np.random.uniform(0., 1., size=(self.batch_size, d_data)) - replacing_dim_numerical = replacing_dim_numerical - np.maximum(replacing_dim_numerical.min(axis=1, keepdims=True), 0.3) <= 0.001 - - x_anomaly = torch.zeros(self.batch_size, data_seq_len, device=device) - - for j, tar, leng, typ, dim_num in zip(range(self.batch_size), target_index, replacing_lengths, replacing_type, replacing_dim_numerical): + replacing_dim_numerical = np.random.uniform( + 0., 1., size=(self.batch_size, d_data)) + replacing_dim_numerical = replacing_dim_numerical - \ + np.maximum(replacing_dim_numerical.min( + axis=1, keepdims=True), 0.3) <= 0.001 + + x_anomaly = torch.zeros( + self.batch_size, data_seq_len, device=device) + + for j, tar, leng, typ, dim_num in zip( + range(self.batch_size), + target_index, replacing_lengths, + replacing_type, + replacing_dim_numerical): if leng > 0: - _x = x[j].clone().transpose(0, 1) # (d_data, seq_len) + _x = x[j].clone().transpose(0, 1) # (d_data, seq_len) # External interval replacing if typ > soft_replacing_prob: col_num = numerical_column[dim_num] if len(col_num) > 0: # Pick random interval from train_data - rep_start = np.random.randint(0, len(train_data) - leng) - random_interval = train_data[rep_start:rep_start+leng, col_num].copy() + rep_start = np.random.randint( + 0, len(train_data) - leng) + random_interval = train_data[rep_start:rep_start + + leng, col_num].copy() # Random flip - if np.random.rand() > 0.5: # Horizontal + if np.random.rand() > 0.5: # Horizontal random_interval = random_interval[::-1].copy() - if np.random.rand() > 0.5: # Vertical - random_interval = 1 - random_interval # Assuming normalized data? + if np.random.rand() > 0.5: # Vertical + random_interval = 1 - random_interval - _x_temp = torch.from_numpy(random_interval).to(device).transpose(0, 1) # (n_cols, leng) + _x_temp = torch.from_numpy(random_interval).to( + device).transpose(0, 1) # (n_cols, leng) - weights = torch.from_numpy(replacing_weights(leng)).float().unsqueeze(0).to(device) - _x[col_num, tar:tar+leng] = _x_temp * weights + _x[col_num, tar:tar+leng] * (1 - weights) + weights = torch.from_numpy(replacing_weights( + leng)).float().unsqueeze(0).to(device) + _x[col_num, tar:tar+leng] = _x_temp * weights + \ + _x[col_num, tar:tar+leng] * (1 - weights) x_anomaly[j, tar:tar+leng] = 1 x[j] = _x.transpose(0, 1) @@ -172,7 +202,10 @@ def replacing_weights(interval_len): elif typ > uniform_replacing_prob: col_num = numerical_column[dim_num] if len(col_num) > 0: - _x[col_num, tar:tar+leng] = torch.rand(len(col_num), leng, device=device) + _x[col_num, tar:tar + + leng] = torch.rand( + len(col_num), leng, device=device + ) x_anomaly[j, tar:tar+leng] = 1 x[j] = _x.transpose(0, 1) @@ -181,11 +214,16 @@ def replacing_weights(interval_len): col_num = numerical_column[dim_num] if len(col_num) > 0: peak_index = np.random.randint(0, leng) - peak_value = (_x[col_num, tar+peak_index] < 0.5).float().to(device) - peak_value = peak_value + (0.1 * (1 - 2 * peak_value)) * torch.rand(len(col_num), device=device) + peak_value = ( + _x[col_num, tar+peak_index] < 0.5 + ).float().to(device) + peak_value = peak_value + \ + (0.1 * (1 - 2 * peak_value)) * \ + torch.rand(len(col_num), device=device) _x[col_num, tar+peak_index] = peak_value - tar_first = np.maximum(0, tar + peak_index - patch_size) + tar_first = np.maximum( + 0, tar + peak_index - patch_size) tar_last = tar + peak_index + patch_size + 1 x_anomaly[j, tar_first:tar_last] = 1 x[j] = _x.transpose(0, 1) @@ -202,12 +240,13 @@ def replacing_weights(interval_len): scheduler.step() def get_result(self): - device = torch.device(self.device if torch.cuda.is_available() else 'cpu') + device = torch.device( + self.device if torch.cuda.is_available() else 'cpu') self.model.eval() test_data = self.X_test window_size = self.n_patches * self.patch_size - window_sliding = self.window_sliding # Default from estimate.py + window_sliding = self.window_sliding # Default from estimate.py batch_size = self.batch_size # We will just slide over the test data @@ -220,10 +259,11 @@ def get_result(self): with torch.no_grad(): # Pad test data if needed or just handle boundaries - # estimate.py handles divisions. We'll assume one continuous sequence. + # estimate.py handles divisions. We assume 1 continuous sequence. # We need to batch the sliding windows - indices = list(range(0, n_samples - window_size + 1, window_sliding)) + indices = list( + range(0, n_samples - window_size + 1, window_sliding)) for i in range(0, len(indices), batch_size): batch_indices = indices[i:i+batch_size] @@ -235,7 +275,8 @@ def get_result(self): continue x_batch = torch.Tensor(np.stack(x_batch)).to(device) - y_batch = sigmoid(self.model(x_batch)).squeeze(-1) # (batch, window_size) + # (batch, window_size) + y_batch = sigmoid(self.model(x_batch)).squeeze(-1) for j, idx in enumerate(batch_indices): output_values[idx:idx+window_size] += y_batch[j] @@ -247,4 +288,4 @@ def get_result(self): threshold = np.percentile(scores, (1 - 0.1) * 100) y_hat = (scores > threshold).astype(int) - return dict(y_hat=y_hat, raw_anomaly_score=scores) \ No newline at end of file + return dict(y_hat=y_hat, raw_anomaly_score=scores) diff --git a/solvers/autoencoder.py b/solvers/autoencoder.py index 2de1ff2..db251c6 100644 --- a/solvers/autoencoder.py +++ b/solvers/autoencoder.py @@ -63,6 +63,9 @@ def skip(self, X_train, y_test, X_test): """Check if the solver can be skipped.""" if find_length(X_train) == 0 and self.window_size == "auto": return True, "Window size is 0" + from torch.cuda import is_available + if not is_available(): + return True, "AE requires a GPU to run." return False, None def get_result(self): diff --git a/solvers/dagmm.py b/solvers/dagmm.py index 6ea8a41..64529fa 100644 --- a/solvers/dagmm.py +++ b/solvers/dagmm.py @@ -29,13 +29,10 @@ class Solver(BaseSolver): sampling_strategy = "run_once" def set_objective(self, X_train, y_test, X_test): - # Data received has shape (n_recordings, n_features, n_samples) - # We want (n_samples_total, n_features) - # So we transpose to (n_recordings, n_samples, n_features) and then reshape - print(X_train.shape, X_test.shape, y_test.shape) print(X_train.dtype, X_test.dtype, y_test.dtype) - print("Nan in X_train:", np.isnan(X_train).any(), np.isnan(X_train).sum()) + print("Nan in X_train:", np.isnan( + X_train).any(), np.isnan(X_train).sum()) print("Nan in X_test:", np.isnan(X_test).any(), np.isnan(X_test).sum()) print("Nan in y_test:", np.isnan(y_test).any(), np.isnan(y_test).sum()) @@ -71,7 +68,6 @@ def set_objective(self, X_train, y_test, X_test): self.model = DAGMM(config) - def run(self, _): # Train self.model.train(self.train_data) diff --git a/solvers/matrixprofile.py b/solvers/matrixprofile.py index d3a4ca4..739de4f 100644 --- a/solvers/matrixprofile.py +++ b/solvers/matrixprofile.py @@ -56,7 +56,8 @@ def run(self, _): def skip(self, X_train, y_test, X_test): """Check if the solver can be skipped.""" - if find_length(X_train.reshape(-1)) == 0 and self.window_size == "auto": + if (find_length(X_train.reshape(-1)) == 0) and ( + self.window_size == "auto"): return True, "Window size is 0" if X_train.shape[1] != 1: return True, "Matrix Profile only supports univariate data" diff --git a/solvers/tsb_chronos.py b/solvers/tsb_chronos.py index 046d019..1f7cfe9 100644 --- a/solvers/tsb_chronos.py +++ b/solvers/tsb_chronos.py @@ -31,7 +31,6 @@ def set_objective(self, X_train, y_test, X_test): if self.win_size == "auto": self.win_size = int(find_length(X_train.reshape(-1))) - self.clf = Chronos( win_size=self.win_size, input_c=n_features, diff --git a/solvers/tsb_timesfm.py b/solvers/tsb_timesfm.py index 77f03fc..6784dfe 100644 --- a/solvers/tsb_timesfm.py +++ b/solvers/tsb_timesfm.py @@ -3,6 +3,7 @@ with safe_import_context() as import_ctx: from TSB_AD.model_wrapper import run_TimesFM import numpy as np + import torch class Solver(BaseSolver): @@ -12,7 +13,7 @@ class Solver(BaseSolver): requirements = ["pip:tsb-ad"] parameters = { - "win_size": [96], + "win_size": [256], } sampling_strategy = "run_once" @@ -29,6 +30,7 @@ def run(self, _): win_size=self.win_size, ) self.raw_anomaly_score = self.y_hat[-len(self.X_test):] + torch.cuda.empty_cache() # Release cached GPU memory def get_result(self): threshold = np.percentile(self.raw_anomaly_score, 90) diff --git a/solvers/tsb_timesnet.py b/solvers/tsb_timesnet.py index ef4e253..2333208 100644 --- a/solvers/tsb_timesnet.py +++ b/solvers/tsb_timesnet.py @@ -2,6 +2,7 @@ with safe_import_context() as import_ctx: from TSB_AD.models.TimesNet import TimesNet + import torch class Solver(BaseSolver): @@ -11,7 +12,7 @@ class Solver(BaseSolver): requirements = ["pip:tsb-ad"] parameters = { - "window_size": [96], + "window_size": [256], "lr": [1e-4], } @@ -39,6 +40,9 @@ def run(self, _): self.raw_anomaly_score = self.clf.decision_function(self.X_test) print("TimesNet done") + del self.clf.model + del self.clf + torch.cuda.empty_cache() # Release cached GPU memory def get_result(self): self.y_hat = (self.raw_anomaly_score > 0).astype(int) diff --git a/solvers/vae.py b/solvers/vae.py index 085f9af..c55f73d 100644 --- a/solvers/vae.py +++ b/solvers/vae.py @@ -10,7 +10,7 @@ class Solver(BaseSolver): name = "VAE" install_cmd = "conda" - requirements = ["pyod", "tqdm", "pip:torch"] + requirements = ["pip:pyod", "pip:torch"] sampling_strategy = "run_once" @@ -49,10 +49,10 @@ def set_objective(self, X_train, y_test, X_test): if self.window: self.Xw_train = np.lib.stride_tricks.sliding_window_view( - X_train, - window_shape=self.window_size+self.horizon, - axis=0 - ).transpose(0, 2, 1) + X_train, + window_shape=self.window_size+self.horizon, + axis=0 + ).transpose(0, 2, 1) if self.X_test is not None: self.Xw_test = np.lib.stride_tricks.sliding_window_view( From 32e701cddde0e86a456eec31596347a764e1c744 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Sat, 6 Dec 2025 11:51:05 -0800 Subject: [PATCH 22/50] Remove TSB-OCSVM solver implementation --- solvers/tsb_ocsvm.py | 77 -------------------------------------------- 1 file changed, 77 deletions(-) delete mode 100644 solvers/tsb_ocsvm.py diff --git a/solvers/tsb_ocsvm.py b/solvers/tsb_ocsvm.py deleted file mode 100644 index 7a76396..0000000 --- a/solvers/tsb_ocsvm.py +++ /dev/null @@ -1,77 +0,0 @@ -from benchopt import BaseSolver, safe_import_context -from sklearn.preprocessing import MinMaxScaler - -with safe_import_context() as import_ctx: - from TSB_UAD.models.ocsvm import OCSVM - from TSB_UAD.models.feature import Window - from TSB_UAD.utils.slidingWindows import find_length - import math - import numpy as np - - -class Solver(BaseSolver): - name = "TSB-OCVSM" - - install_cmd = "conda" - requirements = ["pip:tsb-uad"] - - parameters = { - "window_size": [10, "auto"], - } - - sampling_strategy = "run_once" - - def set_objective(self, X_train, y_test, X_test): - if self.window_size == "auto": - self.window_size = find_length(X_train) - - X_train = X_train.reshape(-1) - X_test = X_test.reshape(-1) - - X_train = Window(window=self.window_size).convert(X_train).to_numpy() - X_test = Window(window=self.window_size).convert(X_test).to_numpy() - - self.X_train = MinMaxScaler( - feature_range=(0, 1)).fit_transform(X_train.T).T - self.X_test = MinMaxScaler( - feature_range=(0, 1)).fit_transform(X_test.T).T - - self.y_test = y_test.reshape(-1) - - self.clf = OCSVM(nu=0.05, max_iter=200) - - def run(self, _): - print("Running OCSVM solver...") - # Special solver, fitting on X_test - self.clf.fit(self.X_train, self.X_test) - score = self.clf.decision_scores_ - - print("OCSVM Fitted") - - score = np.array( - [score[0]] * math.ceil((self.window_size - 1) / 2) - + list(score) - + [score[-1]] * ((self.window_size - 1) // 2) - ) - - self.score = ( - MinMaxScaler(feature_range=(0, 1)) - .fit_transform(score.reshape(-1, 1)) - .ravel() - ) - - print("MP Scored") - print(f"Score shape: {score.shape}") - - def skip(self, X_train, y_test, X_test): - """Check if the solver can be skipped.""" - if find_length(X_train) == 0 and self.window_size == "auto": - return True, "Window size is 0" - return False, None - - def get_result(self): - """Return the result of the solver.""" - # Binarizing the scores to 0 and 1 - # TEMPORARY SOLUTION - self.final_score = np.where(self.score > 0.75, 1, 0) - return dict(y_hat=self.final_score) From 365875a8f58b54efd003decb2240fae9c3d5df12 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Sat, 6 Dec 2025 11:51:20 -0800 Subject: [PATCH 23/50] lint + removed plotting --- solvers/rosecdl.py | 168 ++------------------------------------------- 1 file changed, 6 insertions(+), 162 deletions(-) diff --git a/solvers/rosecdl.py b/solvers/rosecdl.py index a051406..2c0fa4e 100644 --- a/solvers/rosecdl.py +++ b/solvers/rosecdl.py @@ -4,9 +4,6 @@ from rosecdl.rosecdl import RoseCDL from TSB_AD.utils.slidingWindows import find_length import torch - import numpy as np - import matplotlib.pyplot as plt - from datetime import datetime class Solver(BaseSolver): @@ -42,11 +39,13 @@ class Solver(BaseSolver): sampling_strategy = "run_once" def set_objective(self, X_train, y_test, X_test): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # We receive data in shape (n_recordings, n_features, n_samples) self.y_test = y_test - self.X_train = torch.tensor(X_train, dtype=torch.float32, device=self.device) + self.X_train = torch.tensor( + X_train, dtype=torch.float32, device=self.device) self.X_test = X_test if self.kernel_size == "auto": @@ -84,7 +83,8 @@ def run(self, _): err = self.clf.loss_fn.compute_patch_error( X_hat=xh, z_hat=zh, - X=torch.tensor(self.X_test, dtype=torch.float32, device=self.device), + X=torch.tensor(self.X_test, dtype=torch.float32, + device=self.device), ) err = err.cpu().detach().numpy() # Aggregate errors over channels @@ -92,161 +92,5 @@ def run(self, _): del self.clf # Free GPU memory for the model torch.cuda.empty_cache() # Release cached GPU memory - def _plot_anomalies(self): - y_test_flat = self.y_test.flatten() - y_pred_flat = self.y_pred.flatten() - - dataset_name = str(self._objective._dataset).split("[")[0] - - true_positives_indices = np.where((y_test_flat == 1) & (y_pred_flat == 1))[0] - false_negatives_indices = np.where((y_test_flat == 1) & (y_pred_flat == 0))[0] - - if isinstance(self.X_test, torch.Tensor): - X_test_numpy = self.X_test.cpu().numpy() - else: - X_test_numpy = self.X_test - - # Select the first recording and squeeze to get (n_samples,) - X_test_squeezed = X_test_numpy[0].squeeze() - - from tueplots import bundles - - plt.rcParams.update(bundles.aistats2025()) - - # use no tex - plt.rcParams.update({"text.usetex": False}) - - plot_window_size = 1000 - min_overlap_ratio = 0.1 - timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - max_plots = 5 - - # Plot up to 5 true positive examples with minimum overlap - if true_positives_indices.size > 0: - tp_plots_count = 0 - plotted_tp_anomalies = set() - for tp_idx in true_positives_indices: - if tp_plots_count >= max_plots: - break - - # Find the bounds of the true anomaly containing this tp_idx - true_start = tp_idx - while true_start > 0 and y_test_flat[true_start - 1] == 1: - true_start -= 1 - true_end = tp_idx - while ( - true_end < len(y_test_flat) - 1 and y_test_flat[true_end + 1] == 1 - ): - true_end += 1 - - if (true_start, true_end) in plotted_tp_anomalies: - continue - - true_len = true_end - true_start + 1 - # Calculate overlap - overlap_indices = np.where( - (y_test_flat[true_start : true_end + 1] == 1) - & (y_pred_flat[true_start : true_end + 1] == 1) - )[0] - overlap_len = len(overlap_indices) - - if true_len > 0 and (overlap_len / true_len) >= min_overlap_ratio: - plotted_tp_anomalies.add((true_start, true_end)) - start = max(0, tp_idx - plot_window_size // 2) - end = min(X_test_squeezed.shape[0], tp_idx + plot_window_size // 2) - - plt.figure(figsize=(3.25, 2)) - plt.plot(np.arange(start, end), X_test_squeezed[start:end]) - plt.title( - f"RoseCDL Successful Detection\n{dataset_name} dataset" - ) - plt.xlabel("Time") - plt.ylabel("Value") - - true_anomaly_indices = np.where(y_test_flat[start:end] == 1)[0] - if true_anomaly_indices.size > 0: - plt.axvspan( - start + true_anomaly_indices[0], - start + true_anomaly_indices[-1], - color="yellow", - alpha=0.5, - label="True Anomaly", - ) - - pred_anomaly_indices = np.where(y_pred_flat[start:end] == 1)[0] - if pred_anomaly_indices.size > 0: - plt.axvspan( - start + pred_anomaly_indices[0], - start + pred_anomaly_indices[-1], - color="red", - alpha=0.3, - label="Predicted Anomaly", - ) - - plt.legend() - plt.savefig( - f"anomaly_examples/{dataset_name.lower()}/rosecdl_successful_detection_{dataset_name.lower()}_{timestamp}_{tp_plots_count}.pdf", - format="pdf", - ) - plt.close() - tp_plots_count += 1 - - if tp_plots_count == 0: - print("Could not find a true positive with sufficient overlap to plot.") - - # Plot up to 5 false negative examples - if false_negatives_indices.size > 0: - fn_plots_count = 0 - plotted_fn_anomalies = set() - for fn_idx in false_negatives_indices: - if fn_plots_count >= max_plots: - break - - # Find the bounds of the true anomaly containing this fn_idx - true_start = fn_idx - while true_start > 0 and y_test_flat[true_start - 1] == 1: - true_start -= 1 - true_end = fn_idx - while ( - true_end < len(y_test_flat) - 1 and y_test_flat[true_end + 1] == 1 - ): - true_end += 1 - - if (true_start, true_end) in plotted_fn_anomalies: - continue - - plotted_fn_anomalies.add((true_start, true_end)) - start = max(0, fn_idx - plot_window_size // 2) - end = min(X_test_squeezed.shape[0], fn_idx + plot_window_size // 2) - - plt.figure(figsize=(3.25, 2)) - plt.plot(np.arange(start, end), X_test_squeezed[start:end]) - plt.title( - f"RoseCDL Failed Detection\n{dataset_name} dataset" - ) - plt.xlabel("Time") - plt.ylabel("Value") - - true_anomaly_indices = np.where(y_test_flat[start:end] == 1)[0] - if true_anomaly_indices.size > 0: - plt.axvspan( - start + true_anomaly_indices[0], - start + true_anomaly_indices[-1], - color="yellow", - alpha=0.5, - label="True Anomaly (missed)", - ) - - plt.legend() - plt.savefig( - f"anomaly_examples/{dataset_name.lower()}/rosecdl_failed_detection_{dataset_name.lower()}_{timestamp}_{fn_plots_count}.pdf", - format="pdf", - ) - plt.close() - fn_plots_count += 1 - def get_result(self): - if self.plot: - self._plot_anomalies() - return dict(y_hat=self.y_pred, raw_anomaly_score=self.err) From 8e2921d6189b8e691bf75730a1556f798e7e9ed8 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Sat, 6 Dec 2025 11:54:28 -0800 Subject: [PATCH 24/50] datasets lint --- datasets/daphnet.py | 17 +++++++++++++---- datasets/ecg.py | 3 ++- datasets/kdd21.py | 1 + datasets/mitdb.py | 3 ++- datasets/occupancy.py | 3 ++- datasets/pattern.py | 17 +++++++---------- datasets/simulated.py | 2 +- datasets/svdb.py | 9 ++++++--- datasets/trend.py | 17 +++++++---------- 9 files changed, 41 insertions(+), 31 deletions(-) diff --git a/datasets/daphnet.py b/datasets/daphnet.py index 25f4285..66694f0 100644 --- a/datasets/daphnet.py +++ b/datasets/daphnet.py @@ -27,7 +27,10 @@ def load_data(db_path, record_ids=None, verbose=False, number=-1): db_path = Path(db_path) if record_ids is not None and number > 0: - print("Warning: 'number' parameter is ignored when 'record_ids' is provided.") + print( + "Warning: 'number' parameter is " + "ignored when 'record_ids' is provided." + ) if record_ids is None: # Get all available record files with .test.csv@X.out pattern @@ -36,7 +39,6 @@ def load_data(db_path, record_ids=None, verbose=False, number=-1): if number > 0: record_ids = record_ids[:number] - data_list = [] labels_list = [] for record_id in record_ids: @@ -61,7 +63,9 @@ def load_data(db_path, record_ids=None, verbose=False, number=-1): else: if verbose: print( - f"Insufficient columns for record file {record_file.name}") + f"Insufficient columns " + f"for record file {record_file.name}" + ) if not data_list: raise ValueError("No valid data found") @@ -108,6 +112,10 @@ class Dataset(BaseDataset): "debug": [False], } + test_parameters = { + "debug": [True], + } + def get_data(self): """Load the DAPHNET dataset.""" @@ -135,7 +143,8 @@ def get_data(self): plt.figure(figsize=(6, 3)) plt.plot(X_train[0, 0, :500], linewidth=1.2) - plt.plot(range(297, 305), X_train[0, 0, 297:305], color="orange", linewidth=3) + plt.plot(range(297, 305), + X_train[0, 0, 297:305], color="orange", linewidth=3) plt.title("Daphnet dataset") plt.tight_layout() plt.savefig("daphnet_example.png") diff --git a/datasets/ecg.py b/datasets/ecg.py index 38e147c..04357e5 100644 --- a/datasets/ecg.py +++ b/datasets/ecg.py @@ -26,7 +26,8 @@ def load_data(db_path, record_ids=None, verbose=False, number=-1): db_path = Path(db_path) if record_ids is not None and number > 0: - print("Warning: 'number' parameter is ignored when 'record_ids' is provided.") + print("Warning: 'number' parameter is " + "ignored when 'record_ids' is provided.") if record_ids is None: # Get all available record files diff --git a/datasets/kdd21.py b/datasets/kdd21.py index 6e810f1..6691919 100644 --- a/datasets/kdd21.py +++ b/datasets/kdd21.py @@ -7,6 +7,7 @@ PATH = config.get_data_path("KDD21") + def load_data(db_path, record_ids=None, verbose=False): """ Load data from the database path for specified record IDs. diff --git a/datasets/mitdb.py b/datasets/mitdb.py index c2637ba..3af87bd 100644 --- a/datasets/mitdb.py +++ b/datasets/mitdb.py @@ -51,7 +51,8 @@ def load_mitdb_data(db_path, record_ids=None, verbose=False): # Assuming first column is the data, second column is labels if verbose: print( - f"Loaded record {record_id} with shape {record_data.shape}") + f"Loaded record {record_id} " + f"with shape {record_data.shape}") if record_data.shape[1] >= 2: if verbose: print(f"Record {record_id} has sufficient columns") diff --git a/datasets/occupancy.py b/datasets/occupancy.py index 561aafd..a985586 100644 --- a/datasets/occupancy.py +++ b/datasets/occupancy.py @@ -67,7 +67,8 @@ def load_data(db_path, record_ids=None, verbose=False): else: if verbose: print( - f"Insufficient columns for record file {test_file.name}") + f"Insufficient columns " + f"for record file {test_file.name}") if not test_data_list: raise ValueError("No valid test data found") diff --git a/datasets/pattern.py b/datasets/pattern.py index 47c1dba..0545cc4 100644 --- a/datasets/pattern.py +++ b/datasets/pattern.py @@ -16,19 +16,17 @@ class Dataset(BaseDataset): "n_times_atom": [250], } - def get_data(self): if self.debug: self.n_samples = 2 self.n_times = 1000 - size = self.n_times // 5000 contamination_params = { - "n_atoms": 2, - "sparsity": 3, - "init_z": "constant", - "init_z_kwargs": {"value": 50}, - } + "n_atoms": 2, + "sparsity": 3, + "init_z": "constant", + "init_z_kwargs": {"value": 50}, + } simulation_params = { "n_trials": self.n_samples * 2, @@ -54,11 +52,10 @@ def get_data(self): return_info_contam=True, ) - X_train, X_test = X[: self.n_samples], X[self.n_samples :] - y_test = info_contam["outliers_mask"][self.n_samples :] + X_train, X_test = X[: self.n_samples], X[self.n_samples:] + y_test = info_contam["outliers_mask"][self.n_samples:] y_test = np.any(y_test, axis=1) - print(f"X_train shape: {X_train.shape}") print(f"X_test shape: {X_test.shape}") print(f"y_test shape: {y_test.shape}") diff --git a/datasets/simulated.py b/datasets/simulated.py index 19b7ed2..28b5474 100644 --- a/datasets/simulated.py +++ b/datasets/simulated.py @@ -20,7 +20,7 @@ class Dataset(BaseDataset): test_parameters = { "n_samples": [500], - "n_features": [5], + "n_features": [1], "noise": [0.1], "n_anomaly": [90], } diff --git a/datasets/svdb.py b/datasets/svdb.py index 40c9083..31f72b8 100644 --- a/datasets/svdb.py +++ b/datasets/svdb.py @@ -26,7 +26,8 @@ def load_data(db_path, record_ids=None, verbose=False, number=-1): db_path = Path(db_path) if record_ids is not None and number > 0: - print("Warning: 'number' parameter is ignored when 'record_ids' is provided.") + print("Warning: 'number' parameter is " + "ignored when 'record_ids' is provided.") if record_ids is None: record_files = list(db_path.glob("*.test.csv@*.out")) @@ -53,7 +54,8 @@ def load_data(db_path, record_ids=None, verbose=False, number=-1): # Assuming first column is the data, second column is labels if verbose: print( - f"Loaded record {record_id} with shape {record_data.shape}") + f"Loaded record {record_id} " + f"with shape {record_data.shape}") if record_data.shape[1] >= 2: if verbose: print(f"Record {record_id} has sufficient columns") @@ -134,7 +136,8 @@ def get_data(self): plt.figure(figsize=(6, 3)) plt.plot(X_train[0, 0, :500], linewidth=1.2) - plt.plot(range(350, 360), X_train[0, 0, 350:360], color="orange", linewidth=3) + plt.plot(range(350, 360), + X_train[0, 0, 350:360], color="orange", linewidth=3) plt.title("SVDB dataset") plt.tight_layout() plt.savefig("svdb_example.png") diff --git a/datasets/trend.py b/datasets/trend.py index 2a9fa16..a1a4cfb 100644 --- a/datasets/trend.py +++ b/datasets/trend.py @@ -18,19 +18,17 @@ class Dataset(BaseDataset): "freq": [4], # frequency multiplier for the trend } - def get_data(self): if self.debug: self.n_samples = 2 self.n_times = 1000 - size = self.n_times // 5000 contamination_params = { - "n_atoms": 2, - "sparsity": 3, - "init_z": "constant", - "init_z_kwargs": {"value": 50}, - } + "n_atoms": 2, + "sparsity": 3, + "init_z": "constant", + "init_z_kwargs": {"value": 50}, + } simulation_params = { "n_trials": self.n_samples * 2, @@ -61,8 +59,8 @@ def get_data(self): trend = self.trend_scale * np.sin(t) X += trend[None, None, :] - X_train, X_test = X[: self.n_samples], X[self.n_samples :] - y_test = info_contam["outliers_mask"][self.n_samples :] + X_train, X_test = X[: self.n_samples], X[self.n_samples:] + y_test = info_contam["outliers_mask"][self.n_samples:] y_test = np.any(y_test, axis=1) import matplotlib.pyplot as plt @@ -75,7 +73,6 @@ def get_data(self): plt.legend() plt.show() - print(f"X_train shape: {X_train.shape}") print(f"X_test shape: {X_test.shape}") print(f"y_test shape: {y_test.shape}") From 5b4730be752275421b960180cce848403e895e39 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Sat, 6 Dec 2025 11:56:06 -0800 Subject: [PATCH 25/50] lint and clean objective --- objective.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/objective.py b/objective.py index 798b51d..2dc92f1 100644 --- a/objective.py +++ b/objective.py @@ -107,19 +107,13 @@ def evaluate_result(self, y_hat, raw_anomaly_score=None): # AUC-ROC and AUC-PR auc_roc = roc_auc_score(self.y_test, raw_anomaly_score) - precision_curve, recall_curve, _ = precision_recall_curve(self.y_test, raw_anomaly_score) + precision_curve, recall_curve, _ = precision_recall_curve( + self.y_test, raw_anomaly_score) auc_pr = -np.trapz(precision_curve, recall_curve) result["auc_roc"] = auc_roc result["auc_pr"] = auc_pr - # print("Computing TSB metrics") - # if raw_anomaly_score is not None: - # tsb_metrics = get_metrics(raw_anomaly_score, self.y_test, slidingWindow=1, version="opt_mem") - # result.update(tsb_metrics) - # end_time = perf_counter() - # print(f"TSB metrics computed in {end_time - start_time:.2f} seconds") - for key, value in result.items(): print(f"{key}: {value}") From bc1fd65090f1a900aae55e8d430fd42368618e76 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Sat, 6 Dec 2025 11:56:26 -0800 Subject: [PATCH 26/50] lint test_config.py --- test_config.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test_config.py b/test_config.py index 92e34d5..e9cfba4 100644 --- a/test_config.py +++ b/test_config.py @@ -23,3 +23,12 @@ def check_test_solver_install(solver_class): # if solver_class.name.lower() == "transformer": # if get_cuda_version() is None: # pytest.xfail("Transformer needs a working GPU hardware.") + + +def check_test_get_data(data_class): + if data_class.name.lower() in [ + "daphnet", "dodgers", "ecg", "genesis", "ghl", + "iops", "kdd21", "mgab", "mitdb", "msl", "nab", + "occupancy", "opportunity" + ]: + pytest.xfail(f"{data_class.name} dataset is not downloaded.") From c4042a4ad495e73c9aec2a6f23c84e34d2e9c3ac Mon Sep 17 00:00:00 2001 From: jadyehya Date: Sat, 6 Dec 2025 12:01:41 -0800 Subject: [PATCH 27/50] lint --- solvers/legacy/isolation-forest.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/solvers/legacy/isolation-forest.py b/solvers/legacy/isolation-forest.py index 58910d0..9df46c7 100644 --- a/solvers/legacy/isolation-forest.py +++ b/solvers/legacy/isolation-forest.py @@ -88,8 +88,8 @@ def run(self, _): self.raw_y_hat = self.clf.predict(X_test_flat) self.raw_anomaly_score = self.clf.decision_function(X_test_flat) - # Reshape back to (n_recordings, n_samples) for single feature case - # For now, assume we take the first feature or average across features + # Reshape to (n_recordings, n_samples) for single feature case + # We assume we take the first feature or average across features self.raw_y_hat = self.raw_y_hat.reshape(n_recordings, n_samples) self.raw_anomaly_score = self.raw_anomaly_score.reshape( n_recordings, n_samples) @@ -106,5 +106,7 @@ def get_result(self): # Inlier : 0 # To ignore : -1 # For now, take the first recording - self.y_hat = self.raw_y_hat[0] if self.raw_y_hat.ndim > 1 else self.raw_y_hat + self.y_hat = self.raw_y_hat[0] if ( + self.raw_y_hat.ndim > 1 + ) else self.raw_y_hat return dict(y_hat=self.y_hat) From 023d396b3070265e1fd7bd362731ba68a6621da3 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Sat, 6 Dec 2025 12:06:47 -0800 Subject: [PATCH 28/50] commented unsupported feature --- solvers/dagmm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/solvers/dagmm.py b/solvers/dagmm.py index 64529fa..8c92668 100644 --- a/solvers/dagmm.py +++ b/solvers/dagmm.py @@ -23,7 +23,7 @@ class Solver(BaseSolver): "batch_size": [8192], "lambda_energy": [0.1], "lambda_cov": [0.005], - "device": ["cuda:3"] + # "device": ["cuda:3"] } sampling_strategy = "run_once" @@ -63,7 +63,7 @@ def set_objective(self, X_train, y_test, X_test): batch_size=self.batch_size, lambda_energy=self.lambda_energy, lambda_cov=self.lambda_cov, - device=self.device + # device=self.device ) self.model = DAGMM(config) From 1cc55b67889447819c66fcc02aba6a0f184cab68 Mon Sep 17 00:00:00 2001 From: tommoral Date: Sun, 7 Dec 2025 18:01:31 +0100 Subject: [PATCH 29/50] FIX skip test with proper syntax --- test_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_config.py b/test_config.py index e9cfba4..7376cd9 100644 --- a/test_config.py +++ b/test_config.py @@ -5,7 +5,7 @@ from benchopt.utils.sys_info import get_cuda_version -def check_test_solver_install(solver_class): +def check_test_solver_install(benchmark, solver_class): """Hook called in `test_solver_install`. If one solver needs to be skip/xfailed on some @@ -25,7 +25,7 @@ def check_test_solver_install(solver_class): # pytest.xfail("Transformer needs a working GPU hardware.") -def check_test_get_data(data_class): +def check_test_dataset_get_data(benchmark, data_class): if data_class.name.lower() in [ "daphnet", "dodgers", "ecg", "genesis", "ghl", "iops", "kdd21", "mgab", "mitdb", "msl", "nab", From f7b9e745c60f7a3a06e147e0ccfd715c208378d8 Mon Sep 17 00:00:00 2001 From: tommoral Date: Sun, 7 Dec 2025 18:18:18 +0100 Subject: [PATCH 30/50] FIX test running on get_data --- objective.py | 17 +++++++++-------- test_config.py | 3 ++- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/objective.py b/objective.py index 2dc92f1..43cceec 100644 --- a/objective.py +++ b/objective.py @@ -33,7 +33,7 @@ def get_one_result(self): Used to get the shape of the result. Our algorithms will return an array of labels of shape (n_samples,) """ - return dict(y_hat=np.ones(self.X_test.shape[0])) + return dict(y_hat=np.zeros_like(self.y_test)) def set_data(self, X_train, y_test, X_test): "Set the data to compute the objective." @@ -106,13 +106,14 @@ def evaluate_result(self, y_hat, raw_anomaly_score=None): }) # AUC-ROC and AUC-PR - auc_roc = roc_auc_score(self.y_test, raw_anomaly_score) - precision_curve, recall_curve, _ = precision_recall_curve( - self.y_test, raw_anomaly_score) - auc_pr = -np.trapz(precision_curve, recall_curve) - - result["auc_roc"] = auc_roc - result["auc_pr"] = auc_pr + if raw_anomaly_score is not None: + auc_roc = roc_auc_score(self.y_test, raw_anomaly_score) + precision_curve, recall_curve, _ = precision_recall_curve( + self.y_test, raw_anomaly_score) + auc_pr = -np.trapz(precision_curve, recall_curve) + + result["auc_roc"] = auc_roc + result["auc_pr"] = auc_pr for key, value in result.items(): print(f"{key}: {value}") diff --git a/test_config.py b/test_config.py index 7376cd9..3dd38fb 100644 --- a/test_config.py +++ b/test_config.py @@ -29,6 +29,7 @@ def check_test_dataset_get_data(benchmark, data_class): if data_class.name.lower() in [ "daphnet", "dodgers", "ecg", "genesis", "ghl", "iops", "kdd21", "mgab", "mitdb", "msl", "nab", - "occupancy", "opportunity" + "occupancy", "opportunity", "sensorscope", "smd", + "svdb", "yahoo" ]: pytest.xfail(f"{data_class.name} dataset is not downloaded.") From 3c64f1ce42baf768804ded2279b3b55300ce2552 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Sun, 7 Dec 2025 11:13:57 -0800 Subject: [PATCH 31/50] xfailing anomalybert installation --- test_config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test_config.py b/test_config.py index 3dd38fb..45f17ee 100644 --- a/test_config.py +++ b/test_config.py @@ -16,6 +16,10 @@ def check_test_solver_install(benchmark, solver_class): if get_cuda_version() is None: pytest.xfail("Deep IsolationForest needs a working GPU hardware.") + if solver_class.name.lower() == "anomalybert": + pytest.xfail("AnomalyBERT needs to be installed locally from repo" \ + " at https://github.com/Jhryu30/AnomalyBERT.git") + # if solver_class.name.lower() == "lstm": # if get_cuda_version() is None: # pytest.xfail("LSTM needs a working GPU hardware.") From eb6135534a0cfc4ce5909b6dd112f6e4b9baa0a9 Mon Sep 17 00:00:00 2001 From: jadyehya Date: Sun, 7 Dec 2025 11:16:12 -0800 Subject: [PATCH 32/50] lint test_config --- test_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_config.py b/test_config.py index 45f17ee..77dd85a 100644 --- a/test_config.py +++ b/test_config.py @@ -17,8 +17,8 @@ def check_test_solver_install(benchmark, solver_class): pytest.xfail("Deep IsolationForest needs a working GPU hardware.") if solver_class.name.lower() == "anomalybert": - pytest.xfail("AnomalyBERT needs to be installed locally from repo" \ - " at https://github.com/Jhryu30/AnomalyBERT.git") + pytest.xfail("AnomalyBERT needs to be installed locally from repo" + " at https://github.com/Jhryu30/AnomalyBERT.git") # if solver_class.name.lower() == "lstm": # if get_cuda_version() is None: From 09cdbb071ceb30d40aefaf2cfc45793612e186c3 Mon Sep 17 00:00:00 2001 From: tommoral Date: Sun, 7 Dec 2025 22:15:39 +0100 Subject: [PATCH 33/50] CLN remove safe_import_context + adapt pip --- solvers/AR.py | 18 +++++++++--------- solvers/anomalybert.py | 26 ++++++++++++-------------- solvers/autoencoder.py | 12 ++++++------ solvers/dagmm.py | 15 +++++++-------- solvers/legacy/abod.py | 2 +- solvers/legacy/cblof.py | 2 +- solvers/legacy/dif.py | 2 +- solvers/lstm.py | 19 +++++++++---------- solvers/matrixprofile.py | 11 +++++------ solvers/rosecdl.py | 13 +++++++------ solvers/tsb_chronos.py | 13 ++++++------- solvers/tsb_timesfm.py | 11 +++++------ solvers/tsb_timesnet.py | 9 ++++----- solvers/vae.py | 11 +++++------ solvers/vanilla-transformer.py | 19 +++++++++---------- 15 files changed, 87 insertions(+), 96 deletions(-) diff --git a/solvers/AR.py b/solvers/AR.py index d8b6314..95ea7e1 100644 --- a/solvers/AR.py +++ b/solvers/AR.py @@ -1,20 +1,20 @@ # AR model -from benchopt import BaseSolver, safe_import_context -from benchmark_utils import mean_overlaping_pred +from benchopt import BaseSolver + +import torch +from torch import optim, nn +import numpy as np +from tqdm import tqdm -with safe_import_context() as import_ctx: - import torch - from torch import optim, nn - import numpy as np - from tqdm import tqdm - from benchmark_utils.models import ARModel +from benchmark_utils.models import ARModel +from benchmark_utils import mean_overlaping_pred class Solver(BaseSolver): name = "AR" # AutoRegressive Linear model install_cmd = "conda" - requirements = ["pip:torch", "tqdm"] + requirements = ["pip::torch", "tqdm"] sampling_strategy = "run_once" diff --git a/solvers/anomalybert.py b/solvers/anomalybert.py index 165ee35..008bd75 100644 --- a/solvers/anomalybert.py +++ b/solvers/anomalybert.py @@ -1,26 +1,24 @@ -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - import sys - import os - import numpy as np - import torch - import torch.nn as nn - from torch.optim.lr_scheduler import CosineAnnealingLR - from tqdm import tqdm +import sys +from pathlib import Path +import numpy as np +import torch +import torch.nn as nn +from torch.optim.lr_scheduler import CosineAnnealingLR +from tqdm import tqdm - # Add AnomalyBERT to path - sys.path.append(os.path.join( - os.path.dirname(__file__), '..', 'AnomalyBERT')) +# Add AnomalyBERT to path +sys.path.append(str(Path(__file__).parent.parent / 'AnomalyBERT')) - from models.anomaly_transformer import get_anomaly_transformer +from models.anomaly_transformer import get_anomaly_transformer class Solver(BaseSolver): name = "AnomalyBERT" sampling_strategy = "run_once" - requirements = ["pip:timm", "pip:torch", "pip:numpy", "pip:tqdm"] + requirements = ["pip::timm", "pip::torch", "pip::numpy", "pip::tqdm"] parameters = { "patch_size": [1], diff --git a/solvers/autoencoder.py b/solvers/autoencoder.py index db251c6..3f68a14 100644 --- a/solvers/autoencoder.py +++ b/solvers/autoencoder.py @@ -1,17 +1,17 @@ -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver + +import numpy as np +from TSB_AD.utils.slidingWindows import find_length from sklearn.preprocessing import MinMaxScaler -with safe_import_context() as import_ctx: - from benchmark_utils.models import Autoencoder - from TSB_AD.utils.slidingWindows import find_length - import numpy as np +from benchmark_utils.models import Autoencoder class Solver(BaseSolver): name = "AE" install_cmd = "conda" - requirements = ["pip:tsb-uad", "scikit-learn"] + requirements = ["pip::tsb-uad", "scikit-learn"] parameters = { "window_size": [10, "auto"], diff --git a/solvers/dagmm.py b/solvers/dagmm.py index 8c92668..9d54fa0 100644 --- a/solvers/dagmm.py +++ b/solvers/dagmm.py @@ -1,18 +1,17 @@ -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - import numpy as np - import pandas as pd - from merlion.models.anomaly.dagmm import DAGMM, DAGMMConfig - from merlion.utils.time_series import TimeSeries - from sklearn.preprocessing import MinMaxScaler +import numpy as np +import pandas as pd +from merlion.models.anomaly.dagmm import DAGMM, DAGMMConfig +from merlion.utils.time_series import TimeSeries +from sklearn.preprocessing import MinMaxScaler class Solver(BaseSolver): name = "DAGMM" install_cmd = "conda" - requirements = ["pip:salesforce-merlion", "pip:scikit-learn"] + requirements = ["pip::salesforce-merlion", "pip::scikit-learn"] parameters = { "gmm_k": [3], diff --git a/solvers/legacy/abod.py b/solvers/legacy/abod.py index 6ff02ae..6e4d54e 100644 --- a/solvers/legacy/abod.py +++ b/solvers/legacy/abod.py @@ -12,7 +12,7 @@ class Solver(BaseSolver): name = "ABOD" # Angle-Based Outlier Detection install_cmd = "conda" - requirements = ["pip:pyod"] + requirements = ["pip::pyod"] parameters = { "contamination": [5e-4, 0.1, 0.2, 0.3], diff --git a/solvers/legacy/cblof.py b/solvers/legacy/cblof.py index 3e44432..60b0994 100644 --- a/solvers/legacy/cblof.py +++ b/solvers/legacy/cblof.py @@ -12,7 +12,7 @@ class Solver(BaseSolver): name = "CBLOF" install_cmd = "conda" - requirements = ["pip:pyod"] + requirements = ["pip::pyod"] parameters = { "contamination": [5e-4, 0.01, 0.02, 0.03, 0.04], diff --git a/solvers/legacy/dif.py b/solvers/legacy/dif.py index 6aeef8e..25d2f54 100644 --- a/solvers/legacy/dif.py +++ b/solvers/legacy/dif.py @@ -11,7 +11,7 @@ class Solver(BaseSolver): name = "DIF" install_cmd = "conda" - requirements = ["pip:pyod"] + requirements = ["pip::pyod"] parameters = { "contamination": [0.05, 0.1, 0.2], diff --git a/solvers/lstm.py b/solvers/lstm.py index b3e128f..6cb6a37 100644 --- a/solvers/lstm.py +++ b/solvers/lstm.py @@ -1,21 +1,20 @@ # LSTM Autoencoder -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - import torch - import torch.nn as nn - import torch.optim as optim - import numpy as np - from torch.utils.data import DataLoader - from tqdm import tqdm - from benchmark_utils.models import AutoEncoderLSTM +import torch +import torch.nn as nn +import torch.optim as optim +import numpy as np +from torch.utils.data import DataLoader +from tqdm import tqdm +from benchmark_utils.models import AutoEncoderLSTM class Solver(BaseSolver): name = "LSTM" install_cmd = "conda" - requirements = ["pip:torch", "tqdm"] + requirements = ["pip::torch", "tqdm"] sampling_strategy = "run_once" diff --git a/solvers/matrixprofile.py b/solvers/matrixprofile.py index 739de4f..cdad2a9 100644 --- a/solvers/matrixprofile.py +++ b/solvers/matrixprofile.py @@ -1,17 +1,16 @@ -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver from sklearn.preprocessing import MinMaxScaler -with safe_import_context() as import_ctx: - from TSB_AD.models.MatrixProfile import MatrixProfile - from TSB_AD.utils.slidingWindows import find_length - import numpy as np +import numpy as np +from TSB_AD.models.MatrixProfile import MatrixProfile +from TSB_AD.utils.slidingWindows import find_length class Solver(BaseSolver): name = "MP" install_cmd = "conda" - requirements = ["pip:tsb-ad", "scikit-learn"] + requirements = ["pip::tsb-ad", "scikit-learn"] parameters = { "window_size": [128, "auto"], diff --git a/solvers/rosecdl.py b/solvers/rosecdl.py index 2c0fa4e..97ee18d 100644 --- a/solvers/rosecdl.py +++ b/solvers/rosecdl.py @@ -1,16 +1,17 @@ -from benchopt import safe_import_context, BaseSolver +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - from rosecdl.rosecdl import RoseCDL - from TSB_AD.utils.slidingWindows import find_length - import torch +import torch +from rosecdl.rosecdl import RoseCDL +from TSB_AD.utils.slidingWindows import find_length class Solver(BaseSolver): name = "RoseCDL" install_cmd = "conda" - requirements = ["pip:rosecdl", "pip:torch"] + requirements = [ + "pip::git+https://github.com/tommoral/rosecdl.git", "pip::torch" + ] parameters = { "n_components": [1], diff --git a/solvers/tsb_chronos.py b/solvers/tsb_chronos.py index 1f7cfe9..c9f12be 100644 --- a/solvers/tsb_chronos.py +++ b/solvers/tsb_chronos.py @@ -1,17 +1,16 @@ -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - from TSB_AD.models.Chronos import Chronos - from TSB_AD.utils.slidingWindows import find_length - import numpy as np - import torch +import torch +import numpy as np +from TSB_AD.models.Chronos import Chronos +from TSB_AD.utils.slidingWindows import find_length class Solver(BaseSolver): name = "TSB-Chronos" install_cmd = "conda" - requirements = ["pip:tsb-ad"] + requirements = ["pip::tsb-ad"] parameters = { "win_size": ["auto"], diff --git a/solvers/tsb_timesfm.py b/solvers/tsb_timesfm.py index 6784dfe..a5a7346 100644 --- a/solvers/tsb_timesfm.py +++ b/solvers/tsb_timesfm.py @@ -1,16 +1,15 @@ -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - from TSB_AD.model_wrapper import run_TimesFM - import numpy as np - import torch +import torch +import numpy as np +from TSB_AD.model_wrapper import run_TimesFM class Solver(BaseSolver): name = "TSB-TimesFM" install_cmd = "conda" - requirements = ["pip:tsb-ad"] + requirements = ["pip::tsb-ad"] parameters = { "win_size": [256], diff --git a/solvers/tsb_timesnet.py b/solvers/tsb_timesnet.py index 2333208..baad0e8 100644 --- a/solvers/tsb_timesnet.py +++ b/solvers/tsb_timesnet.py @@ -1,15 +1,14 @@ -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - from TSB_AD.models.TimesNet import TimesNet - import torch +import torch +from TSB_AD.models.TimesNet import TimesNet class Solver(BaseSolver): name = "TSB-TimesNet" install_cmd = "conda" - requirements = ["pip:tsb-ad"] + requirements = ["pip::tsb-ad"] parameters = { "window_size": [256], diff --git a/solvers/vae.py b/solvers/vae.py index c55f73d..8456918 100644 --- a/solvers/vae.py +++ b/solvers/vae.py @@ -1,16 +1,15 @@ -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - from pyod.models.vae import VAE - import numpy as np - import torch +import torch +import numpy as np +from pyod.models.vae import VAE class Solver(BaseSolver): name = "VAE" install_cmd = "conda" - requirements = ["pip:pyod", "pip:torch"] + requirements = ["pip::pyod", "pip::torch"] sampling_strategy = "run_once" diff --git a/solvers/vanilla-transformer.py b/solvers/vanilla-transformer.py index 677cfca..25eb373 100644 --- a/solvers/vanilla-transformer.py +++ b/solvers/vanilla-transformer.py @@ -1,21 +1,20 @@ # Vanilla Transformer -from benchopt import BaseSolver, safe_import_context -from benchmark_utils import mean_overlaping_pred +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - import torch - import torch.nn as nn - import torch.optim as optim - import numpy as np - from tqdm import tqdm - from benchmark_utils.models import TransformerModel +import torch +import torch.nn as nn +import torch.optim as optim +import numpy as np +from tqdm import tqdm +from benchmark_utils import mean_overlaping_pred +from benchmark_utils.models import TransformerModel class Solver(BaseSolver): name = "Transformer" install_cmd = "conda" - requirements = ["pip:torch", "tqdm"] + requirements = ["pip::torch", "tqdm"] sampling_strategy = "run_once" From b89e3b5e441d81ca1c850462d44362a38733d7ce Mon Sep 17 00:00:00 2001 From: tommoral Date: Tue, 9 Dec 2025 00:20:47 +0100 Subject: [PATCH 34/50] ENH improve tests --- benchmark_utils/models.py | 7 +- benchmark_utils/windowing.py | 113 +++++++++++++++++++++++++++++ datasets/msl.py | 2 - datasets/simulated.py | 6 +- objective.py | 2 +- solvers/AR.py | 13 +++- solvers/anomalybert.py | 2 +- solvers/autoencoder.py | 8 +- solvers/dagmm.py | 10 +-- solvers/legacy/abod.py | 11 +-- solvers/legacy/cblof.py | 11 +-- solvers/legacy/dif.py | 11 +-- solvers/legacy/isolation-forest.py | 12 +-- solvers/legacy/lof.py | 11 +-- solvers/legacy/ocsvm.py | 11 +-- solvers/lstm.py | 75 ++++++++----------- solvers/matrixprofile.py | 6 +- solvers/rosecdl.py | 3 +- solvers/tsb_chronos.py | 2 +- solvers/tsb_timesfm.py | 4 +- solvers/tsb_timesnet.py | 2 +- solvers/vae.py | 78 ++++++++++---------- solvers/vanilla-transformer.py | 113 +++++++++++++---------------- test_config.py | 2 +- 24 files changed, 281 insertions(+), 234 deletions(-) create mode 100644 benchmark_utils/windowing.py diff --git a/benchmark_utils/models.py b/benchmark_utils/models.py index 1432e97..ad38592 100644 --- a/benchmark_utils/models.py +++ b/benchmark_utils/models.py @@ -211,7 +211,7 @@ def fit( X, num_epochs=50, learning_rate=1e-3, - device="cuda", + device=None, batch_size=32 ): """ @@ -227,6 +227,11 @@ def fit( Returns: List of training losses per epoch """ + if device is None: + device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu" + ) + # Convert to tensor if numpy array if isinstance(X, np.ndarray): X = torch.from_numpy(X).float() diff --git a/benchmark_utils/windowing.py b/benchmark_utils/windowing.py new file mode 100644 index 0000000..40399b9 --- /dev/null +++ b/benchmark_utils/windowing.py @@ -0,0 +1,113 @@ +import numpy as np +import torch +from torch.utils.data import TensorDataset + + +def make_windows(X, window_size=32, stride=1, padding=False): + """Create a windowed view of the data. + + Parameters + ---------- + X : np.ndarray + Input data of shape (n_samples, n_features, n_times). + window_size : int + Size of the sliding window. + stride : int + Stride of the sliding window. + + Returns + ------- + windows : np.ndarray + A windowed view of the data in shape: + (n_eff_samples, window_size, n_features) + """ + + if padding: + n_samples, n_features, n_times = X.shape + n_pad = (window_size - stride + n_times % stride) % stride + pad_width = ((0, 0), (0, 0), (0, n_pad)) + X = np.pad(X, pad_width=pad_width, mode='constant') + + return np.lib.stride_tricks.sliding_window_view( + X, window_shape=window_size, axis=-1 + )[..., ::stride, :].transpose(0, 2, 1, 3).reshape( + -1, X.shape[1], window_size + ).transpose(0, 2, 1) + + +def make_windowed_dataset(X, y=None, window_size=32, stride=1): + """ + Create a DataLoader with windowed views of the data. + + Parameters + ---------- + X : np.ndarray + Input data of shape (n_samples, n_features, n_times). + y : np.ndarray, optional + Target data of shape (n_samples, n_times). + window_size : int + Size of the sliding window. + stride : int + Stride of the sliding window. + + Returns + ------- + Dataset + A PyTorch Dataset with windowed data in shape: + (n_eff_samples, window_size, n_features) + """ + + if window_size is not None: + X = make_windows(X, window_size, stride) + + X_tensor = torch.tensor(X, dtype=torch.float32) + + if y is not None: + if window_size is not None: + y = np.lib.stride_tricks.sliding_window_view( + y, window_shape=window_size, axis=-1 + )[..., ::stride, :].reshape(-1, window_size) + + y_tensor = torch.tensor(y, dtype=torch.float32) + dataset = TensorDataset(X_tensor, y_tensor) + else: + dataset = TensorDataset(X_tensor) + + return dataset + + +def reconstruct_from_windows(windows, stride, batch, n_features): + """Reconstruct the original signal from overlapping windows + + Parameters + ---------- + windows : np.ndarray + The overlapping windows of shape (batch*n_windows, window_size, n_features) + stride : int + The stride used to create the windows + batch : int + The batch size used when creating the windows + n_features : int + The number of features in the original signal + """ + # windows: (batch*n_windows, window_size, n_features) + w = windows.shape[1] + windows = windows.reshape(batch, -1, w, n_features) + b, nw, ws, nf = windows.shape + nt = (nw - 1) * stride + ws + + # allocate accumulator + counts for correct overlap averaging + acc = np.zeros((b, nf, nt)) + cnt = np.zeros((nt,), dtype=int) + + # build index map for overlap positions + idx = np.arange(ws)[:, None] + stride * np.arange(nw) + + # add windows efficiently + np.add.at(acc, (slice(None), slice(None), idx.ravel()), + windows.transpose(0, 3, 1, 2).reshape(b, nf, -1)) + + # count contributions + np.add.at(cnt, idx.ravel(), 1) + + return acc / cnt \ No newline at end of file diff --git a/datasets/msl.py b/datasets/msl.py index db73346..bb7067e 100644 --- a/datasets/msl.py +++ b/datasets/msl.py @@ -65,8 +65,6 @@ def get_data(self): X_test = X_test.T.reshape(1, n_features, -1) y_test = y_test.reshape(1, -1) - print(X_train.shape, X_test.shape, y_test.shape) - return dict( X_train=X_train, y_test=y_test, X_test=X_test ) diff --git a/datasets/simulated.py b/datasets/simulated.py index 28b5474..f41c1b8 100644 --- a/datasets/simulated.py +++ b/datasets/simulated.py @@ -19,10 +19,10 @@ class Dataset(BaseDataset): } test_parameters = { - "n_samples": [500], - "n_features": [1], + "n_samples": [64], + "n_features": [2], "noise": [0.1], - "n_anomaly": [90], + "n_anomaly": [9], } def get_data(self): diff --git a/objective.py b/objective.py index 43cceec..660d7c1 100644 --- a/objective.py +++ b/objective.py @@ -122,5 +122,5 @@ def evaluate_result(self, y_hat, raw_anomaly_score=None): def get_objective(self): return dict( - X_train=self.X_train, y_test=self.y_test, X_test=self.X_test + X_train=self.X_train, X_test=self.X_test ) diff --git a/solvers/AR.py b/solvers/AR.py index 95ea7e1..ba2446e 100644 --- a/solvers/AR.py +++ b/solvers/AR.py @@ -28,7 +28,14 @@ class Solver(BaseSolver): "percentile": [99.4], } - def set_objective(self, X_train, y_test, X_test): + test_config = { + 'solver': { + "n_epochs": 1, + "window_size": 16, + } + } + + def set_objective(self, X_train, X_test): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" @@ -42,7 +49,6 @@ def set_objective(self, X_train, y_test, X_test): self.X_train = X_train.reshape(-1, n_features) # (n_samples, n_features) self.X_test = X_test.reshape(-1, n_features) - self.y_test = y_test.reshape(-1) # (n_samples,) self.model = ARModel( n_features, @@ -59,7 +65,6 @@ def set_objective(self, X_train, y_test, X_test): print("IN AR") print("X_train shape", self.X_train.shape) print("X_test shape", self.X_test.shape) - print("y_test shape", self.y_test.shape) if self.X_train is not None: # (n_windows, window_size+horizon, n_features) @@ -157,7 +162,7 @@ def run(self, _): self.predictions = np.max(predictions, axis=1) # Skipping the solver call if a condition is met - def skip(self, X_train, X_test, y_test): + def skip(self, X_train, X_test): if X_train.shape[0]*X_train.shape[2] < self.window_size + self.horizon: return True, "No enough training samples" if X_test.shape[0]*X_test.shape[2] < self.window_size + self.horizon: diff --git a/solvers/anomalybert.py b/solvers/anomalybert.py index 008bd75..abcda9a 100644 --- a/solvers/anomalybert.py +++ b/solvers/anomalybert.py @@ -35,7 +35,7 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): # X_train shape: (n_series, n_features, n_samples) if X_train.ndim == 3: self.X_train = np.transpose( diff --git a/solvers/autoencoder.py b/solvers/autoencoder.py index 3f68a14..f5a138e 100644 --- a/solvers/autoencoder.py +++ b/solvers/autoencoder.py @@ -24,7 +24,7 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): if self.window_size == "auto": self.window_size = find_length(X_train) @@ -32,7 +32,6 @@ def set_objective(self, X_train, y_test, X_test): n_features = X_train.shape[1] self.X_train = X_train.reshape(-1, n_features) self.X_test = X_test.reshape(-1, n_features) - self.y_test = y_test.reshape(-1) # For multivariate data, input_size = window_size * n_features self.clf = Autoencoder( @@ -59,13 +58,10 @@ def run(self, _): .ravel() ) - def skip(self, X_train, y_test, X_test): + def skip(self, X_train, X_test): """Check if the solver can be skipped.""" if find_length(X_train) == 0 and self.window_size == "auto": return True, "Window size is 0" - from torch.cuda import is_available - if not is_available(): - return True, "AE requires a GPU to run." return False, None def get_result(self): diff --git a/solvers/dagmm.py b/solvers/dagmm.py index 9d54fa0..99182f4 100644 --- a/solvers/dagmm.py +++ b/solvers/dagmm.py @@ -27,18 +27,10 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): - print(X_train.shape, X_test.shape, y_test.shape) - print(X_train.dtype, X_test.dtype, y_test.dtype) - print("Nan in X_train:", np.isnan( - X_train).any(), np.isnan(X_train).sum()) - print("Nan in X_test:", np.isnan(X_test).any(), np.isnan(X_test).sum()) - print("Nan in y_test:", np.isnan(y_test).any(), np.isnan(y_test).sum()) - + def set_objective(self, X_train, X_test): n_features = X_train.shape[1] self.X_train = X_train.transpose(0, 2, 1).reshape(-1, n_features) self.X_test = X_test.transpose(0, 2, 1).reshape(-1, n_features) - self.y_test = y_test.reshape(-1) # Convert to Merlion TimeSeries # We use a default index since we don't have timestamps train_df = pd.DataFrame(self.X_train) diff --git a/solvers/legacy/abod.py b/solvers/legacy/abod.py index 6e4d54e..21d7226 100644 --- a/solvers/legacy/abod.py +++ b/solvers/legacy/abod.py @@ -24,9 +24,9 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test self.clf = ABOD( n_neighbors=self.n_neighbors, contamination=self.contamination, @@ -48,11 +48,6 @@ def run(self, _): self.X_test, window_shape=self.window_size, axis=0 )[::self.stride].transpose(0, 2, 1) - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - # Flattening the data for the model flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) @@ -86,7 +81,7 @@ def run(self, _): ) # Function used to skip a solver call when n_neighbors >= window_size - def skip(self, X_train, X_test, y_test): + def skip(self, X_train, X_test): if self.n_neighbors >= self.window_size: return True, "Number of neighbors greater than number of samples." return False, None diff --git a/solvers/legacy/cblof.py b/solvers/legacy/cblof.py index 60b0994..e3170a9 100644 --- a/solvers/legacy/cblof.py +++ b/solvers/legacy/cblof.py @@ -24,9 +24,9 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test self.clf = CBLOF( contamination=self.contamination, n_clusters=self.n_clusters @@ -47,11 +47,6 @@ def run(self, _): self.X_test, window_shape=self.window_size, axis=0 )[::self.stride].transpose(0, 2, 1) - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - # Flattening the data for the model flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) @@ -84,7 +79,7 @@ def run(self, _): ) # Skipping the solver call if a condition is met - def skip(self, X_train, X_test, y_test): + def skip(self, X_train, X_test): if X_train.shape[0] < self.window_size: return True, "No enough samples to create a window" return False, None diff --git a/solvers/legacy/dif.py b/solvers/legacy/dif.py index 25d2f54..bc0ac0d 100644 --- a/solvers/legacy/dif.py +++ b/solvers/legacy/dif.py @@ -22,9 +22,9 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test # Device is automatically selected by the model # if device=None self.clf = DIF(contamination=self.contamination, device=None) @@ -44,11 +44,6 @@ def run(self, _): self.X_test, window_shape=self.window_size, axis=0 )[::self.stride].transpose(0, 2, 1) - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - # Flattening the data for the model flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) @@ -80,7 +75,7 @@ def run(self, _): np.full(result_shape, -1), self.raw_anomaly_score ) - def skip(self, X_train, X_test, y_test): + def skip(self, X_train, X_test): if X_train.shape[0] < self.window_size: return True, "Not enough samples to create a window" return False, None diff --git a/solvers/legacy/isolation-forest.py b/solvers/legacy/isolation-forest.py index 9df46c7..d8ce688 100644 --- a/solvers/legacy/isolation-forest.py +++ b/solvers/legacy/isolation-forest.py @@ -23,9 +23,9 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test n_recordings, n_features, n_samples = self.X_train.shape self.clf = IsolationForest(contamination=self.contamination) @@ -45,12 +45,6 @@ def run(self, _): self.X_test, window_shape=self.window_size, axis=2 )[:, :, ::self.stride].transpose(0, 1, 3, 2) - if self.y_test is not None: - n_recordings, _, n_samples = self.y_test.shape - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=2 - )[:, :, ::self.stride] - # Flatten for sklearn flatrain = self.Xw_train.reshape( self.Xw_train.shape[0] * self.Xw_train.shape[1], -1) @@ -94,7 +88,7 @@ def run(self, _): self.raw_anomaly_score = self.raw_anomaly_score.reshape( n_recordings, n_samples) - def skip(self, X_train, X_test, y_test): + def skip(self, X_train, X_test): # Skip if dataset size is smaller than window size _, _, n_samples = X_train.shape if n_samples < self.window_size: diff --git a/solvers/legacy/lof.py b/solvers/legacy/lof.py index 1ce2058..be3c64f 100644 --- a/solvers/legacy/lof.py +++ b/solvers/legacy/lof.py @@ -24,9 +24,9 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test self.clf = LocalOutlierFactor( novelty=True, n_neighbors=self.n_neighbors, @@ -46,11 +46,6 @@ def run(self, _): self.X_test, window_shape=self.window_size, axis=0 )[::self.stride].transpose(0, 2, 1) - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) @@ -81,7 +76,7 @@ def run(self, _): np.full(result_shape, -1), self.raw_anomaly_score ) - def skip(self, X_train, y_test, X_test): + def skip(self, X_train, X_test): if self.n_neighbors > self.window_size: return True, "Number of neighbors greater than number of samples." if self.n_neighbors > X_train.shape[0]: diff --git a/solvers/legacy/ocsvm.py b/solvers/legacy/ocsvm.py index 268e57c..7489076 100644 --- a/solvers/legacy/ocsvm.py +++ b/solvers/legacy/ocsvm.py @@ -22,9 +22,9 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test self.clf = OneClassSVM( nu=self.nu, kernel=self.kernel, @@ -42,11 +42,6 @@ def set_objective(self, X_train, y_test, X_test): self.X_test, window_shape=self.window_size, axis=0 )[::self.stride].transpose(0, 2, 1) - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - self.flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) self.flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) @@ -79,7 +74,7 @@ def run(self, _): np.full(result_shape, -1), self.raw_anomaly_score ) - def skip(self, X_train, X_test, y_test): + def skip(self, X_train, X_test): if X_train.shape[0] < self.window_size: return True, "Window size is larger than dataset size." return False, None diff --git a/solvers/lstm.py b/solvers/lstm.py index 6cb6a37..1c126ad 100644 --- a/solvers/lstm.py +++ b/solvers/lstm.py @@ -8,6 +8,8 @@ from torch.utils.data import DataLoader from tqdm import tqdm from benchmark_utils.models import AutoEncoderLSTM +from benchmark_utils.windowing import make_windowed_dataset +from benchmark_utils.windowing import reconstruct_from_windows class Solver(BaseSolver): @@ -23,7 +25,6 @@ class Solver(BaseSolver): "batch_size": [32], "n_epochs": [50], "lr": [1e-5], - "window": [True], "window_size": [256], # window_size = seq_len "stride": [1], "percentile": [97], @@ -31,20 +32,23 @@ class Solver(BaseSolver): "decoder_layers": [32], } - def prepare_data(self, *data): - # return tensors on device - return (torch.tensor( - d, dtype=torch.float32, device=self.device) - for d in data) + test_config = { + 'solver': { + "embedding_dim": 2, + "batch_size": 1, + "n_epochs": 1, + "window_size": 16, + } + } - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test self.n_features = X_train.shape[1] self.seq_len = self.window_size @@ -58,33 +62,15 @@ def set_objective(self, X_train, y_test, X_test): self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr) self.criterion = nn.MSELoss() - if self.window: - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - self.X_train, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - self.Xw_train = torch.tensor( - self.Xw_train, dtype=torch.float32 - ) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - self.X_test, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - self.Xw_test = torch.tensor( - self.Xw_test, dtype=torch.float32 - ) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] + self.Xw_train = make_windowed_dataset( + self.X_train, window_size=self.window_size, + stride=self.stride + ) - self.yw_test = torch.tensor( - self.yw_test, dtype=torch.float32 - ) + self.Xw_test = make_windowed_dataset( + self.X_test, window_size=self.window_size, + stride=self.stride + ) self.train_loader = DataLoader( self.Xw_train, batch_size=self.batch_size, shuffle=True, @@ -104,7 +90,7 @@ def run(self, _): for epoch in ti: self.model.train() train_loss = 0 - for i, x in enumerate(self.train_loader): + for x, in self.train_loader: x = x.to(self.device) @@ -125,17 +111,16 @@ def run(self, _): # Test loop self.model.eval() raw_reconstruction = [] - for x in self.test_loader: + for x, in self.test_loader: x = x.to(self.device) - - x_hat = self.model(x) + with torch.no_grad(): + x_hat = self.model(x) raw_reconstruction.append(x_hat.detach().cpu().numpy()) - - raw_reconstruction = np.concatenate(raw_reconstruction, axis=0) - - reconstructed_data = np.concatenate( - [raw_reconstruction[0], raw_reconstruction[1:, -1, :]], axis=0 + reconstructed_data = np.concatenate(raw_reconstruction, axis=0) + reconstructed_data = reconstruct_from_windows( + reconstructed_data, stride=self.stride, + batch=len(self.X_test), n_features=self.n_features ) reconstruction_err = np.mean( @@ -147,8 +132,8 @@ def run(self, _): reconstruction_err, self.percentile), 1, 0 ) - def skip(self, X_train, X_test, y_test): - if X_train.shape[0] < self.window_size: + def skip(self, X_train, X_test): + if X_train.shape[-1] < self.window_size: return True, "Not enough samples to create a window." return False, None diff --git a/solvers/matrixprofile.py b/solvers/matrixprofile.py index cdad2a9..48ed961 100644 --- a/solvers/matrixprofile.py +++ b/solvers/matrixprofile.py @@ -18,10 +18,10 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): # Shapes received: (n_recordings, n_features, n_samples) self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test n_features = X_train.shape[1] @@ -53,7 +53,7 @@ def run(self, _): print("MP Scored") print(f"Score shape: {self.score.shape}") - def skip(self, X_train, y_test, X_test): + def skip(self, X_train, X_test): """Check if the solver can be skipped.""" if (find_length(X_train.reshape(-1)) == 0) and ( self.window_size == "auto"): diff --git a/solvers/rosecdl.py b/solvers/rosecdl.py index 97ee18d..beabd79 100644 --- a/solvers/rosecdl.py +++ b/solvers/rosecdl.py @@ -39,12 +39,11 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") # We receive data in shape (n_recordings, n_features, n_samples) - self.y_test = y_test self.X_train = torch.tensor( X_train, dtype=torch.float32, device=self.device) self.X_test = X_test diff --git a/solvers/tsb_chronos.py b/solvers/tsb_chronos.py index c9f12be..9c87358 100644 --- a/solvers/tsb_chronos.py +++ b/solvers/tsb_chronos.py @@ -21,7 +21,7 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): _, n_features, _ = X_train.shape self.data = np.append(X_train, X_test, axis=2) self.data = self.data.reshape(-1, n_features) diff --git a/solvers/tsb_timesfm.py b/solvers/tsb_timesfm.py index a5a7346..77a69b7 100644 --- a/solvers/tsb_timesfm.py +++ b/solvers/tsb_timesfm.py @@ -9,7 +9,7 @@ class Solver(BaseSolver): name = "TSB-TimesFM" install_cmd = "conda" - requirements = ["pip::tsb-ad"] + requirements = ["pip::tsb-ad", "pip::timesfm"] parameters = { "win_size": [256], @@ -17,7 +17,7 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): _, n_features, _ = X_train.shape self.data = np.append(X_train, X_test, axis=2) self.data = self.data.reshape(-1, n_features) diff --git a/solvers/tsb_timesnet.py b/solvers/tsb_timesnet.py index baad0e8..76ed261 100644 --- a/solvers/tsb_timesnet.py +++ b/solvers/tsb_timesnet.py @@ -17,7 +17,7 @@ class Solver(BaseSolver): sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): _, n_features, _ = X_train.shape self.X_train = X_train.reshape(-1, n_features) self.X_test = X_test.reshape(-1, n_features) diff --git a/solvers/vae.py b/solvers/vae.py index 8456918..75e66c9 100644 --- a/solvers/vae.py +++ b/solvers/vae.py @@ -4,6 +4,8 @@ import numpy as np from pyod.models.vae import VAE +from benchmark_utils.windowing import make_windows + class Solver(BaseSolver): name = "VAE" @@ -16,61 +18,55 @@ class Solver(BaseSolver): parameters = { "contamination": [0.005, 0.05, 0.1, 0.2], "n_epochs": [50], - "window": [False], "window_size": [256], "horizon": [0], "stride": [1], "batch_size": [128], - "preprocessing": [True, False], + "preprocessing": [True], "latent_dim": [2, 5, 10], - "batch_norm": [True, False], + "batch_norm": [True], "dropout_rate": [0.1, 0.2, 0.5], } + test_config = { + 'solver': { + "n_epochs": 1, + "window_size": 16, + } + } - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) self.X_train = X_train - self.X_test, self.y_test = X_test, y_test - - self.clf = VAE(contamination=self.contamination, - preprocessing=self.preprocessing, - batch_size=self.batch_size, - epoch_num=self.n_epochs, - device=self.device, - latent_dim=self.latent_dim, - batch_norm=self.batch_norm, - dropout_rate=self.dropout_rate, - ) - - if self.window: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - X_train, - window_shape=self.window_size+self.horizon, - axis=0 - ).transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - X_test, - window_shape=self.window_size+self.horizon, - axis=0 - ).transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - self.yw_test = torch.tensor( - self.yw_test, dtype=torch.float32 - ) - else: - self.Xw_train = X_train - self.Xw_test = X_test + self.X_test = X_test + + self.Xw_train = make_windows( + X_train, + window_size=self.window_size, + stride=self.stride + ).reshape(-1, self.window_size * X_train.shape[1]) + + self.Xw_test = make_windows( + X_test, + window_size=self.window_size+self.horizon, + stride=self.stride, + padding=True + ).reshape(-1, self.window_size * X_train.shape[1]) + + self.clf = VAE( + contamination=self.contamination, + preprocessing=self.preprocessing, + batch_size=min(self.batch_size, len(self.Xw_train)), + epoch_num=self.n_epochs, + device=self.device, + latent_dim=self.latent_dim, + batch_norm=self.batch_norm, + dropout_rate=self.dropout_rate, + lr=1e-5 + ) def run(self, _): self.clf.fit(self.Xw_train) diff --git a/solvers/vanilla-transformer.py b/solvers/vanilla-transformer.py index 25eb373..60b06be 100644 --- a/solvers/vanilla-transformer.py +++ b/solvers/vanilla-transformer.py @@ -1,13 +1,16 @@ # Vanilla Transformer from benchopt import BaseSolver +import numpy as np +from tqdm import tqdm import torch import torch.nn as nn import torch.optim as optim -import numpy as np -from tqdm import tqdm -from benchmark_utils import mean_overlaping_pred +from torch.utils.data import DataLoader + from benchmark_utils.models import TransformerModel +from benchmark_utils.windowing import make_windowed_dataset +from benchmark_utils.windowing import reconstruct_from_windows class Solver(BaseSolver): @@ -26,20 +29,25 @@ class Solver(BaseSolver): "n_epochs": [50], "lr": [1e-5], "horizon": [1], - "window": [True], "window_size": [256], "stride": [1], "percentile": [97], } + test_config = { + 'solver': { + "n_epochs": 1, + "window_size": 16, + } + } - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test self.model = TransformerModel( n_features=X_train.shape[1], @@ -56,30 +64,22 @@ def set_objective(self, X_train, y_test, X_test): self.optimizer, mode='min', factor=0.5, patience=5 ) - # Using only windowed data, parameter used only for consistency - if self.window: - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - X_train, - window_shape=self.window_size+self.horizon, - axis=0 - ).transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - X_test, - window_shape=self.window_size+self.horizon, - axis=0 - ).transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - self.yw_test = torch.tensor( - self.yw_test, dtype=torch.float32 - ) + self.Xw_train = make_windowed_dataset( + X_train, + window_size=self.window_size+self.horizon, + stride=self.stride + ) + self.Xw_test = make_windowed_dataset( + X_test, + window_size=self.window_size+self.horizon, + stride=self.stride + ) + self.train_loader = DataLoader( + self.Xw_train, batch_size=self.batch_size, shuffle=True, + ) + self.test_loader = DataLoader( + self.Xw_test, batch_size=self.batch_size, shuffle=False, + ) def run(self, _): self.model.to(self.device) @@ -95,13 +95,10 @@ def run(self, _): for epoch in ti: self.model.train() total_loss = 0 - for i in range(0, len(self.Xw_train), self.batch_size): - x = torch.tensor( - self.Xw_train[i:i+self.batch_size, :self.window_size, :], - dtype=torch.float32).to(self.device) - y = torch.tensor( - self.Xw_train[i:i+self.batch_size, -self.horizon:, :], - dtype=torch.float32).to(self.device) + for x, in self.train_loader: + x = x.to(self.device) + y = x[:, -self.horizon:] + x = x[:, :-self.horizon] self.optimizer.zero_grad() output = self.model(x) @@ -133,50 +130,42 @@ def run(self, _): # Test loop self.model.eval() - batch_size = 1024 all_predictions = [] with torch.no_grad(): - for i in range(0, len(self.Xw_test), batch_size): - batch = torch.tensor( - self.Xw_test[i:i+batch_size, :self.window_size, :], - dtype=torch.float32 - ).to(self.device) - - batch_predictions = self.model(batch) - - if batch_predictions.is_cuda: - batch_predictions = batch_predictions.cpu().numpy() - else: - batch_predictions = batch_predictions.numpy() - - all_predictions.append(batch_predictions) + for x, in self.test_loader: + batch = x[:, :self.window_size].to(self.device) + with torch.no_grad(): + batch_predictions = self.model(batch) + all_predictions.append(batch_predictions.cpu().numpy()) xw_hat = np.concatenate(all_predictions, axis=0) # Continue with the rest of your code for reconstructing predictions x_hat = np.zeros_like(self.X_test) - 1 - x_hat[self.window_size:self.window_size+self.horizon] = xw_hat[0] - x_hat[self.window_size+self.horizon:] = mean_overlaping_pred( - xw_hat, 1) + x_hat[..., self.window_size:] = reconstruct_from_windows( + xw_hat, stride=self.stride, batch=len(self.X_test), + n_features=self.X_test.shape[1] + ) # Calculating the percentile value for the threshold percentile_value = np.percentile( - np.abs(self.X_test[self.window_size:] - x_hat[self.window_size:]), + np.abs(self.X_test[..., self.window_size:] + - x_hat[..., self.window_size:]), self.percentile ) # Thresholding - predictions = np.zeros_like(x_hat)-1 - predictions[self.window_size:] = np.where( - np.abs(self.X_test[self.window_size:] - - x_hat[self.window_size:]) > percentile_value, 1, 0 + predictions = np.zeros_like(self.X_test)-1 + predictions[..., self.window_size:] = np.where( + np.abs(self.X_test[..., self.window_size:] - + x_hat[..., self.window_size:]) > percentile_value, 1, 0 ) self.predictions = np.max(predictions, axis=1) - def skip(self, X_train, X_test, y_test): - if X_train.shape[0] < self.window_size + self.horizon: + def skip(self, X_train, X_test): + if X_train.shape[-1] < self.window_size + self.horizon: return True, "No enough training samples" return False, None diff --git a/test_config.py b/test_config.py index 77dd85a..fd149be 100644 --- a/test_config.py +++ b/test_config.py @@ -32,7 +32,7 @@ def check_test_solver_install(benchmark, solver_class): def check_test_dataset_get_data(benchmark, data_class): if data_class.name.lower() in [ "daphnet", "dodgers", "ecg", "genesis", "ghl", - "iops", "kdd21", "mgab", "mitdb", "msl", "nab", + "iops", "kdd21", "mgab", "mitdb", "nab", "occupancy", "opportunity", "sensorscope", "smd", "svdb", "yahoo" ]: From 4d2bbcedd7fdcf56581866481f521d6d27988b8b Mon Sep 17 00:00:00 2001 From: Jad Yehya Date: Tue, 12 May 2026 17:30:27 +0200 Subject: [PATCH 35/50] CLN remove safe_import_context --- benchmark_utils/__init__.py | 4 +--- benchmark_utils/metrics.py | 5 +---- datasets/daphnet.py | 13 ++++++------- datasets/dodgers.py | 11 +++++------ datasets/ecg.py | 11 +++++------ datasets/genesis.py | 11 +++++------ datasets/ghl.py | 11 +++++------ datasets/iops.py | 13 ++++++------- datasets/kdd21.py | 11 +++++------ datasets/mgab.py | 11 +++++------ datasets/mitdb.py | 11 +++++------ datasets/msl.py | 7 +++---- datasets/nab.py | 11 +++++------ datasets/occupancy.py | 11 +++++------ datasets/opportunity.py | 11 +++++------ datasets/pattern.py | 7 +++---- datasets/psm.py | 7 +++---- datasets/sensorscope.py | 11 +++++------ datasets/simulated.py | 7 +++---- datasets/smap.py | 9 ++++----- datasets/smd.py | 11 +++++------ datasets/svdb.py | 13 ++++++------- datasets/swat.py | 13 ++++++------- datasets/trend.py | 7 +++---- datasets/wadi.py | 13 ++++++------- datasets/yahoo.py | 11 +++++------ objective.py | 21 ++++++++++----------- solvers/legacy/abod.py | 6 ++---- solvers/legacy/cblof.py | 6 ++---- solvers/legacy/dif.py | 6 ++---- solvers/legacy/isolation-forest.py | 6 ++---- solvers/legacy/lof.py | 6 ++---- solvers/legacy/ocsvm.py | 7 +++---- 33 files changed, 139 insertions(+), 180 deletions(-) diff --git a/benchmark_utils/__init__.py b/benchmark_utils/__init__.py index 5f8fa37..cd5be6c 100644 --- a/benchmark_utils/__init__.py +++ b/benchmark_utils/__init__.py @@ -3,11 +3,9 @@ # name `benchmark_utils`, and code defined inside will be importable using # the usual import syntax -from benchopt import safe_import_context from pathlib import Path -with safe_import_context() as import_ctx: - import numpy as np +import numpy as np def mean_overlaping_pred(predictions, stride): diff --git a/benchmark_utils/metrics.py b/benchmark_utils/metrics.py index 4607670..febe234 100644 --- a/benchmark_utils/metrics.py +++ b/benchmark_utils/metrics.py @@ -1,7 +1,4 @@ -from benchopt import safe_import_context - -with safe_import_context() as import_ctx: - import numpy as np +import numpy as np def soft_precision(y_true: np.ndarray, diff --git a/datasets/daphnet.py b/datasets/daphnet.py index 66694f0..e2d42bc 100644 --- a/datasets/daphnet.py +++ b/datasets/daphnet.py @@ -1,12 +1,11 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd - import matplotlib.pyplot as plt +from pathlib import Path +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt - PATH = config.get_data_path("DAPHNET") +PATH = config.get_data_path("DAPHNET") def load_data(db_path, record_ids=None, verbose=False, number=-1): diff --git a/datasets/dodgers.py b/datasets/dodgers.py index c3c6e02..f3c6879 100644 --- a/datasets/dodgers.py +++ b/datasets/dodgers.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("DODGERS") +PATH = config.get_data_path("DODGERS") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/ecg.py b/datasets/ecg.py index 04357e5..81f1986 100644 --- a/datasets/ecg.py +++ b/datasets/ecg.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("ECG") +PATH = config.get_data_path("ECG") def load_data(db_path, record_ids=None, verbose=False, number=-1): diff --git a/datasets/genesis.py b/datasets/genesis.py index 4e3f00d..8425d89 100644 --- a/datasets/genesis.py +++ b/datasets/genesis.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("GENESIS") +PATH = config.get_data_path("GENESIS") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/ghl.py b/datasets/ghl.py index 3da6f93..dd102dd 100644 --- a/datasets/ghl.py +++ b/datasets/ghl.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("GHL") +PATH = config.get_data_path("GHL") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/iops.py b/datasets/iops.py index 7efcb1e..12df7ef 100644 --- a/datasets/iops.py +++ b/datasets/iops.py @@ -1,12 +1,11 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("IOPS") - PATH = "/data/parietal/store2/data/tsb-uad/TSB-UAD-Public/IOPS/" +PATH = config.get_data_path("IOPS") +PATH = "/data/parietal/store2/data/tsb-uad/TSB-UAD-Public/IOPS/" def load_data(db_path, verbose=False): diff --git a/datasets/kdd21.py b/datasets/kdd21.py index 6691919..3d0da0b 100644 --- a/datasets/kdd21.py +++ b/datasets/kdd21.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("KDD21") +PATH = config.get_data_path("KDD21") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/mgab.py b/datasets/mgab.py index 7006bbe..ac00972 100644 --- a/datasets/mgab.py +++ b/datasets/mgab.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("MGAB") +PATH = config.get_data_path("MGAB") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/mitdb.py b/datasets/mitdb.py index 3af87bd..7f811d0 100644 --- a/datasets/mitdb.py +++ b/datasets/mitdb.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("MITDB") +PATH = config.get_data_path("MITDB") def load_mitdb_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/msl.py b/datasets/msl.py index bb7067e..aaa2fe2 100644 --- a/datasets/msl.py +++ b/datasets/msl.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - import numpy as np - import requests +import numpy as np +import requests # Create global variables to store the urls URL_XTRAIN = ( diff --git a/datasets/nab.py b/datasets/nab.py index afa7612..20a0960 100644 --- a/datasets/nab.py +++ b/datasets/nab.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("NAB") +PATH = config.get_data_path("NAB") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/occupancy.py b/datasets/occupancy.py index a985586..cddb6e5 100644 --- a/datasets/occupancy.py +++ b/datasets/occupancy.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("OCCUPANCY") +PATH = config.get_data_path("OCCUPANCY") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/opportunity.py b/datasets/opportunity.py index 248d17e..3968a2b 100644 --- a/datasets/opportunity.py +++ b/datasets/opportunity.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("OPPORTUNITY") +PATH = config.get_data_path("OPPORTUNITY") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/pattern.py b/datasets/pattern.py index 0545cc4..42b4cd3 100644 --- a/datasets/pattern.py +++ b/datasets/pattern.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset -with safe_import_context() as import_ctx: - import numpy as np - from rosecdl.utils.utils_signal import generate_experiment +import numpy as np +from rosecdl.utils.utils_signal import generate_experiment class Dataset(BaseDataset): diff --git a/datasets/psm.py b/datasets/psm.py index bd5e60f..b5ce22c 100644 --- a/datasets/psm.py +++ b/datasets/psm.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - import requests - import pandas as pd +import requests +import pandas as pd URL_XTRAIN = ( "https://drive.google.com/uc?&id=1d3tAbYTj0CZLhB7z3IDTfTRg3E7qj_tw" diff --git a/datasets/sensorscope.py b/datasets/sensorscope.py index 1e5370d..7bcdb9a 100644 --- a/datasets/sensorscope.py +++ b/datasets/sensorscope.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("SENSORSCOPE") +PATH = config.get_data_path("SENSORSCOPE") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/simulated.py b/datasets/simulated.py index f41c1b8..a91b101 100644 --- a/datasets/simulated.py +++ b/datasets/simulated.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset -with safe_import_context() as import_ctx: - from sklearn.datasets import make_regression - import numpy as np +from sklearn.datasets import make_regression +import numpy as np class Dataset(BaseDataset): diff --git a/datasets/smap.py b/datasets/smap.py index 8d30ca9..756250e 100644 --- a/datasets/smap.py +++ b/datasets/smap.py @@ -1,9 +1,8 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - import numpy as np - import requests - # from sklearn.model_selection import TimeSeriesSplit +import numpy as np +import requests +# from sklearn.model_selection import TimeSeriesSplit URL_XTRAIN = ( "https://drive.google.com/uc?&id=1e_JhpIURD" diff --git a/datasets/smd.py b/datasets/smd.py index 8865fde..d258391 100644 --- a/datasets/smd.py +++ b/datasets/smd.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("SMD") +PATH = config.get_data_path("SMD") def load_data(db_path, record_ids=None): diff --git a/datasets/svdb.py b/datasets/svdb.py index 31f72b8..ea127c6 100644 --- a/datasets/svdb.py +++ b/datasets/svdb.py @@ -1,12 +1,11 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd - import matplotlib.pyplot as plt +from pathlib import Path +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt - PATH = config.get_data_path("SVDB") +PATH = config.get_data_path("SVDB") def load_data(db_path, record_ids=None, verbose=False, number=-1): diff --git a/datasets/swat.py b/datasets/swat.py index ffd9758..943aad6 100644 --- a/datasets/swat.py +++ b/datasets/swat.py @@ -1,14 +1,13 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset from benchopt.config import get_data_path from benchmark_utils import check_data -with safe_import_context() as import_ctx: - import pandas as pd +import pandas as pd - # Checking if the data is available - PATH = get_data_path(key="SWaT") - TRAIN_PATH = check_data(PATH, "SWaT", "train") - TEST_PATH = check_data(PATH, "SWaT", "test") +# Checking if the data is available +PATH = get_data_path(key="SWaT") +TRAIN_PATH = check_data(PATH, "SWaT", "train") +TEST_PATH = check_data(PATH, "SWaT", "test") class Dataset(BaseDataset): diff --git a/datasets/trend.py b/datasets/trend.py index a1a4cfb..44db101 100644 --- a/datasets/trend.py +++ b/datasets/trend.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset -with safe_import_context() as import_ctx: - import numpy as np - from rosecdl.utils.utils_signal import generate_experiment +import numpy as np +from rosecdl.utils.utils_signal import generate_experiment class Dataset(BaseDataset): diff --git a/datasets/wadi.py b/datasets/wadi.py index c5c89be..1c5c502 100644 --- a/datasets/wadi.py +++ b/datasets/wadi.py @@ -1,14 +1,13 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset from benchopt.config import get_data_path from benchmark_utils import check_data -with safe_import_context() as import_ctx: - import pandas as pd +import pandas as pd - # Checking if the data is available - PATH = get_data_path(key="WADI") - TRAIN_PATH = check_data(PATH, "WADI", "train") - TEST_PATH = check_data(PATH, "WADI", "test") +# Checking if the data is available +PATH = get_data_path(key="WADI") +TRAIN_PATH = check_data(PATH, "WADI", "train") +TEST_PATH = check_data(PATH, "WADI", "test") class Dataset(BaseDataset): diff --git a/datasets/yahoo.py b/datasets/yahoo.py index adc3cb1..181ef0e 100644 --- a/datasets/yahoo.py +++ b/datasets/yahoo.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("YAHOO") +PATH = config.get_data_path("YAHOO") def load_data(db_path, record_ids=None, verbose=False): diff --git a/objective.py b/objective.py index 660d7c1..48ee825 100644 --- a/objective.py +++ b/objective.py @@ -1,4 +1,4 @@ -from benchopt import BaseObjective, safe_import_context +from benchopt import BaseObjective from benchmark_utils.metrics import ( soft_precision as soft_precision_score, soft_recall as soft_recall_score, @@ -10,16 +10,15 @@ f1_t as f1_t_score ) -with safe_import_context() as import_ctx: - import numpy as np - from sklearn.metrics import ( - precision_score, - recall_score, - f1_score, - zero_one_loss, - roc_auc_score, - precision_recall_curve, - ) +import numpy as np +from sklearn.metrics import ( + precision_score, + recall_score, + f1_score, + zero_one_loss, + roc_auc_score, + precision_recall_curve, +) class Objective(BaseObjective): diff --git a/solvers/legacy/abod.py b/solvers/legacy/abod.py index 21d7226..52e6e53 100644 --- a/solvers/legacy/abod.py +++ b/solvers/legacy/abod.py @@ -1,11 +1,9 @@ # ABOD solver from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from pyod.models.abod import ABOD - import numpy as np +from pyod.models.abod import ABOD +import numpy as np class Solver(BaseSolver): diff --git a/solvers/legacy/cblof.py b/solvers/legacy/cblof.py index e3170a9..452be07 100644 --- a/solvers/legacy/cblof.py +++ b/solvers/legacy/cblof.py @@ -1,11 +1,9 @@ # Cluster Based Local Outlier Factor (CBLOF) solver from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from pyod.models.cblof import CBLOF - import numpy as np +from pyod.models.cblof import CBLOF +import numpy as np class Solver(BaseSolver): diff --git a/solvers/legacy/dif.py b/solvers/legacy/dif.py index bc0ac0d..b3b1f5f 100644 --- a/solvers/legacy/dif.py +++ b/solvers/legacy/dif.py @@ -1,10 +1,8 @@ # Deep Isolation Forest from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from pyod.models.dif import DIF - import numpy as np +from pyod.models.dif import DIF +import numpy as np class Solver(BaseSolver): diff --git a/solvers/legacy/isolation-forest.py b/solvers/legacy/isolation-forest.py index d8ce688..9215294 100644 --- a/solvers/legacy/isolation-forest.py +++ b/solvers/legacy/isolation-forest.py @@ -1,11 +1,9 @@ # Isolation Forest solver from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from sklearn.ensemble import IsolationForest - import numpy as np +from sklearn.ensemble import IsolationForest +import numpy as np class Solver(BaseSolver): diff --git a/solvers/legacy/lof.py b/solvers/legacy/lof.py index be3c64f..00e6534 100644 --- a/solvers/legacy/lof.py +++ b/solvers/legacy/lof.py @@ -1,11 +1,9 @@ # Local Outlier Factor from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from sklearn.neighbors import LocalOutlierFactor - import numpy as np +from sklearn.neighbors import LocalOutlierFactor +import numpy as np class Solver(BaseSolver): diff --git a/solvers/legacy/ocsvm.py b/solvers/legacy/ocsvm.py index 7489076..96f2f6f 100644 --- a/solvers/legacy/ocsvm.py +++ b/solvers/legacy/ocsvm.py @@ -1,8 +1,7 @@ -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - from sklearn.svm import OneClassSVM - import numpy as np +from sklearn.svm import OneClassSVM +import numpy as np class Solver(BaseSolver): From 722fca76401d452e694b02bd8c28b87f73e68141 Mon Sep 17 00:00:00 2001 From: Jad Date: Mon, 18 May 2026 14:43:50 +0200 Subject: [PATCH 36/50] CLN remove safe_import_context (#32) --- benchmark_utils/__init__.py | 4 +--- benchmark_utils/metrics.py | 5 +---- datasets/daphnet.py | 13 ++++++------- datasets/dodgers.py | 11 +++++------ datasets/ecg.py | 11 +++++------ datasets/genesis.py | 11 +++++------ datasets/ghl.py | 11 +++++------ datasets/iops.py | 13 ++++++------- datasets/kdd21.py | 11 +++++------ datasets/mgab.py | 11 +++++------ datasets/mitdb.py | 11 +++++------ datasets/msl.py | 7 +++---- datasets/nab.py | 11 +++++------ datasets/occupancy.py | 11 +++++------ datasets/opportunity.py | 11 +++++------ datasets/pattern.py | 7 +++---- datasets/psm.py | 7 +++---- datasets/sensorscope.py | 11 +++++------ datasets/simulated.py | 7 +++---- datasets/smap.py | 9 ++++----- datasets/smd.py | 11 +++++------ datasets/svdb.py | 13 ++++++------- datasets/swat.py | 13 ++++++------- datasets/trend.py | 7 +++---- datasets/wadi.py | 13 ++++++------- datasets/yahoo.py | 11 +++++------ objective.py | 21 ++++++++++----------- solvers/legacy/abod.py | 6 ++---- solvers/legacy/cblof.py | 6 ++---- solvers/legacy/dif.py | 6 ++---- solvers/legacy/isolation-forest.py | 6 ++---- solvers/legacy/lof.py | 6 ++---- solvers/legacy/ocsvm.py | 7 +++---- 33 files changed, 139 insertions(+), 180 deletions(-) diff --git a/benchmark_utils/__init__.py b/benchmark_utils/__init__.py index 5f8fa37..cd5be6c 100644 --- a/benchmark_utils/__init__.py +++ b/benchmark_utils/__init__.py @@ -3,11 +3,9 @@ # name `benchmark_utils`, and code defined inside will be importable using # the usual import syntax -from benchopt import safe_import_context from pathlib import Path -with safe_import_context() as import_ctx: - import numpy as np +import numpy as np def mean_overlaping_pred(predictions, stride): diff --git a/benchmark_utils/metrics.py b/benchmark_utils/metrics.py index 4607670..febe234 100644 --- a/benchmark_utils/metrics.py +++ b/benchmark_utils/metrics.py @@ -1,7 +1,4 @@ -from benchopt import safe_import_context - -with safe_import_context() as import_ctx: - import numpy as np +import numpy as np def soft_precision(y_true: np.ndarray, diff --git a/datasets/daphnet.py b/datasets/daphnet.py index 66694f0..e2d42bc 100644 --- a/datasets/daphnet.py +++ b/datasets/daphnet.py @@ -1,12 +1,11 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd - import matplotlib.pyplot as plt +from pathlib import Path +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt - PATH = config.get_data_path("DAPHNET") +PATH = config.get_data_path("DAPHNET") def load_data(db_path, record_ids=None, verbose=False, number=-1): diff --git a/datasets/dodgers.py b/datasets/dodgers.py index c3c6e02..f3c6879 100644 --- a/datasets/dodgers.py +++ b/datasets/dodgers.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("DODGERS") +PATH = config.get_data_path("DODGERS") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/ecg.py b/datasets/ecg.py index 04357e5..81f1986 100644 --- a/datasets/ecg.py +++ b/datasets/ecg.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("ECG") +PATH = config.get_data_path("ECG") def load_data(db_path, record_ids=None, verbose=False, number=-1): diff --git a/datasets/genesis.py b/datasets/genesis.py index 4e3f00d..8425d89 100644 --- a/datasets/genesis.py +++ b/datasets/genesis.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("GENESIS") +PATH = config.get_data_path("GENESIS") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/ghl.py b/datasets/ghl.py index 3da6f93..dd102dd 100644 --- a/datasets/ghl.py +++ b/datasets/ghl.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("GHL") +PATH = config.get_data_path("GHL") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/iops.py b/datasets/iops.py index 7efcb1e..12df7ef 100644 --- a/datasets/iops.py +++ b/datasets/iops.py @@ -1,12 +1,11 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("IOPS") - PATH = "/data/parietal/store2/data/tsb-uad/TSB-UAD-Public/IOPS/" +PATH = config.get_data_path("IOPS") +PATH = "/data/parietal/store2/data/tsb-uad/TSB-UAD-Public/IOPS/" def load_data(db_path, verbose=False): diff --git a/datasets/kdd21.py b/datasets/kdd21.py index 6691919..3d0da0b 100644 --- a/datasets/kdd21.py +++ b/datasets/kdd21.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("KDD21") +PATH = config.get_data_path("KDD21") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/mgab.py b/datasets/mgab.py index 7006bbe..ac00972 100644 --- a/datasets/mgab.py +++ b/datasets/mgab.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("MGAB") +PATH = config.get_data_path("MGAB") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/mitdb.py b/datasets/mitdb.py index 3af87bd..7f811d0 100644 --- a/datasets/mitdb.py +++ b/datasets/mitdb.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("MITDB") +PATH = config.get_data_path("MITDB") def load_mitdb_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/msl.py b/datasets/msl.py index bb7067e..aaa2fe2 100644 --- a/datasets/msl.py +++ b/datasets/msl.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - import numpy as np - import requests +import numpy as np +import requests # Create global variables to store the urls URL_XTRAIN = ( diff --git a/datasets/nab.py b/datasets/nab.py index afa7612..20a0960 100644 --- a/datasets/nab.py +++ b/datasets/nab.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("NAB") +PATH = config.get_data_path("NAB") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/occupancy.py b/datasets/occupancy.py index a985586..cddb6e5 100644 --- a/datasets/occupancy.py +++ b/datasets/occupancy.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("OCCUPANCY") +PATH = config.get_data_path("OCCUPANCY") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/opportunity.py b/datasets/opportunity.py index 248d17e..3968a2b 100644 --- a/datasets/opportunity.py +++ b/datasets/opportunity.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("OPPORTUNITY") +PATH = config.get_data_path("OPPORTUNITY") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/pattern.py b/datasets/pattern.py index 0545cc4..42b4cd3 100644 --- a/datasets/pattern.py +++ b/datasets/pattern.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset -with safe_import_context() as import_ctx: - import numpy as np - from rosecdl.utils.utils_signal import generate_experiment +import numpy as np +from rosecdl.utils.utils_signal import generate_experiment class Dataset(BaseDataset): diff --git a/datasets/psm.py b/datasets/psm.py index bd5e60f..b5ce22c 100644 --- a/datasets/psm.py +++ b/datasets/psm.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - import requests - import pandas as pd +import requests +import pandas as pd URL_XTRAIN = ( "https://drive.google.com/uc?&id=1d3tAbYTj0CZLhB7z3IDTfTRg3E7qj_tw" diff --git a/datasets/sensorscope.py b/datasets/sensorscope.py index 1e5370d..7bcdb9a 100644 --- a/datasets/sensorscope.py +++ b/datasets/sensorscope.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("SENSORSCOPE") +PATH = config.get_data_path("SENSORSCOPE") def load_data(db_path, record_ids=None, verbose=False): diff --git a/datasets/simulated.py b/datasets/simulated.py index f41c1b8..a91b101 100644 --- a/datasets/simulated.py +++ b/datasets/simulated.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset -with safe_import_context() as import_ctx: - from sklearn.datasets import make_regression - import numpy as np +from sklearn.datasets import make_regression +import numpy as np class Dataset(BaseDataset): diff --git a/datasets/smap.py b/datasets/smap.py index 8d30ca9..756250e 100644 --- a/datasets/smap.py +++ b/datasets/smap.py @@ -1,9 +1,8 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - import numpy as np - import requests - # from sklearn.model_selection import TimeSeriesSplit +import numpy as np +import requests +# from sklearn.model_selection import TimeSeriesSplit URL_XTRAIN = ( "https://drive.google.com/uc?&id=1e_JhpIURD" diff --git a/datasets/smd.py b/datasets/smd.py index 8865fde..d258391 100644 --- a/datasets/smd.py +++ b/datasets/smd.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("SMD") +PATH = config.get_data_path("SMD") def load_data(db_path, record_ids=None): diff --git a/datasets/svdb.py b/datasets/svdb.py index 31f72b8..ea127c6 100644 --- a/datasets/svdb.py +++ b/datasets/svdb.py @@ -1,12 +1,11 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd - import matplotlib.pyplot as plt +from pathlib import Path +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt - PATH = config.get_data_path("SVDB") +PATH = config.get_data_path("SVDB") def load_data(db_path, record_ids=None, verbose=False, number=-1): diff --git a/datasets/swat.py b/datasets/swat.py index ffd9758..943aad6 100644 --- a/datasets/swat.py +++ b/datasets/swat.py @@ -1,14 +1,13 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset from benchopt.config import get_data_path from benchmark_utils import check_data -with safe_import_context() as import_ctx: - import pandas as pd +import pandas as pd - # Checking if the data is available - PATH = get_data_path(key="SWaT") - TRAIN_PATH = check_data(PATH, "SWaT", "train") - TEST_PATH = check_data(PATH, "SWaT", "test") +# Checking if the data is available +PATH = get_data_path(key="SWaT") +TRAIN_PATH = check_data(PATH, "SWaT", "train") +TEST_PATH = check_data(PATH, "SWaT", "test") class Dataset(BaseDataset): diff --git a/datasets/trend.py b/datasets/trend.py index a1a4cfb..44db101 100644 --- a/datasets/trend.py +++ b/datasets/trend.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset -with safe_import_context() as import_ctx: - import numpy as np - from rosecdl.utils.utils_signal import generate_experiment +import numpy as np +from rosecdl.utils.utils_signal import generate_experiment class Dataset(BaseDataset): diff --git a/datasets/wadi.py b/datasets/wadi.py index c5c89be..1c5c502 100644 --- a/datasets/wadi.py +++ b/datasets/wadi.py @@ -1,14 +1,13 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset from benchopt.config import get_data_path from benchmark_utils import check_data -with safe_import_context() as import_ctx: - import pandas as pd +import pandas as pd - # Checking if the data is available - PATH = get_data_path(key="WADI") - TRAIN_PATH = check_data(PATH, "WADI", "train") - TEST_PATH = check_data(PATH, "WADI", "test") +# Checking if the data is available +PATH = get_data_path(key="WADI") +TRAIN_PATH = check_data(PATH, "WADI", "train") +TEST_PATH = check_data(PATH, "WADI", "test") class Dataset(BaseDataset): diff --git a/datasets/yahoo.py b/datasets/yahoo.py index adc3cb1..181ef0e 100644 --- a/datasets/yahoo.py +++ b/datasets/yahoo.py @@ -1,11 +1,10 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - from pathlib import Path - import numpy as np - import pandas as pd +from pathlib import Path +import numpy as np +import pandas as pd - PATH = config.get_data_path("YAHOO") +PATH = config.get_data_path("YAHOO") def load_data(db_path, record_ids=None, verbose=False): diff --git a/objective.py b/objective.py index 660d7c1..48ee825 100644 --- a/objective.py +++ b/objective.py @@ -1,4 +1,4 @@ -from benchopt import BaseObjective, safe_import_context +from benchopt import BaseObjective from benchmark_utils.metrics import ( soft_precision as soft_precision_score, soft_recall as soft_recall_score, @@ -10,16 +10,15 @@ f1_t as f1_t_score ) -with safe_import_context() as import_ctx: - import numpy as np - from sklearn.metrics import ( - precision_score, - recall_score, - f1_score, - zero_one_loss, - roc_auc_score, - precision_recall_curve, - ) +import numpy as np +from sklearn.metrics import ( + precision_score, + recall_score, + f1_score, + zero_one_loss, + roc_auc_score, + precision_recall_curve, +) class Objective(BaseObjective): diff --git a/solvers/legacy/abod.py b/solvers/legacy/abod.py index 21d7226..52e6e53 100644 --- a/solvers/legacy/abod.py +++ b/solvers/legacy/abod.py @@ -1,11 +1,9 @@ # ABOD solver from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from pyod.models.abod import ABOD - import numpy as np +from pyod.models.abod import ABOD +import numpy as np class Solver(BaseSolver): diff --git a/solvers/legacy/cblof.py b/solvers/legacy/cblof.py index e3170a9..452be07 100644 --- a/solvers/legacy/cblof.py +++ b/solvers/legacy/cblof.py @@ -1,11 +1,9 @@ # Cluster Based Local Outlier Factor (CBLOF) solver from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from pyod.models.cblof import CBLOF - import numpy as np +from pyod.models.cblof import CBLOF +import numpy as np class Solver(BaseSolver): diff --git a/solvers/legacy/dif.py b/solvers/legacy/dif.py index bc0ac0d..b3b1f5f 100644 --- a/solvers/legacy/dif.py +++ b/solvers/legacy/dif.py @@ -1,10 +1,8 @@ # Deep Isolation Forest from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from pyod.models.dif import DIF - import numpy as np +from pyod.models.dif import DIF +import numpy as np class Solver(BaseSolver): diff --git a/solvers/legacy/isolation-forest.py b/solvers/legacy/isolation-forest.py index d8ce688..9215294 100644 --- a/solvers/legacy/isolation-forest.py +++ b/solvers/legacy/isolation-forest.py @@ -1,11 +1,9 @@ # Isolation Forest solver from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from sklearn.ensemble import IsolationForest - import numpy as np +from sklearn.ensemble import IsolationForest +import numpy as np class Solver(BaseSolver): diff --git a/solvers/legacy/lof.py b/solvers/legacy/lof.py index be3c64f..00e6534 100644 --- a/solvers/legacy/lof.py +++ b/solvers/legacy/lof.py @@ -1,11 +1,9 @@ # Local Outlier Factor from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from sklearn.neighbors import LocalOutlierFactor - import numpy as np +from sklearn.neighbors import LocalOutlierFactor +import numpy as np class Solver(BaseSolver): diff --git a/solvers/legacy/ocsvm.py b/solvers/legacy/ocsvm.py index 7489076..96f2f6f 100644 --- a/solvers/legacy/ocsvm.py +++ b/solvers/legacy/ocsvm.py @@ -1,8 +1,7 @@ -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - from sklearn.svm import OneClassSVM - import numpy as np +from sklearn.svm import OneClassSVM +import numpy as np class Solver(BaseSolver): From 6ab2823ba46e957ee8e443a2577c16ad7a479d48 Mon Sep 17 00:00:00 2001 From: Jad Yehya Date: Mon, 18 May 2026 16:55:21 +0200 Subject: [PATCH 37/50] Clear separation between `anomaly_scores` and optional `anomaly_predictions` (binary, used via solver-side `cutoff`). --- benchmark_utils/predictions.py | 34 ++++ objective.py | 361 +++++++++++++++++++++++++-------- tests/test_objective.py | 110 ++++++++++ tests/test_predictions.py | 33 +++ 4 files changed, 459 insertions(+), 79 deletions(-) create mode 100644 benchmark_utils/predictions.py create mode 100644 tests/test_objective.py create mode 100644 tests/test_predictions.py diff --git a/benchmark_utils/predictions.py b/benchmark_utils/predictions.py new file mode 100644 index 0000000..7517dbc --- /dev/null +++ b/benchmark_utils/predictions.py @@ -0,0 +1,34 @@ +import numpy as np + + +def cutoff_scores(anomaly_scores, cutoff=None): + """Turn anomaly scores into binary predictions using a contamination rate. + + Larger scores are assumed to be more anomalous. NaN entries are preserved + as ``-1`` ignore labels so they are masked by the objective. + """ + if cutoff is None: + return None + + validate_cutoff(cutoff) + + scores = np.asarray(anomaly_scores) + predictions = np.full(scores.shape, -1, dtype=int) + valid = ~np.isnan(scores) + if not np.any(valid): + return predictions + + threshold = np.quantile(scores[valid], 1 - cutoff) + + predictions[valid] = (scores[valid] >= threshold).astype(int) + return predictions + + +def validate_cutoff(cutoff): + if cutoff is None: + raise ValueError("cutoff must be provided.") + if not 0 < cutoff < 1: + raise ValueError( + "cutoff must be in (0, 1), " + f"got {cutoff!r}." + ) diff --git a/objective.py b/objective.py index 48ee825..04dbde5 100644 --- a/objective.py +++ b/objective.py @@ -3,21 +3,22 @@ soft_precision as soft_precision_score, soft_recall as soft_recall_score, soft_f1 as soft_f1_score, - ctt, ttc, + ctt, + ttc, extract_anomaly_ranges, precision_t as precision_t_score, recall_t as recall_t_score, - f1_t as f1_t_score + f1_t as f1_t_score, ) import numpy as np from sklearn.metrics import ( + average_precision_score, precision_score, recall_score, f1_score, zero_one_loss, roc_auc_score, - precision_recall_curve, ) @@ -27,99 +28,301 @@ class Objective(BaseObjective): install_cmd = "conda" requirements = ["scikit-learn"] + parameters = { + "score_metrics": [("auc_pr", "auc_roc")], + "prediction_metrics": [None], + } + + detection_ranges = (1, 3, 5, 10, 20) + default_prediction_metrics = ( + "precision", + "recall", + "f1", + "precision_t", + "recall_t", + "f1_t", + "ctt", + "ttc", + "zoloss", + "soft_precision", + "soft_recall", + "soft_f1", + ) + def get_one_result(self): - """Return one solution for which the objective can be computed, - Used to get the shape of the result. - Our algorithms will return an array of labels of shape (n_samples,) - """ - return dict(y_hat=np.zeros_like(self.y_test)) + """Return one solution for which the objective can be computed.""" + score_metrics = self._normalize_metrics( + getattr(self, "score_metrics", ("auc_pr", "auc_roc")) + ) + prediction_metrics = self._expand_prediction_metrics( + getattr(self, "prediction_metrics", None) + ) + + result = {} + if score_metrics: + result["anomaly_scores"] = np.zeros_like( + self.y_test, dtype=float + ) + if prediction_metrics: + result["anomaly_predictions"] = np.zeros_like( + self.y_test, dtype=int + ) + return result def set_data(self, X_train, y_test, X_test): "Set the data to compute the objective." self.X_train = X_train self.X_test, self.y_test = X_test, y_test - def evaluate_result(self, y_hat, raw_anomaly_score=None): - """Evaluate the result provided by the solver.""" - print("y_hat shape", y_hat.shape) - print("self.y_test shape", self.y_test.shape) + def evaluate_result( + self, + anomaly_scores=None, + anomaly_predictions=None, + ): + """Evaluate the result provided by the solver. + + anomaly_scores is the score-based solver output. + anomaly_predictions is optional and only needed when requesting + prediction-based metrics. + """ + score_metrics = self._normalize_metrics( + getattr(self, "score_metrics", ("auc_pr", "auc_roc")) + ) + prediction_metrics = self._expand_prediction_metrics( + getattr(self, "prediction_metrics", None) + ) - to_discard = (y_hat == -1).sum() - self.y_test = self.y_test.reshape(-1)[to_discard:] - y_hat = y_hat.reshape(-1)[to_discard:] + if score_metrics and anomaly_scores is None: + raise ValueError("score_metrics require an anomaly_scores array.") + if prediction_metrics and anomaly_predictions is None: + raise ValueError( + "prediction_metrics require an anomaly_predictions array.") - print("y_hat shape after discard", y_hat.shape) - print("self.y_test shape after discard", self.y_test.shape) + y_true, scores, predictions = self._align_inputs( + anomaly_scores=anomaly_scores, + anomaly_predictions=anomaly_predictions, + ) result = {} - detection_ranges = [1, 3, 5, 10, 20] + if score_metrics: + result.update( + self._compute_score_metrics( + y_true=y_true, + anomaly_scores=scores, + metrics=score_metrics, + ) + ) + if prediction_metrics: + result.update( + self._compute_prediction_metrics( + y_true=y_true, + anomaly_predictions=predictions, + metrics=prediction_metrics, + ) + ) - # Standard metrics - precision = precision_score(self.y_test, y_hat, zero_division=0) - recall = recall_score(self.y_test, y_hat, zero_division=0) - f1 = f1_score(self.y_test, y_hat, zero_division=0) + # Setting value to 0. The actual value is not used for ranking. + result["value"] = 0.0 + return result - anomaly_ranges = extract_anomaly_ranges(self.y_test) - prediction_ranges = extract_anomaly_ranges(y_hat) + def get_objective(self): + return dict(X_train=self.X_train, X_test=self.X_test) - precision_t = precision_t_score(anomaly_ranges, prediction_ranges) - recall_t = recall_t_score(anomaly_ranges, prediction_ranges) - f1_t = f1_t_score(anomaly_ranges, prediction_ranges) + def _normalize_metrics(self, metrics): + if metrics is None: + return () + if isinstance(metrics, str): + if metrics == "all": + return ("auc_pr", "auc_roc") + return (metrics,) + return tuple(metric for metric in metrics if metric is not None) - result.update({ - "precision": precision, - "recall": recall, - "f1": f1 - }) + def _expand_prediction_metrics(self, metrics): + metrics = self._normalize_prediction_metrics(metrics) + expanded = [] - for range_value in detection_ranges: - soft_precision = soft_precision_score( - self.y_test, y_hat, detection_range=range_value - ) - soft_recall = soft_recall_score( - self.y_test, y_hat, detection_range=range_value - ) - soft_f1 = soft_f1_score(soft_precision, soft_recall) - - result.update({ - f"soft_precision_{range_value}": soft_precision, - f"soft_recall_{range_value}": soft_recall, - f"soft_f1_{range_value}": soft_f1 - }) - - zoloss = zero_one_loss(self.y_test, y_hat) - - # Other metrics - cct_score = ctt(self.y_test, y_hat) - ttc_score = ttc(self.y_test, y_hat) - - # Add remaining metrics to the result dictionary - result.update({ - "precision_t": precision_t, - "recall_t": recall_t, - "f1_t": f1_t, - "cct": cct_score, - "ttc": ttc_score, - "zoloss": zoloss, - "value": zoloss # having zoloss twice for the API - }) - - # AUC-ROC and AUC-PR - if raw_anomaly_score is not None: - auc_roc = roc_auc_score(self.y_test, raw_anomaly_score) - precision_curve, recall_curve, _ = precision_recall_curve( - self.y_test, raw_anomaly_score) - auc_pr = -np.trapz(precision_curve, recall_curve) - - result["auc_roc"] = auc_roc - result["auc_pr"] = auc_pr - - for key, value in result.items(): - print(f"{key}: {value}") + for metric in metrics: + if metric == "all": + metric = self.default_prediction_metrics + else: + metric = (metric,) + + for name in metric: + if name in { + "soft_precision", + "soft_recall", + "soft_f1", + }: + expanded.extend( + f"{name}_{detection_range}" + for detection_range in self.detection_ranges + ) + else: + expanded.append(name) + return tuple(expanded) + + def _normalize_prediction_metrics(self, metrics): + if metrics is None: + return () + if isinstance(metrics, str): + return (metrics,) + return tuple(metric for metric in metrics if metric is not None) + + def _align_inputs(self, anomaly_scores, anomaly_predictions): + # flatten everything before aligning lengths. + y_true = np.asarray(self.y_test).reshape(-1) + scores = self._as_flat_array(anomaly_scores) + predictions = self._as_flat_array(anomaly_predictions) + + # Only align against arrays that were returned. This keeps + # score-only and prediction-only evaluations valid. + arrays = [array for array in ( + scores, predictions) if array is not None] + if not arrays: + return y_true, None, None + + # Windowed solvers return fewer outputs than y_test because the + # first timestamps have no full context window. Keep the last samples, + # which correspond to the part of y_test the solver scored. + length = min([len(y_true)] + [len(array) for array in arrays]) + y_true = y_true[-length:] + if scores is not None: + scores = scores[-length:] + if predictions is not None: + predictions = predictions[-length:] + + # Drop invalid positions. NaN score padding and -1 prediction padding + # When both scores and predictions are present, the same mask is + # applied to keep mixed metric requests on the same timestamps. + valid = np.ones(length, dtype=bool) + if scores is not None: + valid &= ~np.isnan(scores) + if predictions is not None: + valid &= ~np.isnan(predictions) + valid &= predictions != -1 + + y_true = y_true[valid] + if scores is not None: + scores = scores[valid] + if predictions is not None: + predictions = predictions[valid] + + return y_true, scores, predictions + + def _as_flat_array(self, array): + if array is None: + return None + return np.asarray(array).reshape(-1) + + def _compute_score_metrics(self, y_true, anomaly_scores, metrics): + if len(y_true) == 0: + return {metric: np.nan for metric in metrics} + + result = {} + for metric in metrics: + if metric == "auc_roc": + result[metric] = self._safe_auc_roc(y_true, anomaly_scores) + elif metric == "auc_pr": + result[metric] = self._auc_pr(y_true, anomaly_scores) + else: + raise ValueError(f"Unknown score metric: {metric}") return result - def get_objective(self): - return dict( - X_train=self.X_train, X_test=self.X_test + def _compute_prediction_metrics( + self, + y_true, + anomaly_predictions, + metrics, + ): + if len(y_true) == 0: + return {metric: np.nan for metric in metrics} + + result = {} + anomaly_ranges = None + prediction_ranges = None + + for metric in metrics: + if metric == "precision": + result[metric] = precision_score( + y_true, anomaly_predictions, zero_division=0 + ) + elif metric == "recall": + result[metric] = recall_score( + y_true, anomaly_predictions, zero_division=0 + ) + elif metric == "f1": + result[metric] = f1_score( + y_true, anomaly_predictions, zero_division=0) + elif metric == "zoloss": + result[metric] = zero_one_loss(y_true, anomaly_predictions) + elif metric in {"precision_t", "recall_t", "f1_t"}: + if anomaly_ranges is None: + anomaly_ranges, prediction_ranges = self._get_ranges( + y_true, anomaly_predictions + ) + if metric == "precision_t": + result[metric] = precision_t_score( + anomaly_ranges, prediction_ranges + ) + elif metric == "recall_t": + result[metric] = recall_t_score( + anomaly_ranges, prediction_ranges) + else: + result[metric] = f1_t_score( + anomaly_ranges, prediction_ranges) + elif metric == "ctt": + result[metric] = ctt(y_true, anomaly_predictions) + elif metric == "ttc": + result[metric] = ttc(y_true, anomaly_predictions) + elif metric.startswith("soft_precision_"): + detection_range = self._parse_detection_range( + metric, "soft_precision") + result[metric] = soft_precision_score( + y_true, + anomaly_predictions, + detection_range=detection_range, + ) + elif metric.startswith("soft_recall_"): + detection_range = self._parse_detection_range( + metric, "soft_recall") + result[metric] = soft_recall_score( + y_true, + anomaly_predictions, + detection_range=detection_range, + ) + elif metric.startswith("soft_f1_"): + detection_range = self._parse_detection_range( + metric, "soft_f1") + result[metric] = soft_f1_score( + y_true, + anomaly_predictions, + detection_range=detection_range, + ) + else: + raise ValueError(f"Unknown prediction metric: {metric}") + + return result + + def _get_ranges(self, y_true, anomaly_predictions): + return ( + extract_anomaly_ranges(y_true), + extract_anomaly_ranges(anomaly_predictions), ) + + def _parse_detection_range(self, metric, prefix): + suffix = metric.replace(f"{prefix}_", "", 1) + try: + return int(suffix) + except ValueError as exc: + raise ValueError( + f"Invalid detection range in prediction metric: {metric}" + ) from exc + + def _safe_auc_roc(self, y_true, anomaly_scores): + return roc_auc_score(y_true, anomaly_scores) + + def _auc_pr(self, y_true, anomaly_scores): + if len(np.unique(y_true)) == 1: + return np.nan + return average_precision_score(y_true, anomaly_scores) diff --git a/tests/test_objective.py b/tests/test_objective.py new file mode 100644 index 0000000..6a50afc --- /dev/null +++ b/tests/test_objective.py @@ -0,0 +1,110 @@ +import numpy as np +import pytest + +from objective import Objective + + +def make_objective(score_metrics=("auc_pr", "auc_roc"), + prediction_metrics=None): + objective = Objective() + objective.score_metrics = score_metrics + objective.prediction_metrics = prediction_metrics + objective.set_data( + X_train=np.empty((1, 1, 6)), + y_test=np.array([0, 0, 1, 0, 1, 0]), + X_test=np.empty((1, 1, 6)), + ) + return objective + + +def test_default_evaluation_uses_score_metrics_only(): + objective = make_objective() + scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2]) + + result = objective.evaluate_result(anomaly_scores=scores) + + assert result["auc_pr"] == pytest.approx(1.0) + assert result["auc_roc"] == pytest.approx(1.0) + assert result["value"] == pytest.approx(0.0) + assert "precision" not in result + + +def test_score_and_prediction_metrics_use_canonical_keys(): + objective = make_objective( + score_metrics=("auc_pr",), + prediction_metrics=("precision",), + ) + scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2]) + predictions = np.array([0, 0, 1, 0, 1, 0]) + + result = objective.evaluate_result( + anomaly_scores=scores, + anomaly_predictions=predictions, + ) + + assert result["auc_pr"] == pytest.approx(1.0) + assert result["precision"] == pytest.approx(1.0) + + +def test_prediction_metrics_are_opt_in(): + objective = make_objective( + prediction_metrics=("precision", "recall", "f1", "zoloss"), + ) + scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2]) + predictions = np.array([0, 0, 1, 0, 1, 0]) + + result = objective.evaluate_result( + anomaly_scores=scores, + anomaly_predictions=predictions, + ) + + assert result["precision"] == pytest.approx(1.0) + assert result["recall"] == pytest.approx(1.0) + assert result["f1"] == pytest.approx(1.0) + assert result["zoloss"] == pytest.approx(0.0) + + +def test_prediction_metrics_require_prediction_array(): + objective = make_objective(prediction_metrics=("precision",)) + scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2]) + + with pytest.raises(ValueError, match="anomaly_predictions"): + objective.evaluate_result(anomaly_scores=scores) + + +def test_nan_score_padding_is_masked(): + objective = make_objective() + scores = np.array([np.nan, 0.2, 0.9, 0.1, 0.8, 0.2]) + + result = objective.evaluate_result(anomaly_scores=scores) + + assert result["auc_pr"] == pytest.approx(1.0) + assert result["auc_roc"] == pytest.approx(1.0) + + +def test_prediction_padding_is_masked(): + objective = make_objective( + score_metrics=None, + prediction_metrics=("precision", "recall", "f1"), + ) + predictions = np.array([-1, 0, 1, 0, 1, 0]) + + result = objective.evaluate_result(anomaly_predictions=predictions) + + assert result["precision"] == pytest.approx(1.0) + assert result["recall"] == pytest.approx(1.0) + assert result["f1"] == pytest.approx(1.0) + assert result["value"] == pytest.approx(0.0) + + +def test_prediction_only_metrics_without_primary_value_fallback_to_zero(): + objective = make_objective( + score_metrics=None, + prediction_metrics=("precision",), + ) + predictions = np.array([0, 0, 1, 0, 1, 0]) + + result = objective.evaluate_result(anomaly_predictions=predictions) + + assert result["precision"] == pytest.approx(1.0) + assert result["value"] == pytest.approx(0.0) diff --git a/tests/test_predictions.py b/tests/test_predictions.py new file mode 100644 index 0000000..1e8f9b7 --- /dev/null +++ b/tests/test_predictions.py @@ -0,0 +1,33 @@ +import numpy as np +import pytest + +from benchmark_utils.predictions import cutoff_scores + + +def test_cutoff_scores_returns_none_without_cutoff(): + scores = np.array([0.1, 0.8, 0.2]) + + assert cutoff_scores(scores) is None + + +def test_cutoff_scores_uses_top_score_fraction(): + scores = np.array([0.1, 0.8, 0.2, 0.9]) + + predictions = cutoff_scores(scores, cutoff=0.25) + + np.testing.assert_array_equal(predictions, np.array([0, 0, 0, 1])) + + +def test_cutoff_scores_preserves_nan_padding_as_ignore_label(): + scores = np.array([np.nan, 0.1, 0.8, 0.2, 0.9]) + + predictions = cutoff_scores(scores, cutoff=0.25) + + np.testing.assert_array_equal(predictions, np.array([-1, 0, 0, 0, 1])) + + +def test_cutoff_scores_rejects_invalid_cutoff(): + scores = np.array([0.1, 0.8, 0.2]) + + with pytest.raises(ValueError, match="must be in"): + cutoff_scores(scores, cutoff=1) From 7e288e53814cb9c218d023fc4258aeda71792073 Mon Sep 17 00:00:00 2001 From: Jad Yehya Date: Mon, 18 May 2026 17:00:47 +0200 Subject: [PATCH 38/50] Adapt solvers to score/prediction contract with `cutoff` parameter. --- solvers/AR.py | 53 +++++++++++++----------------- solvers/anomalybert.py | 24 +++++++++----- solvers/autoencoder.py | 37 ++++++++++++++------- solvers/dagmm.py | 30 +++++++---------- solvers/legacy/abod.py | 44 ++++++++++--------------- solvers/legacy/cblof.py | 43 ++++++++++-------------- solvers/legacy/dif.py | 43 ++++++++++-------------- solvers/legacy/isolation-forest.py | 46 +++++++++++++------------- solvers/legacy/lof.py | 43 ++++++++++-------------- solvers/legacy/ocsvm.py | 42 ++++++++++------------- solvers/lstm.py | 34 +++++++++---------- solvers/matrixprofile.py | 42 ++++++++++++----------- solvers/rosecdl.py | 24 ++++++++------ solvers/tsb_chronos.py | 19 +++++++---- solvers/tsb_timesfm.py | 27 +++++++++++---- solvers/tsb_timesnet.py | 40 ++++++++++++++++++---- solvers/vae.py | 22 ++++++++----- solvers/vanilla-transformer.py | 46 ++++++++++++++------------ test_config.py | 35 +++++++++++++++++--- 19 files changed, 379 insertions(+), 315 deletions(-) diff --git a/solvers/AR.py b/solvers/AR.py index ba2446e..50eb908 100644 --- a/solvers/AR.py +++ b/solvers/AR.py @@ -8,13 +8,14 @@ from benchmark_utils.models import ARModel from benchmark_utils import mean_overlaping_pred +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): name = "AR" # AutoRegressive Linear model install_cmd = "conda" - requirements = ["pip::torch", "tqdm"] + requirements = ["pytorch", "tqdm"] sampling_strategy = "run_once" @@ -25,14 +26,12 @@ class Solver(BaseSolver): "weight_decay": [1e-7], "window_size": [100], "horizon": [1], - "percentile": [99.4], + "cutoff": [None], } test_config = { - 'solver': { - "n_epochs": 1, - "window_size": 16, - } + "n_epochs": 1, + "window_size": 16, } def set_objective(self, X_train, X_test): @@ -62,10 +61,6 @@ def set_objective(self, X_train, X_test): ) self.criterion = nn.MSELoss() - print("IN AR") - print("X_train shape", self.X_train.shape) - print("X_test shape", self.X_test.shape) - if self.X_train is not None: # (n_windows, window_size+horizon, n_features) self.Xw_train = np.lib.stride_tricks.sliding_window_view( @@ -136,31 +131,26 @@ def run(self, _): xw_hat = xw_hat.detach().cpu().numpy() - # Reconstructing the prediction from the predicted windows - # Creating the prediction array with -1 for the unknown values - # Corresponding to the first window_size values - x_hat = np.zeros_like(self.X_test)-1 # (n_test_samples, n_features) - x_hat[self.window_size:self.window_size+self.horizon] = xw_hat[0] + # Reconstructing the prediction from the predicted windows. + # The first ``window_size`` positions have no forecast (no full input + # window precedes them); fill them with -1 as a sentinel. + x_hat = np.zeros_like(self.X_test) - 1 + x_hat[self.window_size:] = mean_overlaping_pred(xw_hat, 1) - x_hat[self.window_size+self.horizon:] = mean_overlaping_pred( - xw_hat, 1 + reconstruction_err = np.abs( + self.X_test[self.window_size:] - x_hat[self.window_size:] ) - - # Calculating the percentile value for the threshold - percentile_value = np.percentile( - np.abs(self.X_test[self.window_size:] - x_hat[self.window_size:]), - self.percentile + self.anomaly_scores = np.full( + self.X_test.shape, np.nan, dtype=float ) + self.anomaly_scores[self.window_size:] = reconstruction_err + self.anomaly_scores = np.max(self.anomaly_scores, axis=1) - # Thresholding - predictions = np.zeros_like(x_hat)-1 - predictions[self.window_size:] = np.where( - np.abs(self.X_test[self.window_size:] - - x_hat[self.window_size:]) > percentile_value, 1, 0 + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) - self.predictions = np.max(predictions, axis=1) - # Skipping the solver call if a condition is met def skip(self, X_train, X_test): if X_train.shape[0]*X_train.shape[2] < self.window_size + self.horizon: @@ -170,4 +160,7 @@ def skip(self, X_train, X_test): return False, None def get_result(self): - return dict(y_hat=self.predictions) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/anomalybert.py b/solvers/anomalybert.py index abcda9a..8c9d1b4 100644 --- a/solvers/anomalybert.py +++ b/solvers/anomalybert.py @@ -1,3 +1,4 @@ +from models.anomaly_transformer import get_anomaly_transformer from benchopt import BaseSolver import sys @@ -8,17 +9,17 @@ from torch.optim.lr_scheduler import CosineAnnealingLR from tqdm import tqdm +from benchmark_utils.predictions import cutoff_scores + # Add AnomalyBERT to path sys.path.append(str(Path(__file__).parent.parent / 'AnomalyBERT')) -from models.anomaly_transformer import get_anomaly_transformer - class Solver(BaseSolver): name = "AnomalyBERT" sampling_strategy = "run_once" - requirements = ["pip::timm", "pip::torch", "pip::numpy", "pip::tqdm"] + requirements = ["pip::timm", "pytorch", "numpy", "tqdm"] parameters = { "patch_size": [1], @@ -31,6 +32,7 @@ class Solver(BaseSolver): "seed": [548920], "device": ["cuda:1"], "window_sliding": [16], + "cutoff": [None], } sampling_strategy = "run_once" @@ -237,7 +239,6 @@ def replacing_weights(interval_len): optimizer.step() scheduler.step() - def get_result(self): device = torch.device( self.device if torch.cuda.is_available() else 'cpu') self.model.eval() @@ -281,9 +282,14 @@ def get_result(self): n_overlap[idx:idx+window_size] += 1 n_overlap[n_overlap == 0] = 1 - scores = (output_values / n_overlap).cpu().numpy() - - threshold = np.percentile(scores, (1 - 0.1) * 100) - y_hat = (scores > threshold).astype(int) + self.anomaly_scores = (output_values / n_overlap).cpu().numpy() + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) - return dict(y_hat=y_hat, raw_anomaly_score=scores) + def get_result(self): + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/autoencoder.py b/solvers/autoencoder.py index f5a138e..a36fad0 100644 --- a/solvers/autoencoder.py +++ b/solvers/autoencoder.py @@ -1,17 +1,17 @@ from benchopt import BaseSolver -import numpy as np -from TSB_AD.utils.slidingWindows import find_length from sklearn.preprocessing import MinMaxScaler from benchmark_utils.models import Autoencoder +from benchmark_utils.predictions import cutoff_scores +from benchmark_utils.windowing import find_period_length class Solver(BaseSolver): name = "AE" install_cmd = "conda" - requirements = ["pip::tsb-uad", "scikit-learn"] + requirements = ["pytorch", "scikit-learn", "tqdm"] parameters = { "window_size": [10, "auto"], @@ -20,13 +20,20 @@ class Solver(BaseSolver): "learning_rate": [1e-3], "hidden_size": [64], "latent_size": [32], + "cutoff": [None], + } + + test_config = { + "window_size": 10, + "num_epochs": 1, + "batch_size": 8, } sampling_strategy = "run_once" def set_objective(self, X_train, X_test): if self.window_size == "auto": - self.window_size = find_length(X_train) + self.window_size = find_period_length(X_train.reshape(-1)) # Data received has shape (n_recordings, n_features, n_samples) n_features = X_train.shape[1] @@ -50,23 +57,29 @@ def run(self, _): ) self.clf.predict(self.X_test) - score = self.clf.decision_scores_ + anomaly_scores = self.clf.decision_scores_ - self.score = ( + self.anomaly_scores = ( MinMaxScaler(feature_range=(0, 1)) - .fit_transform(score.reshape(-1, 1)) + .fit_transform(anomaly_scores.reshape(-1, 1)) .ravel() ) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) def skip(self, X_train, X_test): """Check if the solver can be skipped.""" - if find_length(X_train) == 0 and self.window_size == "auto": + if find_period_length(X_train.reshape(-1)) == 0 and ( + self.window_size == "auto" + ): return True, "Window size is 0" return False, None def get_result(self): """Return the result of the solver.""" - # Binarizing the scores to 0 and 1 - # TEMPORARY SOLUTION - self.final_score = np.where(self.score > 0.75, 1, 0) - return dict(y_hat=self.final_score) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/dagmm.py b/solvers/dagmm.py index 99182f4..148fbc3 100644 --- a/solvers/dagmm.py +++ b/solvers/dagmm.py @@ -1,10 +1,10 @@ from benchopt import BaseSolver -import numpy as np import pandas as pd from merlion.models.anomaly.dagmm import DAGMM, DAGMMConfig from merlion.utils.time_series import TimeSeries -from sklearn.preprocessing import MinMaxScaler + +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): @@ -22,6 +22,7 @@ class Solver(BaseSolver): "batch_size": [8192], "lambda_energy": [0.1], "lambda_cov": [0.005], + "cutoff": [None], # "device": ["cuda:3"] } @@ -36,14 +37,10 @@ def set_objective(self, X_train, X_test): train_df = pd.DataFrame(self.X_train) test_df = pd.DataFrame(self.X_test) - print("Dataframe OK") - # Merlion expects a time index or it will generate one self.train_data = TimeSeries.from_pd(train_df) self.test_data = TimeSeries.from_pd(test_df) - print("TimeSeries OK") - # Configure DAGMM config = DAGMMConfig( gmm_k=self.gmm_k, @@ -66,17 +63,14 @@ def run(self, _): # Predict # get_anomaly_score returns a TimeSeries of scores scores_ts = self.model.get_anomaly_score(self.test_data) - self.scores = scores_ts.to_pd().values.flatten() + self.anomaly_scores = scores_ts.to_pd().values.flatten() + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) def get_result(self): - # Normalize scores to 0-1 range for thresholding - scaler = MinMaxScaler(feature_range=(0, 1)) - scores_norm = scaler.fit_transform(self.scores.reshape(-1, 1)).ravel() - - # Simple thresholding - y_hat = np.where(scores_norm > 0.75, 1, 0) - - return dict( - y_hat=y_hat, - raw_anomaly_score=self.scores - ) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/abod.py b/solvers/legacy/abod.py index 52e6e53..13bca75 100644 --- a/solvers/legacy/abod.py +++ b/solvers/legacy/abod.py @@ -5,6 +5,8 @@ from pyod.models.abod import ABOD import numpy as np +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "ABOD" # Angle-Based Outlier Detection @@ -18,6 +20,7 @@ class Solver(BaseSolver): "window": [True], "window_size": [20], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" @@ -51,31 +54,18 @@ def run(self, _): flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) self.clf.fit(flatrain) - - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = self.clf.decision_function(flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) # Function used to skip a solver call when n_neighbors >= window_size @@ -88,5 +78,7 @@ def get_result(self): # Anomaly : 1 # Inlier : 0 # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/cblof.py b/solvers/legacy/cblof.py index 452be07..1e65a7b 100644 --- a/solvers/legacy/cblof.py +++ b/solvers/legacy/cblof.py @@ -5,6 +5,8 @@ from pyod.models.cblof import CBLOF import numpy as np +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "CBLOF" @@ -18,6 +20,7 @@ class Solver(BaseSolver): "n_clusters": [10], "window_size": [20], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" @@ -50,30 +53,18 @@ def run(self, _): flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = self.clf.decision_function(flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) # Skipping the solver call if a condition is met @@ -86,5 +77,7 @@ def get_result(self): # Anomaly : 1 # Inlier : 0 # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/dif.py b/solvers/legacy/dif.py index b3b1f5f..36de441 100644 --- a/solvers/legacy/dif.py +++ b/solvers/legacy/dif.py @@ -4,6 +4,8 @@ from pyod.models.dif import DIF import numpy as np +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "DIF" @@ -16,6 +18,7 @@ class Solver(BaseSolver): "window": [True], "window_size": [20], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" @@ -47,30 +50,18 @@ def run(self, _): flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = self.clf.decision_function(flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) def skip(self, X_train, X_test): @@ -82,5 +73,7 @@ def get_result(self): # Anomaly : 1 # Inlier : 0 # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/isolation-forest.py b/solvers/legacy/isolation-forest.py index 9215294..83e8839 100644 --- a/solvers/legacy/isolation-forest.py +++ b/solvers/legacy/isolation-forest.py @@ -5,6 +5,8 @@ from sklearn.ensemble import IsolationForest import numpy as np +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "IsolationForest" @@ -17,6 +19,7 @@ class Solver(BaseSolver): "window": [True], "window_size": [60, 120, 180], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" @@ -50,24 +53,14 @@ def run(self, _): self.Xw_test.shape[0] * self.Xw_test.shape[1], -1) self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) + anomaly_scores = -self.clf.decision_function(flatest) # The results we get has a shape of n_recordings, n_features, n_windows, _ = self.Xw_test.shape - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Reshape back to original structure - self.raw_y_hat = self.raw_y_hat.reshape( - n_recordings, n_features, n_windows) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = self.raw_anomaly_score.reshape( + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + self.anomaly_scores = self.anomaly_scores.reshape( n_recordings, n_features, n_windows) else: # No windowing case @@ -77,15 +70,18 @@ def run(self, _): X_test_flat = self.X_test.reshape(-1, n_features) self.clf.fit(X_train_flat) - self.raw_y_hat = self.clf.predict(X_test_flat) - self.raw_anomaly_score = self.clf.decision_function(X_test_flat) + self.anomaly_scores = -self.clf.decision_function(X_test_flat) # Reshape to (n_recordings, n_samples) for single feature case # We assume we take the first feature or average across features - self.raw_y_hat = self.raw_y_hat.reshape(n_recordings, n_samples) - self.raw_anomaly_score = self.raw_anomaly_score.reshape( + self.anomaly_scores = self.anomaly_scores.reshape( n_recordings, n_samples) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + def skip(self, X_train, X_test): # Skip if dataset size is smaller than window size _, _, n_samples = X_train.shape @@ -98,7 +94,13 @@ def get_result(self): # Inlier : 0 # To ignore : -1 # For now, take the first recording - self.y_hat = self.raw_y_hat[0] if ( - self.raw_y_hat.ndim > 1 - ) else self.raw_y_hat - return dict(y_hat=self.y_hat) + anomaly_scores = self.anomaly_scores[0] if ( + self.anomaly_scores.ndim > 1 + ) else self.anomaly_scores + result = dict(anomaly_scores=anomaly_scores) + if self.anomaly_predictions is not None: + anomaly_predictions = self.anomaly_predictions[0] if ( + self.anomaly_predictions.ndim > 1 + ) else self.anomaly_predictions + result["anomaly_predictions"] = anomaly_predictions + return result diff --git a/solvers/legacy/lof.py b/solvers/legacy/lof.py index 00e6534..9075caa 100644 --- a/solvers/legacy/lof.py +++ b/solvers/legacy/lof.py @@ -5,6 +5,8 @@ from sklearn.neighbors import LocalOutlierFactor import numpy as np +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "LocalOutlierFactor" @@ -18,6 +20,7 @@ class Solver(BaseSolver): "window": [True], "window_size": [20], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" @@ -48,30 +51,18 @@ def run(self, _): flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = -self.clf.decision_function(flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) def skip(self, X_train, X_test): @@ -87,5 +78,7 @@ def get_result(self): # Anomaly : 1 # Inlier : 0 # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/ocsvm.py b/solvers/legacy/ocsvm.py index 96f2f6f..1813763 100644 --- a/solvers/legacy/ocsvm.py +++ b/solvers/legacy/ocsvm.py @@ -3,6 +3,8 @@ from sklearn.svm import OneClassSVM import numpy as np +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "OCSVM" @@ -17,6 +19,7 @@ class Solver(BaseSolver): "window": [True], "window_size": [128], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" @@ -47,30 +50,18 @@ def set_objective(self, X_train, X_test): def run(self, _): if self.window: self.clf.fit(self.flatrain) - raw_y_hat = self.clf.predict(self.flatest) - raw_anomaly_score = self.clf.decision_function(self.flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = -self.clf.decision_function(self.flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) def skip(self, X_train, X_test): @@ -79,4 +70,7 @@ def skip(self, X_train, X_test): return False, None def get_result(self): - return dict(y_hat=self.raw_y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/lstm.py b/solvers/lstm.py index 1c126ad..ff4a975 100644 --- a/solvers/lstm.py +++ b/solvers/lstm.py @@ -10,13 +10,14 @@ from benchmark_utils.models import AutoEncoderLSTM from benchmark_utils.windowing import make_windowed_dataset from benchmark_utils.windowing import reconstruct_from_windows +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): name = "LSTM" install_cmd = "conda" - requirements = ["pip::torch", "tqdm"] + requirements = ["pytorch", "tqdm"] sampling_strategy = "run_once" @@ -27,18 +28,16 @@ class Solver(BaseSolver): "lr": [1e-5], "window_size": [256], # window_size = seq_len "stride": [1], - "percentile": [97], + "cutoff": [None], "encoder_layers": [32], "decoder_layers": [32], } test_config = { - 'solver': { - "embedding_dim": 2, - "batch_size": 1, - "n_epochs": 1, - "window_size": 16, - } + "embedding_dim": 2, + "batch_size": 1, + "n_epochs": 1, + "window_size": 16, } def set_objective(self, X_train, X_test): @@ -105,9 +104,6 @@ def run(self, _): ti.set_postfix(train_loss=f"{train_loss:.5f}") - # Saving the model - torch.save(self.model.state_dict(), "model.pth") - # Test loop self.model.eval() raw_reconstruction = [] @@ -119,17 +115,18 @@ def run(self, _): raw_reconstruction.append(x_hat.detach().cpu().numpy()) reconstructed_data = np.concatenate(raw_reconstruction, axis=0) reconstructed_data = reconstruct_from_windows( - reconstructed_data, stride=self.stride, - batch=len(self.X_test), n_features=self.n_features + reconstructed_data, stride=self.stride, + batch=len(self.X_test), n_features=self.n_features ) reconstruction_err = np.mean( np.abs(self.X_test - reconstructed_data), axis=1 ) + self.anomaly_scores = reconstruction_err - self.y_hat = np.where( - reconstruction_err > np.percentile( - reconstruction_err, self.percentile), 1, 0 + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) def skip(self, X_train, X_test): @@ -138,4 +135,7 @@ def skip(self, X_train, X_test): return False, None def get_result(self): - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/matrixprofile.py b/solvers/matrixprofile.py index 48ed961..7b91d8d 100644 --- a/solvers/matrixprofile.py +++ b/solvers/matrixprofile.py @@ -1,9 +1,9 @@ from benchopt import BaseSolver from sklearn.preprocessing import MinMaxScaler -import numpy as np +from benchmark_utils.predictions import cutoff_scores +from benchmark_utils.windowing import find_period_length from TSB_AD.models.MatrixProfile import MatrixProfile -from TSB_AD.utils.slidingWindows import find_length class Solver(BaseSolver): @@ -14,6 +14,14 @@ class Solver(BaseSolver): parameters = { "window_size": [128, "auto"], + "cutoff": [None], + } + + test_config = { + "dataset": { + "n_features": 1, + }, + "window_size": 8, } sampling_strategy = "run_once" @@ -29,33 +37,29 @@ def set_objective(self, X_train, X_test): self.X_test = self.X_test.reshape(-1, n_features) if self.window_size == "auto": - self.window_size = int(find_length(X_train.reshape(-1))) - - print("=====================") - print(f"window_size: {self.window_size}") - print("=====================") + self.window_size = int(find_period_length(X_train.reshape(-1))) self.clf = MatrixProfile( window=self.window_size, ) def run(self, _): - print("Running Matrix Profile solver...") # Special solver, fitting on X_test self.clf.fit(self.X_test.reshape(-1)) - print("MP Fitted") - self.scores = self.clf.decision_scores_ - self.score = ( + anomaly_scores = self.clf.decision_scores_ + self.anomaly_scores = ( MinMaxScaler(feature_range=(0, 1)) - .fit_transform(self.scores.reshape(-1, 1)) + .fit_transform(anomaly_scores.reshape(-1, 1)) .ravel() ) - print("MP Scored") - print(f"Score shape: {self.score.shape}") + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) def skip(self, X_train, X_test): """Check if the solver can be skipped.""" - if (find_length(X_train.reshape(-1)) == 0) and ( + if (find_period_length(X_train.reshape(-1)) == 0) and ( self.window_size == "auto"): return True, "Window size is 0" if X_train.shape[1] != 1: @@ -64,7 +68,7 @@ def skip(self, X_train, X_test): def get_result(self): """Return the result of the solver.""" - # Binarizing the scores to 0 and 1 - # TEMPORARY SOLUTION - self.final_score = np.where(self.score > 0.90, 1, 0) - return dict(y_hat=self.final_score, raw_anomaly_score=self.score) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/rosecdl.py b/solvers/rosecdl.py index beabd79..6ccf54c 100644 --- a/solvers/rosecdl.py +++ b/solvers/rosecdl.py @@ -1,8 +1,9 @@ from benchopt import BaseSolver import torch +from benchmark_utils.predictions import cutoff_scores +from benchmark_utils.windowing import find_period_length from rosecdl.rosecdl import RoseCDL -from TSB_AD.utils.slidingWindows import find_length class Solver(BaseSolver): @@ -10,7 +11,7 @@ class Solver(BaseSolver): install_cmd = "conda" requirements = [ - "pip::git+https://github.com/tommoral/rosecdl.git", "pip::torch" + "pytorch", "pip::git+https://github.com/tommoral/rosecdl.git" ] parameters = { @@ -35,6 +36,7 @@ class Solver(BaseSolver): }, ], "plot": [False], + "cutoff": [None], } sampling_strategy = "run_once" @@ -49,11 +51,7 @@ def set_objective(self, X_train, X_test): self.X_test = X_test if self.kernel_size == "auto": - self.kernel_size = int(find_length(X_train.reshape(-1))) - - print("=====================") - print(f"kernel_size: {self.kernel_size}") - print("=====================") + self.kernel_size = int(find_period_length(X_train.reshape(-1))) self.clf = RoseCDL( n_components=self.n_components, @@ -75,7 +73,6 @@ def set_objective(self, X_train, X_test): def run(self, _): self.clf.fit(self.X_train) del self.X_train # Free GPU memory for X_train after fitting - self.y_pred = self.clf.get_outlier_mask(self.X_test) xh, zh = self.clf.csc( torch.tensor(self.X_test, dtype=torch.float32, device=self.device) @@ -88,9 +85,16 @@ def run(self, _): ) err = err.cpu().detach().numpy() # Aggregate errors over channels - self.err = err.sum(axis=1).reshape(-1) + self.anomaly_scores = err.sum(axis=1).reshape(-1) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) del self.clf # Free GPU memory for the model torch.cuda.empty_cache() # Release cached GPU memory def get_result(self): - return dict(y_hat=self.y_pred, raw_anomaly_score=self.err) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/tsb_chronos.py b/solvers/tsb_chronos.py index 9c87358..855c842 100644 --- a/solvers/tsb_chronos.py +++ b/solvers/tsb_chronos.py @@ -5,6 +5,8 @@ from TSB_AD.models.Chronos import Chronos from TSB_AD.utils.slidingWindows import find_length +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "TSB-Chronos" @@ -17,6 +19,7 @@ class Solver(BaseSolver): "prediction_length": [1], "model_size": ['base'], "batch_size": [32], + "cutoff": [None], } sampling_strategy = "run_once" @@ -39,16 +42,18 @@ def set_objective(self, X_train, X_test): ) def run(self, _): - print("Running Chronos solver...") self.clf.fit(self.data) - self.score = self.clf.decision_scores_[-len(self.X_test):] - print("Chronos Fitted") + self.anomaly_scores = self.clf.decision_scores_[-len(self.X_test):] + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) - # Map scores to predictions - threshold = np.percentile(self.score, (1 - 0.1) * 100) - self.y_hat = (self.score > threshold).astype(int) del self.clf # Free memory for the model torch.cuda.empty_cache() # Release cached GPU memory def get_result(self): - return dict(y_hat=self.y_hat, raw_anomaly_score=self.score) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/tsb_timesfm.py b/solvers/tsb_timesfm.py index 77a69b7..cfa591b 100644 --- a/solvers/tsb_timesfm.py +++ b/solvers/tsb_timesfm.py @@ -1,9 +1,13 @@ from benchopt import BaseSolver -import torch +from importlib.util import find_spec + import numpy as np +import torch from TSB_AD.model_wrapper import run_TimesFM +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "TSB-TimesFM" @@ -13,6 +17,7 @@ class Solver(BaseSolver): parameters = { "win_size": [256], + "cutoff": [None], } sampling_strategy = "run_once" @@ -23,15 +28,25 @@ def set_objective(self, X_train, X_test): self.data = self.data.reshape(-1, n_features) self.X_test = X_test.reshape(-1, n_features) + def skip(self, X_train, X_test): + if find_spec("timesfm") is None: + return True, "TSB-TimesFM requires the optional timesfm package." + return False, None + def run(self, _): - self.y_hat = run_TimesFM( + anomaly_scores = run_TimesFM( data=self.data, win_size=self.win_size, ) - self.raw_anomaly_score = self.y_hat[-len(self.X_test):] + self.anomaly_scores = anomaly_scores[-len(self.X_test):] + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) torch.cuda.empty_cache() # Release cached GPU memory def get_result(self): - threshold = np.percentile(self.raw_anomaly_score, 90) - self.y_hat = (self.raw_anomaly_score > threshold).astype(int) - return dict(y_hat=self.y_hat, raw_anomaly_score=self.raw_anomaly_score) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/tsb_timesnet.py b/solvers/tsb_timesnet.py index 76ed261..ed431ae 100644 --- a/solvers/tsb_timesnet.py +++ b/solvers/tsb_timesnet.py @@ -3,6 +3,8 @@ import torch from TSB_AD.models.TimesNet import TimesNet +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "TSB-TimesNet" @@ -13,6 +15,20 @@ class Solver(BaseSolver): parameters = { "window_size": [256], "lr": [1e-4], + "epochs": [10], + "batch_size": [128], + "cutoff": [None], + } + + test_config = { + "dataset": { + "n_samples": 512, + "n_features": 2, + "n_anomaly": 32, + }, + "window_size": 32, + "epochs": 1, + "batch_size": 16, } sampling_strategy = "run_once" @@ -25,8 +41,8 @@ def set_objective(self, X_train, X_test): self.clf = TimesNet( win_size=self.window_size, enc_in=n_features, - epochs=10, - batch_size=128, + epochs=self.epochs, + batch_size=self.batch_size, lr=self.lr, patience=3, features="M", @@ -36,13 +52,25 @@ def set_objective(self, X_train, X_test): def run(self, _): self.clf.fit(self.X_train) - self.raw_anomaly_score = self.clf.decision_function(self.X_test) + self.anomaly_scores = self.clf.decision_function(self.X_test) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) - print("TimesNet done") del self.clf.model del self.clf torch.cuda.empty_cache() # Release cached GPU memory + def skip(self, X_train, X_test): + if X_train.shape[-1] < self.window_size: + return True, "Not enough training samples to create a window." + if X_test.shape[-1] < self.window_size: + return True, "Not enough testing samples to create a window." + return False, None + def get_result(self): - self.y_hat = (self.raw_anomaly_score > 0).astype(int) - return dict(y_hat=self.y_hat, raw_anomaly_score=self.raw_anomaly_score) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/vae.py b/solvers/vae.py index 75e66c9..9dfd400 100644 --- a/solvers/vae.py +++ b/solvers/vae.py @@ -1,9 +1,9 @@ from benchopt import BaseSolver import torch -import numpy as np from pyod.models.vae import VAE +from benchmark_utils.predictions import cutoff_scores from benchmark_utils.windowing import make_windows @@ -11,7 +11,7 @@ class Solver(BaseSolver): name = "VAE" install_cmd = "conda" - requirements = ["pip::pyod", "pip::torch"] + requirements = ["pyod", "pytorch"] sampling_strategy = "run_once" @@ -26,12 +26,11 @@ class Solver(BaseSolver): "latent_dim": [2, 5, 10], "batch_norm": [True], "dropout_rate": [0.1, 0.2, 0.5], + "cutoff": [None], } test_config = { - 'solver': { - "n_epochs": 1, - "window_size": 16, - } + "n_epochs": 1, + "window_size": 16, } def set_objective(self, X_train, X_test): @@ -70,7 +69,14 @@ def set_objective(self, X_train, X_test): def run(self, _): self.clf.fit(self.Xw_train) - self.y_pred = self.clf.predict(self.Xw_test) + self.anomaly_scores = self.clf.decision_function(self.Xw_test) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) def get_result(self): - return dict(y_hat=self.y_pred) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/vanilla-transformer.py b/solvers/vanilla-transformer.py index 60b06be..11f91dd 100644 --- a/solvers/vanilla-transformer.py +++ b/solvers/vanilla-transformer.py @@ -11,13 +11,14 @@ from benchmark_utils.models import TransformerModel from benchmark_utils.windowing import make_windowed_dataset from benchmark_utils.windowing import reconstruct_from_windows +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): name = "Transformer" install_cmd = "conda" - requirements = ["pip::torch", "tqdm"] + requirements = ["pytorch", "tqdm"] sampling_strategy = "run_once" @@ -31,13 +32,11 @@ class Solver(BaseSolver): "horizon": [1], "window_size": [256], "stride": [1], - "percentile": [97], + "cutoff": [None], } test_config = { - 'solver': { - "n_epochs": 1, - "window_size": 16, - } + "n_epochs": 1, + "window_size": 16, } def set_objective(self, X_train, X_test): @@ -113,7 +112,9 @@ def run(self, _): total_loss += loss.item() avg_loss = total_loss / (len(self.Xw_train) // self.batch_size) - ti.set_description(f"Epoch {epoch} (loss={avg_loss:.5e})") + ti.set_description( + f"Epoch {epoch} (loss={avg_loss:.5e})" + ) # Learning rate scheduling self.scheduler.step(avg_loss) @@ -122,7 +123,6 @@ def run(self, _): if avg_loss < best_loss: best_loss = avg_loss no_improve = 0 - torch.save(self.model.state_dict(), 'best_model.pth') else: no_improve += 1 if no_improve == patience: @@ -148,21 +148,22 @@ def run(self, _): n_features=self.X_test.shape[1] ) - # Calculating the percentile value for the threshold - percentile_value = np.percentile( - np.abs(self.X_test[..., self.window_size:] - - x_hat[..., self.window_size:]), - self.percentile + reconstruction_err = np.abs( + self.X_test[..., self.window_size:] - x_hat[..., self.window_size:] ) - - # Thresholding - predictions = np.zeros_like(self.X_test)-1 - predictions[..., self.window_size:] = np.where( - np.abs(self.X_test[..., self.window_size:] - - x_hat[..., self.window_size:]) > percentile_value, 1, 0 + self.anomaly_scores = np.full( + self.X_test.shape[:1] + self.X_test.shape[2:], + np.nan, + dtype=float, + ) + self.anomaly_scores[..., self.window_size:] = np.max( + reconstruction_err, axis=1 ) - self.predictions = np.max(predictions, axis=1) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) def skip(self, X_train, X_test): if X_train.shape[-1] < self.window_size + self.horizon: @@ -170,4 +171,7 @@ def skip(self, X_train, X_test): return False, None def get_result(self): - return dict(y_hat=self.predictions) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/test_config.py b/test_config.py index fd149be..3606a74 100644 --- a/test_config.py +++ b/test_config.py @@ -1,10 +1,21 @@ import sys # noqa: F401 +from importlib.util import find_spec import pytest # noqa: F401 from benchopt.utils.sys_info import get_cuda_version +OPTIONAL_BACKEND_INSTALL_XFAILS = { + "dagmm": "DAGMM depends on the optional salesforce-merlion package.", + "mp": "MP depends on the optional TSB-AD package.", + "rosecdl": "RoseCDL depends on an optional GitHub package.", + "tsb-chronos": "TSB-Chronos depends on the optional TSB-AD backend.", + "tsb-timesfm": "TSB-TimesFM depends on TSB-AD and timesfm.", + "tsb-timesnet": "TSB-TimesNet depends on the optional TSB-AD backend.", +} + + def check_test_solver_install(benchmark, solver_class): """Hook called in `test_solver_install`. @@ -12,11 +23,16 @@ def check_test_solver_install(benchmark, solver_class): particular architecture, call pytest.xfail when detecting the situation. """ - if solver_class.name.lower() == "dif": + solver_name = solver_class.name.lower() + + if solver_name in OPTIONAL_BACKEND_INSTALL_XFAILS: + pytest.xfail(OPTIONAL_BACKEND_INSTALL_XFAILS[solver_name]) + + if solver_name == "dif": if get_cuda_version() is None: pytest.xfail("Deep IsolationForest needs a working GPU hardware.") - if solver_class.name.lower() == "anomalybert": + if solver_name == "anomalybert": pytest.xfail("AnomalyBERT needs to be installed locally from repo" " at https://github.com/Jhryu30/AnomalyBERT.git") @@ -29,11 +45,20 @@ def check_test_solver_install(benchmark, solver_class): # pytest.xfail("Transformer needs a working GPU hardware.") -def check_test_dataset_get_data(benchmark, data_class): - if data_class.name.lower() in [ +def check_test_solver_run(benchmark, solver_class): + """Hook called in `test_solver_run`.""" + if solver_class.name.lower() == "tsb-timesfm": + if find_spec("timesfm") is None: + pytest.xfail( + "TSB-TimesFM needs the optional timesfm package." + ) + + +def check_test_dataset_get_data(benchmark, dataset_class): + if dataset_class.name.lower() in [ "daphnet", "dodgers", "ecg", "genesis", "ghl", "iops", "kdd21", "mgab", "mitdb", "nab", "occupancy", "opportunity", "sensorscope", "smd", "svdb", "yahoo" ]: - pytest.xfail(f"{data_class.name} dataset is not downloaded.") + pytest.xfail(f"{dataset_class.name} dataset is not downloaded.") From cd77fe3111cff238d0c962c7d5a5cc8ff3644ba2 Mon Sep 17 00:00:00 2001 From: Jad Date: Mon, 18 May 2026 17:25:39 +0200 Subject: [PATCH 39/50] Perf vectorize metrics (#33) * Vectorize `soft_precision`, `soft_recall`, and `extract_anomaly_ranges`. --- benchmark_utils/metrics.py | 156 ++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 82 deletions(-) diff --git a/benchmark_utils/metrics.py b/benchmark_utils/metrics.py index febe234..9bcad02 100644 --- a/benchmark_utils/metrics.py +++ b/benchmark_utils/metrics.py @@ -1,6 +1,26 @@ import numpy as np +def _dilate(mask: np.ndarray, radius: int) -> np.ndarray: + """Binary dilation with a centered window of half-width ``radius``. + + ``out[i]`` is True iff any entry of ``mask`` in ``[i-radius, i+radius]`` + (clipped to the array) is truthy. Matches the half-open slice + ``mask[max(0, i-r):min(n, i+r+1)]`` used by the soft metrics. + """ + mask = np.asarray(mask) + n = mask.shape[0] + if n == 0: + return np.zeros(0, dtype=bool) + if radius <= 0: + return mask.astype(bool, copy=False) + cum = np.concatenate(([0], np.cumsum(mask.astype(np.int64)))) + idx = np.arange(n) + left = np.maximum(0, idx - radius) + right = np.minimum(n, idx + radius + 1) + return (cum[right] - cum[left]) > 0 + + def soft_precision(y_true: np.ndarray, y_pred: np.ndarray, detection_range=3, @@ -32,47 +52,34 @@ def soft_precision(y_true: np.ndarray, fa : int Number of false anomalies """ - # EM : Exact Match - em = 0 - # DA : Detected Anomaly - da = 0 - # FA : False Anomaly - fa = 0 + y_true = np.asarray(y_true) + y_pred = np.asarray(y_pred) - # TFDIR = (EM + DA) / (EM + DA + FA) + true_mask = y_true == 1 + pred_mask = y_pred == 1 - # Counting exact matches - for i in range(len(y_true)): - if y_true[i] == 1 and (y_true[i] == y_pred[i]): - em += 1 - - # False anomaly and detected anomalies - for i in range(len(y_true)): + # TFDIR = (EM + DA) / (EM + DA + FA) - left = max(0, i-detection_range) - right = min(len(y_true), i+detection_range+1) + # EM : Exact Match + em = int(np.sum(true_mask & pred_mask)) - if y_pred[i] == 1 and ( - y_true[left:right] == 0).all(): - fa += 1 + true_dil = _dilate(true_mask, detection_range) + pred_dil = _dilate(pred_mask, detection_range) - if y_true[i] == 1 and ( - y_pred[left:right] == 1).any(): - da += 1 + # DA : Detected Anomaly + fa = int(np.sum(pred_mask & ~true_dil)) + # FA : False Anomaly # Removing exact matches from detected anomalies because they are # counted twice - da -= em - - if return_counts: - if em + da + fa == 0: - return 0, em, da, fa + da = int(np.sum(true_mask & pred_dil)) - em - return (em + da) / (em + da + fa), em, da, fa + total = em + da + fa + score = (em + da) / total if total else 0 - if em + da + fa == 0: - return 0 - return (em + da) / (em + da + fa) + if return_counts: + return score, em, da, fa + return score def soft_recall(y_true: np.ndarray, @@ -101,46 +108,25 @@ def soft_recall(y_true: np.ndarray, ma : int Number of missed anomalies """ - # EM : Exact Match - em = 0 - # DA : Detected Anomaly - da = 0 - # MA : Missed Anomaly - ma = 0 - # DAIR = (EM + DA) / (EM + DA + MA) + y_true = np.asarray(y_true) + y_pred = np.asarray(y_pred) - # Counting exact matches - for i in range(len(y_true)): - if y_true[i] == 1 and (y_true[i] == y_pred[i]): - em += 1 + true_mask = y_true == 1 + pred_mask = y_pred == 1 - # Missing values and detected anomalies - for i in range(len(y_true)): + em = int(np.sum(true_mask & pred_mask)) - left = max(0, i-detection_range) - right = min(len(y_true), i+detection_range+1) + pred_dil = _dilate(pred_mask, detection_range) - if y_true[i] == 1 and ( - y_pred[left:right] == 0).all(): - ma += 1 + ma = int(np.sum(true_mask & ~pred_dil)) + da = int(np.sum(true_mask & pred_dil)) - em - if y_true[i] == 1 and ( - y_pred[left:right] == 1).any(): - da += 1 - - # Removing exact matches from detected anomalies because they are - # counted twice - da -= em + total = em + da + ma + score = (em + da) / total if total else 0 if return_counts: - if em + da + ma == 0: - return 0, em, da, ma - - return (em + da) / (em + da + ma), em, da, ma - - if em + da + ma == 0: - return 0 - return (em + da) / (em + da + ma) + return score, em, da, ma + return score def ctt(y_true: np.ndarray, y_pred: np.ndarray, return_signed: bool = False): @@ -237,22 +223,34 @@ def ttc(y_true: np.ndarray, y_pred: np.ndarray, return_signed: bool = False): return tot_dist / np.sum(y_true) -def soft_f1(precision, recall): +def soft_f1(precision, recall, detection_range=None): """ Calculate the F1 score from precision and recall. Parameters ---------- - precision : float + precision : float or np.ndarray Precision score - recall : float + recall : float or np.ndarray Recall score + detection_range : int, optional + If provided, ``precision`` and ``recall`` are interpreted as the + true and predicted label arrays used by ``soft_precision`` and + ``soft_recall``. Returns ------- f1 : float F1 score """ + if detection_range is not None: + precision_score = soft_precision( + precision, recall, detection_range=detection_range + ) + recall_score = soft_recall( + precision, recall, detection_range=detection_range) + precision, recall = precision_score, recall_score + if precision + recall == 0: return 0 return 2 * (precision * recall) / (precision + recall) @@ -277,21 +275,15 @@ def extract_anomaly_ranges(labels: list[int]): Each tuple represents a range (start_index, end_index) where anomalies are present. """ - ranges = [] - start = None - - for i, label in enumerate(labels): - if label == 1 and start is None: - start = i # Start of a new anomaly range - elif label == 0 and start is not None: - ranges.append((start, i - 1)) # End of the current anomaly range - start = None - - # Handle the case where the series ends with an anomaly - if start is not None: - ranges.append((start, len(labels) - 1)) - - return ranges + arr = np.asarray(labels) + if arr.size == 0: + return [] + binary = (arr == 1).astype(np.int8) + padded = np.concatenate(([0], binary, [0])) + diff = np.diff(padded) + starts = np.where(diff == 1)[0] + ends = np.where(diff == -1)[0] - 1 + return list(zip(starts.tolist(), ends.tolist())) def existence_reward(real_range, predicted_ranges): From dde01b101c06e900d59c104e046e68cdfc6a3822 Mon Sep 17 00:00:00 2001 From: Jad Date: Mon, 18 May 2026 17:26:38 +0200 Subject: [PATCH 40/50] Fix mean overlaping pred (#34) * Fix off-by-one in `mean_overlaping_pred` reconstructed length. --- benchmark_utils/__init__.py | 4 ++- tests/test_mean_overlaping_pred.py | 47 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 tests/test_mean_overlaping_pred.py diff --git a/benchmark_utils/__init__.py b/benchmark_utils/__init__.py index cd5be6c..eaeac2f 100644 --- a/benchmark_utils/__init__.py +++ b/benchmark_utils/__init__.py @@ -22,7 +22,9 @@ def mean_overlaping_pred(predictions, stride): np.ndarray: Averaged predictions for each feature. """ n_windows, H, n_features = predictions.shape - total_length = (n_windows-1) * stride + H - 1 + # The last window starts at (n_windows-1)*stride and covers H samples, so + # the reconstructed signal spans (n_windows-1)*stride + H positions. + total_length = (n_windows - 1) * stride + H # Array to store accumulated predictions for each feature accumulated = np.zeros((total_length, n_features)) diff --git a/tests/test_mean_overlaping_pred.py b/tests/test_mean_overlaping_pred.py new file mode 100644 index 0000000..4189fa4 --- /dev/null +++ b/tests/test_mean_overlaping_pred.py @@ -0,0 +1,47 @@ +import numpy as np + +from benchmark_utils import mean_overlaping_pred + + +def test_length_horizon_one_stride_one(): + # 5 windows, horizon=1, stride=1 → reconstructed signal length is 5 + preds = np.arange(5).reshape(5, 1, 1).astype(float) + out = mean_overlaping_pred(preds, stride=1) + assert out.shape == (5, 1) + assert np.allclose(out.ravel(), np.arange(5)) + + +def test_length_horizon_gt_one(): + # 4 windows, H=3, stride=1 → (4-1)*1 + 3 = 6 positions + preds = np.ones((4, 3, 2)) + out = mean_overlaping_pred(preds, stride=1) + assert out.shape == (6, 2) + # every position covered, averaged value is 1.0 + assert np.allclose(out, 1.0) + + +def test_overlap_averages_correctly(): + # H=2, stride=1, 3 windows. Index 1 is covered by windows 0 and 1, + # index 2 by windows 1 and 2. + preds = np.array( + [[[1.0], [2.0]], + [[3.0], [4.0]], + [[5.0], [6.0]]] + ) + out = mean_overlaping_pred(preds, stride=1) + # positions: 0 -> 1, 1 -> mean(2, 3) = 2.5, 2 -> mean(4, 5) = 4.5, 3 -> 6 + assert out.shape == (4, 1) + assert np.allclose(out.ravel(), [1.0, 2.5, 4.5, 6.0]) + + +def test_stride_gt_one_no_overlap(): + # H=2, stride=2 → windows tile end-to-end + preds = np.array( + [[[1.0], [2.0]], + [[3.0], [4.0]], + [[5.0], [6.0]]] + ) + out = mean_overlaping_pred(preds, stride=2) + # (3-1)*2 + 2 = 6 positions, no overlap + assert out.shape == (6, 1) + assert np.allclose(out.ravel(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) From f2d1396443c2f97c8c8010239a416e93c935fb06 Mon Sep 17 00:00:00 2001 From: Jad Date: Mon, 18 May 2026 17:32:23 +0200 Subject: [PATCH 41/50] Enh/find period lenght (#35) * CLN remove safe_import_context * Clear separation between `anomaly_scores` and optional `anomaly_predictions` (binary, used via solver-side `cutoff`). * Adapt solvers to score/prediction contract with `cutoff` parameter. * Adds `find_period_length` helper instead of importing TSB-AD only for this. --- benchmark_utils/predictions.py | 34 +++ benchmark_utils/windowing.py | 47 +++- objective.py | 361 ++++++++++++++++++++++------- solvers/AR.py | 53 ++--- solvers/anomalybert.py | 24 +- solvers/autoencoder.py | 37 ++- solvers/dagmm.py | 30 +-- solvers/legacy/abod.py | 44 ++-- solvers/legacy/cblof.py | 43 ++-- solvers/legacy/dif.py | 43 ++-- solvers/legacy/isolation-forest.py | 46 ++-- solvers/legacy/lof.py | 43 ++-- solvers/legacy/ocsvm.py | 42 ++-- solvers/lstm.py | 34 +-- solvers/matrixprofile.py | 42 ++-- solvers/rosecdl.py | 24 +- solvers/tsb_chronos.py | 19 +- solvers/tsb_timesfm.py | 27 ++- solvers/tsb_timesnet.py | 40 +++- solvers/vae.py | 22 +- solvers/vanilla-transformer.py | 46 ++-- test_config.py | 35 ++- tests/test_objective.py | 110 +++++++++ tests/test_predictions.py | 33 +++ 24 files changed, 883 insertions(+), 396 deletions(-) create mode 100644 benchmark_utils/predictions.py create mode 100644 tests/test_objective.py create mode 100644 tests/test_predictions.py diff --git a/benchmark_utils/predictions.py b/benchmark_utils/predictions.py new file mode 100644 index 0000000..7517dbc --- /dev/null +++ b/benchmark_utils/predictions.py @@ -0,0 +1,34 @@ +import numpy as np + + +def cutoff_scores(anomaly_scores, cutoff=None): + """Turn anomaly scores into binary predictions using a contamination rate. + + Larger scores are assumed to be more anomalous. NaN entries are preserved + as ``-1`` ignore labels so they are masked by the objective. + """ + if cutoff is None: + return None + + validate_cutoff(cutoff) + + scores = np.asarray(anomaly_scores) + predictions = np.full(scores.shape, -1, dtype=int) + valid = ~np.isnan(scores) + if not np.any(valid): + return predictions + + threshold = np.quantile(scores[valid], 1 - cutoff) + + predictions[valid] = (scores[valid] >= threshold).astype(int) + return predictions + + +def validate_cutoff(cutoff): + if cutoff is None: + raise ValueError("cutoff must be provided.") + if not 0 < cutoff < 1: + raise ValueError( + "cutoff must be in (0, 1), " + f"got {cutoff!r}." + ) diff --git a/benchmark_utils/windowing.py b/benchmark_utils/windowing.py index 40399b9..362135b 100644 --- a/benchmark_utils/windowing.py +++ b/benchmark_utils/windowing.py @@ -3,6 +3,49 @@ from torch.utils.data import TensorDataset +def find_period_length(data, default=125): + """Estimate a reasonable period length from autocorrelation. + + This local helper replaces the small ``TSB_AD`` utility previously used by + several solvers, avoiding a heavy optional dependency for solvers that only + need automatic window sizing. + """ + data = np.asarray(data) + if data.ndim > 1: + return 0 + + data = data[: min(20_000, len(data))] + if len(data) < 6: + return 0 + + centered = data - data.mean() + norm = np.dot(centered, centered) + if norm == 0: + return default + + max_lag = min(400, len(centered) - 1) + autocorr = np.correlate(centered, centered, mode="full") + autocorr = autocorr[len(centered) - 1: len(centered) + max_lag] / norm + + base = 3 + values = autocorr[base:] + if len(values) < 3: + return default + + local_max = ( + np.where((values[1:-1] > values[:-2]) & + (values[1:-1] > values[2:]))[0] + 1 + ) + + if len(local_max) == 0: + return default + + lag = local_max[np.argmax(values[local_max])] + base + if lag < 3 or lag > 300: + return default + return int(lag) + + def make_windows(X, window_size=32, stride=1, padding=False): """Create a windowed view of the data. @@ -82,7 +125,7 @@ def reconstruct_from_windows(windows, stride, batch, n_features): Parameters ---------- windows : np.ndarray - The overlapping windows of shape (batch*n_windows, window_size, n_features) + The overlapping windows of shape (batch*n_windows, window_size, n_feat) stride : int The stride used to create the windows batch : int @@ -110,4 +153,4 @@ def reconstruct_from_windows(windows, stride, batch, n_features): # count contributions np.add.at(cnt, idx.ravel(), 1) - return acc / cnt \ No newline at end of file + return acc / cnt diff --git a/objective.py b/objective.py index 48ee825..04dbde5 100644 --- a/objective.py +++ b/objective.py @@ -3,21 +3,22 @@ soft_precision as soft_precision_score, soft_recall as soft_recall_score, soft_f1 as soft_f1_score, - ctt, ttc, + ctt, + ttc, extract_anomaly_ranges, precision_t as precision_t_score, recall_t as recall_t_score, - f1_t as f1_t_score + f1_t as f1_t_score, ) import numpy as np from sklearn.metrics import ( + average_precision_score, precision_score, recall_score, f1_score, zero_one_loss, roc_auc_score, - precision_recall_curve, ) @@ -27,99 +28,301 @@ class Objective(BaseObjective): install_cmd = "conda" requirements = ["scikit-learn"] + parameters = { + "score_metrics": [("auc_pr", "auc_roc")], + "prediction_metrics": [None], + } + + detection_ranges = (1, 3, 5, 10, 20) + default_prediction_metrics = ( + "precision", + "recall", + "f1", + "precision_t", + "recall_t", + "f1_t", + "ctt", + "ttc", + "zoloss", + "soft_precision", + "soft_recall", + "soft_f1", + ) + def get_one_result(self): - """Return one solution for which the objective can be computed, - Used to get the shape of the result. - Our algorithms will return an array of labels of shape (n_samples,) - """ - return dict(y_hat=np.zeros_like(self.y_test)) + """Return one solution for which the objective can be computed.""" + score_metrics = self._normalize_metrics( + getattr(self, "score_metrics", ("auc_pr", "auc_roc")) + ) + prediction_metrics = self._expand_prediction_metrics( + getattr(self, "prediction_metrics", None) + ) + + result = {} + if score_metrics: + result["anomaly_scores"] = np.zeros_like( + self.y_test, dtype=float + ) + if prediction_metrics: + result["anomaly_predictions"] = np.zeros_like( + self.y_test, dtype=int + ) + return result def set_data(self, X_train, y_test, X_test): "Set the data to compute the objective." self.X_train = X_train self.X_test, self.y_test = X_test, y_test - def evaluate_result(self, y_hat, raw_anomaly_score=None): - """Evaluate the result provided by the solver.""" - print("y_hat shape", y_hat.shape) - print("self.y_test shape", self.y_test.shape) + def evaluate_result( + self, + anomaly_scores=None, + anomaly_predictions=None, + ): + """Evaluate the result provided by the solver. + + anomaly_scores is the score-based solver output. + anomaly_predictions is optional and only needed when requesting + prediction-based metrics. + """ + score_metrics = self._normalize_metrics( + getattr(self, "score_metrics", ("auc_pr", "auc_roc")) + ) + prediction_metrics = self._expand_prediction_metrics( + getattr(self, "prediction_metrics", None) + ) - to_discard = (y_hat == -1).sum() - self.y_test = self.y_test.reshape(-1)[to_discard:] - y_hat = y_hat.reshape(-1)[to_discard:] + if score_metrics and anomaly_scores is None: + raise ValueError("score_metrics require an anomaly_scores array.") + if prediction_metrics and anomaly_predictions is None: + raise ValueError( + "prediction_metrics require an anomaly_predictions array.") - print("y_hat shape after discard", y_hat.shape) - print("self.y_test shape after discard", self.y_test.shape) + y_true, scores, predictions = self._align_inputs( + anomaly_scores=anomaly_scores, + anomaly_predictions=anomaly_predictions, + ) result = {} - detection_ranges = [1, 3, 5, 10, 20] + if score_metrics: + result.update( + self._compute_score_metrics( + y_true=y_true, + anomaly_scores=scores, + metrics=score_metrics, + ) + ) + if prediction_metrics: + result.update( + self._compute_prediction_metrics( + y_true=y_true, + anomaly_predictions=predictions, + metrics=prediction_metrics, + ) + ) - # Standard metrics - precision = precision_score(self.y_test, y_hat, zero_division=0) - recall = recall_score(self.y_test, y_hat, zero_division=0) - f1 = f1_score(self.y_test, y_hat, zero_division=0) + # Setting value to 0. The actual value is not used for ranking. + result["value"] = 0.0 + return result - anomaly_ranges = extract_anomaly_ranges(self.y_test) - prediction_ranges = extract_anomaly_ranges(y_hat) + def get_objective(self): + return dict(X_train=self.X_train, X_test=self.X_test) - precision_t = precision_t_score(anomaly_ranges, prediction_ranges) - recall_t = recall_t_score(anomaly_ranges, prediction_ranges) - f1_t = f1_t_score(anomaly_ranges, prediction_ranges) + def _normalize_metrics(self, metrics): + if metrics is None: + return () + if isinstance(metrics, str): + if metrics == "all": + return ("auc_pr", "auc_roc") + return (metrics,) + return tuple(metric for metric in metrics if metric is not None) - result.update({ - "precision": precision, - "recall": recall, - "f1": f1 - }) + def _expand_prediction_metrics(self, metrics): + metrics = self._normalize_prediction_metrics(metrics) + expanded = [] - for range_value in detection_ranges: - soft_precision = soft_precision_score( - self.y_test, y_hat, detection_range=range_value - ) - soft_recall = soft_recall_score( - self.y_test, y_hat, detection_range=range_value - ) - soft_f1 = soft_f1_score(soft_precision, soft_recall) - - result.update({ - f"soft_precision_{range_value}": soft_precision, - f"soft_recall_{range_value}": soft_recall, - f"soft_f1_{range_value}": soft_f1 - }) - - zoloss = zero_one_loss(self.y_test, y_hat) - - # Other metrics - cct_score = ctt(self.y_test, y_hat) - ttc_score = ttc(self.y_test, y_hat) - - # Add remaining metrics to the result dictionary - result.update({ - "precision_t": precision_t, - "recall_t": recall_t, - "f1_t": f1_t, - "cct": cct_score, - "ttc": ttc_score, - "zoloss": zoloss, - "value": zoloss # having zoloss twice for the API - }) - - # AUC-ROC and AUC-PR - if raw_anomaly_score is not None: - auc_roc = roc_auc_score(self.y_test, raw_anomaly_score) - precision_curve, recall_curve, _ = precision_recall_curve( - self.y_test, raw_anomaly_score) - auc_pr = -np.trapz(precision_curve, recall_curve) - - result["auc_roc"] = auc_roc - result["auc_pr"] = auc_pr - - for key, value in result.items(): - print(f"{key}: {value}") + for metric in metrics: + if metric == "all": + metric = self.default_prediction_metrics + else: + metric = (metric,) + + for name in metric: + if name in { + "soft_precision", + "soft_recall", + "soft_f1", + }: + expanded.extend( + f"{name}_{detection_range}" + for detection_range in self.detection_ranges + ) + else: + expanded.append(name) + return tuple(expanded) + + def _normalize_prediction_metrics(self, metrics): + if metrics is None: + return () + if isinstance(metrics, str): + return (metrics,) + return tuple(metric for metric in metrics if metric is not None) + + def _align_inputs(self, anomaly_scores, anomaly_predictions): + # flatten everything before aligning lengths. + y_true = np.asarray(self.y_test).reshape(-1) + scores = self._as_flat_array(anomaly_scores) + predictions = self._as_flat_array(anomaly_predictions) + + # Only align against arrays that were returned. This keeps + # score-only and prediction-only evaluations valid. + arrays = [array for array in ( + scores, predictions) if array is not None] + if not arrays: + return y_true, None, None + + # Windowed solvers return fewer outputs than y_test because the + # first timestamps have no full context window. Keep the last samples, + # which correspond to the part of y_test the solver scored. + length = min([len(y_true)] + [len(array) for array in arrays]) + y_true = y_true[-length:] + if scores is not None: + scores = scores[-length:] + if predictions is not None: + predictions = predictions[-length:] + + # Drop invalid positions. NaN score padding and -1 prediction padding + # When both scores and predictions are present, the same mask is + # applied to keep mixed metric requests on the same timestamps. + valid = np.ones(length, dtype=bool) + if scores is not None: + valid &= ~np.isnan(scores) + if predictions is not None: + valid &= ~np.isnan(predictions) + valid &= predictions != -1 + + y_true = y_true[valid] + if scores is not None: + scores = scores[valid] + if predictions is not None: + predictions = predictions[valid] + + return y_true, scores, predictions + + def _as_flat_array(self, array): + if array is None: + return None + return np.asarray(array).reshape(-1) + + def _compute_score_metrics(self, y_true, anomaly_scores, metrics): + if len(y_true) == 0: + return {metric: np.nan for metric in metrics} + + result = {} + for metric in metrics: + if metric == "auc_roc": + result[metric] = self._safe_auc_roc(y_true, anomaly_scores) + elif metric == "auc_pr": + result[metric] = self._auc_pr(y_true, anomaly_scores) + else: + raise ValueError(f"Unknown score metric: {metric}") return result - def get_objective(self): - return dict( - X_train=self.X_train, X_test=self.X_test + def _compute_prediction_metrics( + self, + y_true, + anomaly_predictions, + metrics, + ): + if len(y_true) == 0: + return {metric: np.nan for metric in metrics} + + result = {} + anomaly_ranges = None + prediction_ranges = None + + for metric in metrics: + if metric == "precision": + result[metric] = precision_score( + y_true, anomaly_predictions, zero_division=0 + ) + elif metric == "recall": + result[metric] = recall_score( + y_true, anomaly_predictions, zero_division=0 + ) + elif metric == "f1": + result[metric] = f1_score( + y_true, anomaly_predictions, zero_division=0) + elif metric == "zoloss": + result[metric] = zero_one_loss(y_true, anomaly_predictions) + elif metric in {"precision_t", "recall_t", "f1_t"}: + if anomaly_ranges is None: + anomaly_ranges, prediction_ranges = self._get_ranges( + y_true, anomaly_predictions + ) + if metric == "precision_t": + result[metric] = precision_t_score( + anomaly_ranges, prediction_ranges + ) + elif metric == "recall_t": + result[metric] = recall_t_score( + anomaly_ranges, prediction_ranges) + else: + result[metric] = f1_t_score( + anomaly_ranges, prediction_ranges) + elif metric == "ctt": + result[metric] = ctt(y_true, anomaly_predictions) + elif metric == "ttc": + result[metric] = ttc(y_true, anomaly_predictions) + elif metric.startswith("soft_precision_"): + detection_range = self._parse_detection_range( + metric, "soft_precision") + result[metric] = soft_precision_score( + y_true, + anomaly_predictions, + detection_range=detection_range, + ) + elif metric.startswith("soft_recall_"): + detection_range = self._parse_detection_range( + metric, "soft_recall") + result[metric] = soft_recall_score( + y_true, + anomaly_predictions, + detection_range=detection_range, + ) + elif metric.startswith("soft_f1_"): + detection_range = self._parse_detection_range( + metric, "soft_f1") + result[metric] = soft_f1_score( + y_true, + anomaly_predictions, + detection_range=detection_range, + ) + else: + raise ValueError(f"Unknown prediction metric: {metric}") + + return result + + def _get_ranges(self, y_true, anomaly_predictions): + return ( + extract_anomaly_ranges(y_true), + extract_anomaly_ranges(anomaly_predictions), ) + + def _parse_detection_range(self, metric, prefix): + suffix = metric.replace(f"{prefix}_", "", 1) + try: + return int(suffix) + except ValueError as exc: + raise ValueError( + f"Invalid detection range in prediction metric: {metric}" + ) from exc + + def _safe_auc_roc(self, y_true, anomaly_scores): + return roc_auc_score(y_true, anomaly_scores) + + def _auc_pr(self, y_true, anomaly_scores): + if len(np.unique(y_true)) == 1: + return np.nan + return average_precision_score(y_true, anomaly_scores) diff --git a/solvers/AR.py b/solvers/AR.py index ba2446e..50eb908 100644 --- a/solvers/AR.py +++ b/solvers/AR.py @@ -8,13 +8,14 @@ from benchmark_utils.models import ARModel from benchmark_utils import mean_overlaping_pred +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): name = "AR" # AutoRegressive Linear model install_cmd = "conda" - requirements = ["pip::torch", "tqdm"] + requirements = ["pytorch", "tqdm"] sampling_strategy = "run_once" @@ -25,14 +26,12 @@ class Solver(BaseSolver): "weight_decay": [1e-7], "window_size": [100], "horizon": [1], - "percentile": [99.4], + "cutoff": [None], } test_config = { - 'solver': { - "n_epochs": 1, - "window_size": 16, - } + "n_epochs": 1, + "window_size": 16, } def set_objective(self, X_train, X_test): @@ -62,10 +61,6 @@ def set_objective(self, X_train, X_test): ) self.criterion = nn.MSELoss() - print("IN AR") - print("X_train shape", self.X_train.shape) - print("X_test shape", self.X_test.shape) - if self.X_train is not None: # (n_windows, window_size+horizon, n_features) self.Xw_train = np.lib.stride_tricks.sliding_window_view( @@ -136,31 +131,26 @@ def run(self, _): xw_hat = xw_hat.detach().cpu().numpy() - # Reconstructing the prediction from the predicted windows - # Creating the prediction array with -1 for the unknown values - # Corresponding to the first window_size values - x_hat = np.zeros_like(self.X_test)-1 # (n_test_samples, n_features) - x_hat[self.window_size:self.window_size+self.horizon] = xw_hat[0] + # Reconstructing the prediction from the predicted windows. + # The first ``window_size`` positions have no forecast (no full input + # window precedes them); fill them with -1 as a sentinel. + x_hat = np.zeros_like(self.X_test) - 1 + x_hat[self.window_size:] = mean_overlaping_pred(xw_hat, 1) - x_hat[self.window_size+self.horizon:] = mean_overlaping_pred( - xw_hat, 1 + reconstruction_err = np.abs( + self.X_test[self.window_size:] - x_hat[self.window_size:] ) - - # Calculating the percentile value for the threshold - percentile_value = np.percentile( - np.abs(self.X_test[self.window_size:] - x_hat[self.window_size:]), - self.percentile + self.anomaly_scores = np.full( + self.X_test.shape, np.nan, dtype=float ) + self.anomaly_scores[self.window_size:] = reconstruction_err + self.anomaly_scores = np.max(self.anomaly_scores, axis=1) - # Thresholding - predictions = np.zeros_like(x_hat)-1 - predictions[self.window_size:] = np.where( - np.abs(self.X_test[self.window_size:] - - x_hat[self.window_size:]) > percentile_value, 1, 0 + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) - self.predictions = np.max(predictions, axis=1) - # Skipping the solver call if a condition is met def skip(self, X_train, X_test): if X_train.shape[0]*X_train.shape[2] < self.window_size + self.horizon: @@ -170,4 +160,7 @@ def skip(self, X_train, X_test): return False, None def get_result(self): - return dict(y_hat=self.predictions) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/anomalybert.py b/solvers/anomalybert.py index abcda9a..8c9d1b4 100644 --- a/solvers/anomalybert.py +++ b/solvers/anomalybert.py @@ -1,3 +1,4 @@ +from models.anomaly_transformer import get_anomaly_transformer from benchopt import BaseSolver import sys @@ -8,17 +9,17 @@ from torch.optim.lr_scheduler import CosineAnnealingLR from tqdm import tqdm +from benchmark_utils.predictions import cutoff_scores + # Add AnomalyBERT to path sys.path.append(str(Path(__file__).parent.parent / 'AnomalyBERT')) -from models.anomaly_transformer import get_anomaly_transformer - class Solver(BaseSolver): name = "AnomalyBERT" sampling_strategy = "run_once" - requirements = ["pip::timm", "pip::torch", "pip::numpy", "pip::tqdm"] + requirements = ["pip::timm", "pytorch", "numpy", "tqdm"] parameters = { "patch_size": [1], @@ -31,6 +32,7 @@ class Solver(BaseSolver): "seed": [548920], "device": ["cuda:1"], "window_sliding": [16], + "cutoff": [None], } sampling_strategy = "run_once" @@ -237,7 +239,6 @@ def replacing_weights(interval_len): optimizer.step() scheduler.step() - def get_result(self): device = torch.device( self.device if torch.cuda.is_available() else 'cpu') self.model.eval() @@ -281,9 +282,14 @@ def get_result(self): n_overlap[idx:idx+window_size] += 1 n_overlap[n_overlap == 0] = 1 - scores = (output_values / n_overlap).cpu().numpy() - - threshold = np.percentile(scores, (1 - 0.1) * 100) - y_hat = (scores > threshold).astype(int) + self.anomaly_scores = (output_values / n_overlap).cpu().numpy() + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) - return dict(y_hat=y_hat, raw_anomaly_score=scores) + def get_result(self): + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/autoencoder.py b/solvers/autoencoder.py index f5a138e..a36fad0 100644 --- a/solvers/autoencoder.py +++ b/solvers/autoencoder.py @@ -1,17 +1,17 @@ from benchopt import BaseSolver -import numpy as np -from TSB_AD.utils.slidingWindows import find_length from sklearn.preprocessing import MinMaxScaler from benchmark_utils.models import Autoencoder +from benchmark_utils.predictions import cutoff_scores +from benchmark_utils.windowing import find_period_length class Solver(BaseSolver): name = "AE" install_cmd = "conda" - requirements = ["pip::tsb-uad", "scikit-learn"] + requirements = ["pytorch", "scikit-learn", "tqdm"] parameters = { "window_size": [10, "auto"], @@ -20,13 +20,20 @@ class Solver(BaseSolver): "learning_rate": [1e-3], "hidden_size": [64], "latent_size": [32], + "cutoff": [None], + } + + test_config = { + "window_size": 10, + "num_epochs": 1, + "batch_size": 8, } sampling_strategy = "run_once" def set_objective(self, X_train, X_test): if self.window_size == "auto": - self.window_size = find_length(X_train) + self.window_size = find_period_length(X_train.reshape(-1)) # Data received has shape (n_recordings, n_features, n_samples) n_features = X_train.shape[1] @@ -50,23 +57,29 @@ def run(self, _): ) self.clf.predict(self.X_test) - score = self.clf.decision_scores_ + anomaly_scores = self.clf.decision_scores_ - self.score = ( + self.anomaly_scores = ( MinMaxScaler(feature_range=(0, 1)) - .fit_transform(score.reshape(-1, 1)) + .fit_transform(anomaly_scores.reshape(-1, 1)) .ravel() ) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) def skip(self, X_train, X_test): """Check if the solver can be skipped.""" - if find_length(X_train) == 0 and self.window_size == "auto": + if find_period_length(X_train.reshape(-1)) == 0 and ( + self.window_size == "auto" + ): return True, "Window size is 0" return False, None def get_result(self): """Return the result of the solver.""" - # Binarizing the scores to 0 and 1 - # TEMPORARY SOLUTION - self.final_score = np.where(self.score > 0.75, 1, 0) - return dict(y_hat=self.final_score) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/dagmm.py b/solvers/dagmm.py index 99182f4..148fbc3 100644 --- a/solvers/dagmm.py +++ b/solvers/dagmm.py @@ -1,10 +1,10 @@ from benchopt import BaseSolver -import numpy as np import pandas as pd from merlion.models.anomaly.dagmm import DAGMM, DAGMMConfig from merlion.utils.time_series import TimeSeries -from sklearn.preprocessing import MinMaxScaler + +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): @@ -22,6 +22,7 @@ class Solver(BaseSolver): "batch_size": [8192], "lambda_energy": [0.1], "lambda_cov": [0.005], + "cutoff": [None], # "device": ["cuda:3"] } @@ -36,14 +37,10 @@ def set_objective(self, X_train, X_test): train_df = pd.DataFrame(self.X_train) test_df = pd.DataFrame(self.X_test) - print("Dataframe OK") - # Merlion expects a time index or it will generate one self.train_data = TimeSeries.from_pd(train_df) self.test_data = TimeSeries.from_pd(test_df) - print("TimeSeries OK") - # Configure DAGMM config = DAGMMConfig( gmm_k=self.gmm_k, @@ -66,17 +63,14 @@ def run(self, _): # Predict # get_anomaly_score returns a TimeSeries of scores scores_ts = self.model.get_anomaly_score(self.test_data) - self.scores = scores_ts.to_pd().values.flatten() + self.anomaly_scores = scores_ts.to_pd().values.flatten() + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) def get_result(self): - # Normalize scores to 0-1 range for thresholding - scaler = MinMaxScaler(feature_range=(0, 1)) - scores_norm = scaler.fit_transform(self.scores.reshape(-1, 1)).ravel() - - # Simple thresholding - y_hat = np.where(scores_norm > 0.75, 1, 0) - - return dict( - y_hat=y_hat, - raw_anomaly_score=self.scores - ) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/abod.py b/solvers/legacy/abod.py index 52e6e53..13bca75 100644 --- a/solvers/legacy/abod.py +++ b/solvers/legacy/abod.py @@ -5,6 +5,8 @@ from pyod.models.abod import ABOD import numpy as np +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "ABOD" # Angle-Based Outlier Detection @@ -18,6 +20,7 @@ class Solver(BaseSolver): "window": [True], "window_size": [20], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" @@ -51,31 +54,18 @@ def run(self, _): flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) self.clf.fit(flatrain) - - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = self.clf.decision_function(flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) # Function used to skip a solver call when n_neighbors >= window_size @@ -88,5 +78,7 @@ def get_result(self): # Anomaly : 1 # Inlier : 0 # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/cblof.py b/solvers/legacy/cblof.py index 452be07..1e65a7b 100644 --- a/solvers/legacy/cblof.py +++ b/solvers/legacy/cblof.py @@ -5,6 +5,8 @@ from pyod.models.cblof import CBLOF import numpy as np +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "CBLOF" @@ -18,6 +20,7 @@ class Solver(BaseSolver): "n_clusters": [10], "window_size": [20], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" @@ -50,30 +53,18 @@ def run(self, _): flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = self.clf.decision_function(flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) # Skipping the solver call if a condition is met @@ -86,5 +77,7 @@ def get_result(self): # Anomaly : 1 # Inlier : 0 # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/dif.py b/solvers/legacy/dif.py index b3b1f5f..36de441 100644 --- a/solvers/legacy/dif.py +++ b/solvers/legacy/dif.py @@ -4,6 +4,8 @@ from pyod.models.dif import DIF import numpy as np +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "DIF" @@ -16,6 +18,7 @@ class Solver(BaseSolver): "window": [True], "window_size": [20], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" @@ -47,30 +50,18 @@ def run(self, _): flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = self.clf.decision_function(flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) def skip(self, X_train, X_test): @@ -82,5 +73,7 @@ def get_result(self): # Anomaly : 1 # Inlier : 0 # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/isolation-forest.py b/solvers/legacy/isolation-forest.py index 9215294..83e8839 100644 --- a/solvers/legacy/isolation-forest.py +++ b/solvers/legacy/isolation-forest.py @@ -5,6 +5,8 @@ from sklearn.ensemble import IsolationForest import numpy as np +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "IsolationForest" @@ -17,6 +19,7 @@ class Solver(BaseSolver): "window": [True], "window_size": [60, 120, 180], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" @@ -50,24 +53,14 @@ def run(self, _): self.Xw_test.shape[0] * self.Xw_test.shape[1], -1) self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) + anomaly_scores = -self.clf.decision_function(flatest) # The results we get has a shape of n_recordings, n_features, n_windows, _ = self.Xw_test.shape - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Reshape back to original structure - self.raw_y_hat = self.raw_y_hat.reshape( - n_recordings, n_features, n_windows) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = self.raw_anomaly_score.reshape( + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + self.anomaly_scores = self.anomaly_scores.reshape( n_recordings, n_features, n_windows) else: # No windowing case @@ -77,15 +70,18 @@ def run(self, _): X_test_flat = self.X_test.reshape(-1, n_features) self.clf.fit(X_train_flat) - self.raw_y_hat = self.clf.predict(X_test_flat) - self.raw_anomaly_score = self.clf.decision_function(X_test_flat) + self.anomaly_scores = -self.clf.decision_function(X_test_flat) # Reshape to (n_recordings, n_samples) for single feature case # We assume we take the first feature or average across features - self.raw_y_hat = self.raw_y_hat.reshape(n_recordings, n_samples) - self.raw_anomaly_score = self.raw_anomaly_score.reshape( + self.anomaly_scores = self.anomaly_scores.reshape( n_recordings, n_samples) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + def skip(self, X_train, X_test): # Skip if dataset size is smaller than window size _, _, n_samples = X_train.shape @@ -98,7 +94,13 @@ def get_result(self): # Inlier : 0 # To ignore : -1 # For now, take the first recording - self.y_hat = self.raw_y_hat[0] if ( - self.raw_y_hat.ndim > 1 - ) else self.raw_y_hat - return dict(y_hat=self.y_hat) + anomaly_scores = self.anomaly_scores[0] if ( + self.anomaly_scores.ndim > 1 + ) else self.anomaly_scores + result = dict(anomaly_scores=anomaly_scores) + if self.anomaly_predictions is not None: + anomaly_predictions = self.anomaly_predictions[0] if ( + self.anomaly_predictions.ndim > 1 + ) else self.anomaly_predictions + result["anomaly_predictions"] = anomaly_predictions + return result diff --git a/solvers/legacy/lof.py b/solvers/legacy/lof.py index 00e6534..9075caa 100644 --- a/solvers/legacy/lof.py +++ b/solvers/legacy/lof.py @@ -5,6 +5,8 @@ from sklearn.neighbors import LocalOutlierFactor import numpy as np +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "LocalOutlierFactor" @@ -18,6 +20,7 @@ class Solver(BaseSolver): "window": [True], "window_size": [20], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" @@ -48,30 +51,18 @@ def run(self, _): flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = -self.clf.decision_function(flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) def skip(self, X_train, X_test): @@ -87,5 +78,7 @@ def get_result(self): # Anomaly : 1 # Inlier : 0 # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/ocsvm.py b/solvers/legacy/ocsvm.py index 96f2f6f..1813763 100644 --- a/solvers/legacy/ocsvm.py +++ b/solvers/legacy/ocsvm.py @@ -3,6 +3,8 @@ from sklearn.svm import OneClassSVM import numpy as np +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "OCSVM" @@ -17,6 +19,7 @@ class Solver(BaseSolver): "window": [True], "window_size": [128], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" @@ -47,30 +50,18 @@ def set_objective(self, X_train, X_test): def run(self, _): if self.window: self.clf.fit(self.flatrain) - raw_y_hat = self.clf.predict(self.flatest) - raw_anomaly_score = self.clf.decision_function(self.flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = -self.clf.decision_function(self.flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) def skip(self, X_train, X_test): @@ -79,4 +70,7 @@ def skip(self, X_train, X_test): return False, None def get_result(self): - return dict(y_hat=self.raw_y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/lstm.py b/solvers/lstm.py index 1c126ad..ff4a975 100644 --- a/solvers/lstm.py +++ b/solvers/lstm.py @@ -10,13 +10,14 @@ from benchmark_utils.models import AutoEncoderLSTM from benchmark_utils.windowing import make_windowed_dataset from benchmark_utils.windowing import reconstruct_from_windows +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): name = "LSTM" install_cmd = "conda" - requirements = ["pip::torch", "tqdm"] + requirements = ["pytorch", "tqdm"] sampling_strategy = "run_once" @@ -27,18 +28,16 @@ class Solver(BaseSolver): "lr": [1e-5], "window_size": [256], # window_size = seq_len "stride": [1], - "percentile": [97], + "cutoff": [None], "encoder_layers": [32], "decoder_layers": [32], } test_config = { - 'solver': { - "embedding_dim": 2, - "batch_size": 1, - "n_epochs": 1, - "window_size": 16, - } + "embedding_dim": 2, + "batch_size": 1, + "n_epochs": 1, + "window_size": 16, } def set_objective(self, X_train, X_test): @@ -105,9 +104,6 @@ def run(self, _): ti.set_postfix(train_loss=f"{train_loss:.5f}") - # Saving the model - torch.save(self.model.state_dict(), "model.pth") - # Test loop self.model.eval() raw_reconstruction = [] @@ -119,17 +115,18 @@ def run(self, _): raw_reconstruction.append(x_hat.detach().cpu().numpy()) reconstructed_data = np.concatenate(raw_reconstruction, axis=0) reconstructed_data = reconstruct_from_windows( - reconstructed_data, stride=self.stride, - batch=len(self.X_test), n_features=self.n_features + reconstructed_data, stride=self.stride, + batch=len(self.X_test), n_features=self.n_features ) reconstruction_err = np.mean( np.abs(self.X_test - reconstructed_data), axis=1 ) + self.anomaly_scores = reconstruction_err - self.y_hat = np.where( - reconstruction_err > np.percentile( - reconstruction_err, self.percentile), 1, 0 + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) def skip(self, X_train, X_test): @@ -138,4 +135,7 @@ def skip(self, X_train, X_test): return False, None def get_result(self): - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/matrixprofile.py b/solvers/matrixprofile.py index 48ed961..7b91d8d 100644 --- a/solvers/matrixprofile.py +++ b/solvers/matrixprofile.py @@ -1,9 +1,9 @@ from benchopt import BaseSolver from sklearn.preprocessing import MinMaxScaler -import numpy as np +from benchmark_utils.predictions import cutoff_scores +from benchmark_utils.windowing import find_period_length from TSB_AD.models.MatrixProfile import MatrixProfile -from TSB_AD.utils.slidingWindows import find_length class Solver(BaseSolver): @@ -14,6 +14,14 @@ class Solver(BaseSolver): parameters = { "window_size": [128, "auto"], + "cutoff": [None], + } + + test_config = { + "dataset": { + "n_features": 1, + }, + "window_size": 8, } sampling_strategy = "run_once" @@ -29,33 +37,29 @@ def set_objective(self, X_train, X_test): self.X_test = self.X_test.reshape(-1, n_features) if self.window_size == "auto": - self.window_size = int(find_length(X_train.reshape(-1))) - - print("=====================") - print(f"window_size: {self.window_size}") - print("=====================") + self.window_size = int(find_period_length(X_train.reshape(-1))) self.clf = MatrixProfile( window=self.window_size, ) def run(self, _): - print("Running Matrix Profile solver...") # Special solver, fitting on X_test self.clf.fit(self.X_test.reshape(-1)) - print("MP Fitted") - self.scores = self.clf.decision_scores_ - self.score = ( + anomaly_scores = self.clf.decision_scores_ + self.anomaly_scores = ( MinMaxScaler(feature_range=(0, 1)) - .fit_transform(self.scores.reshape(-1, 1)) + .fit_transform(anomaly_scores.reshape(-1, 1)) .ravel() ) - print("MP Scored") - print(f"Score shape: {self.score.shape}") + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) def skip(self, X_train, X_test): """Check if the solver can be skipped.""" - if (find_length(X_train.reshape(-1)) == 0) and ( + if (find_period_length(X_train.reshape(-1)) == 0) and ( self.window_size == "auto"): return True, "Window size is 0" if X_train.shape[1] != 1: @@ -64,7 +68,7 @@ def skip(self, X_train, X_test): def get_result(self): """Return the result of the solver.""" - # Binarizing the scores to 0 and 1 - # TEMPORARY SOLUTION - self.final_score = np.where(self.score > 0.90, 1, 0) - return dict(y_hat=self.final_score, raw_anomaly_score=self.score) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/rosecdl.py b/solvers/rosecdl.py index beabd79..6ccf54c 100644 --- a/solvers/rosecdl.py +++ b/solvers/rosecdl.py @@ -1,8 +1,9 @@ from benchopt import BaseSolver import torch +from benchmark_utils.predictions import cutoff_scores +from benchmark_utils.windowing import find_period_length from rosecdl.rosecdl import RoseCDL -from TSB_AD.utils.slidingWindows import find_length class Solver(BaseSolver): @@ -10,7 +11,7 @@ class Solver(BaseSolver): install_cmd = "conda" requirements = [ - "pip::git+https://github.com/tommoral/rosecdl.git", "pip::torch" + "pytorch", "pip::git+https://github.com/tommoral/rosecdl.git" ] parameters = { @@ -35,6 +36,7 @@ class Solver(BaseSolver): }, ], "plot": [False], + "cutoff": [None], } sampling_strategy = "run_once" @@ -49,11 +51,7 @@ def set_objective(self, X_train, X_test): self.X_test = X_test if self.kernel_size == "auto": - self.kernel_size = int(find_length(X_train.reshape(-1))) - - print("=====================") - print(f"kernel_size: {self.kernel_size}") - print("=====================") + self.kernel_size = int(find_period_length(X_train.reshape(-1))) self.clf = RoseCDL( n_components=self.n_components, @@ -75,7 +73,6 @@ def set_objective(self, X_train, X_test): def run(self, _): self.clf.fit(self.X_train) del self.X_train # Free GPU memory for X_train after fitting - self.y_pred = self.clf.get_outlier_mask(self.X_test) xh, zh = self.clf.csc( torch.tensor(self.X_test, dtype=torch.float32, device=self.device) @@ -88,9 +85,16 @@ def run(self, _): ) err = err.cpu().detach().numpy() # Aggregate errors over channels - self.err = err.sum(axis=1).reshape(-1) + self.anomaly_scores = err.sum(axis=1).reshape(-1) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) del self.clf # Free GPU memory for the model torch.cuda.empty_cache() # Release cached GPU memory def get_result(self): - return dict(y_hat=self.y_pred, raw_anomaly_score=self.err) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/tsb_chronos.py b/solvers/tsb_chronos.py index 9c87358..855c842 100644 --- a/solvers/tsb_chronos.py +++ b/solvers/tsb_chronos.py @@ -5,6 +5,8 @@ from TSB_AD.models.Chronos import Chronos from TSB_AD.utils.slidingWindows import find_length +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "TSB-Chronos" @@ -17,6 +19,7 @@ class Solver(BaseSolver): "prediction_length": [1], "model_size": ['base'], "batch_size": [32], + "cutoff": [None], } sampling_strategy = "run_once" @@ -39,16 +42,18 @@ def set_objective(self, X_train, X_test): ) def run(self, _): - print("Running Chronos solver...") self.clf.fit(self.data) - self.score = self.clf.decision_scores_[-len(self.X_test):] - print("Chronos Fitted") + self.anomaly_scores = self.clf.decision_scores_[-len(self.X_test):] + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) - # Map scores to predictions - threshold = np.percentile(self.score, (1 - 0.1) * 100) - self.y_hat = (self.score > threshold).astype(int) del self.clf # Free memory for the model torch.cuda.empty_cache() # Release cached GPU memory def get_result(self): - return dict(y_hat=self.y_hat, raw_anomaly_score=self.score) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/tsb_timesfm.py b/solvers/tsb_timesfm.py index 77a69b7..cfa591b 100644 --- a/solvers/tsb_timesfm.py +++ b/solvers/tsb_timesfm.py @@ -1,9 +1,13 @@ from benchopt import BaseSolver -import torch +from importlib.util import find_spec + import numpy as np +import torch from TSB_AD.model_wrapper import run_TimesFM +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "TSB-TimesFM" @@ -13,6 +17,7 @@ class Solver(BaseSolver): parameters = { "win_size": [256], + "cutoff": [None], } sampling_strategy = "run_once" @@ -23,15 +28,25 @@ def set_objective(self, X_train, X_test): self.data = self.data.reshape(-1, n_features) self.X_test = X_test.reshape(-1, n_features) + def skip(self, X_train, X_test): + if find_spec("timesfm") is None: + return True, "TSB-TimesFM requires the optional timesfm package." + return False, None + def run(self, _): - self.y_hat = run_TimesFM( + anomaly_scores = run_TimesFM( data=self.data, win_size=self.win_size, ) - self.raw_anomaly_score = self.y_hat[-len(self.X_test):] + self.anomaly_scores = anomaly_scores[-len(self.X_test):] + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) torch.cuda.empty_cache() # Release cached GPU memory def get_result(self): - threshold = np.percentile(self.raw_anomaly_score, 90) - self.y_hat = (self.raw_anomaly_score > threshold).astype(int) - return dict(y_hat=self.y_hat, raw_anomaly_score=self.raw_anomaly_score) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/tsb_timesnet.py b/solvers/tsb_timesnet.py index 76ed261..ed431ae 100644 --- a/solvers/tsb_timesnet.py +++ b/solvers/tsb_timesnet.py @@ -3,6 +3,8 @@ import torch from TSB_AD.models.TimesNet import TimesNet +from benchmark_utils.predictions import cutoff_scores + class Solver(BaseSolver): name = "TSB-TimesNet" @@ -13,6 +15,20 @@ class Solver(BaseSolver): parameters = { "window_size": [256], "lr": [1e-4], + "epochs": [10], + "batch_size": [128], + "cutoff": [None], + } + + test_config = { + "dataset": { + "n_samples": 512, + "n_features": 2, + "n_anomaly": 32, + }, + "window_size": 32, + "epochs": 1, + "batch_size": 16, } sampling_strategy = "run_once" @@ -25,8 +41,8 @@ def set_objective(self, X_train, X_test): self.clf = TimesNet( win_size=self.window_size, enc_in=n_features, - epochs=10, - batch_size=128, + epochs=self.epochs, + batch_size=self.batch_size, lr=self.lr, patience=3, features="M", @@ -36,13 +52,25 @@ def set_objective(self, X_train, X_test): def run(self, _): self.clf.fit(self.X_train) - self.raw_anomaly_score = self.clf.decision_function(self.X_test) + self.anomaly_scores = self.clf.decision_function(self.X_test) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) - print("TimesNet done") del self.clf.model del self.clf torch.cuda.empty_cache() # Release cached GPU memory + def skip(self, X_train, X_test): + if X_train.shape[-1] < self.window_size: + return True, "Not enough training samples to create a window." + if X_test.shape[-1] < self.window_size: + return True, "Not enough testing samples to create a window." + return False, None + def get_result(self): - self.y_hat = (self.raw_anomaly_score > 0).astype(int) - return dict(y_hat=self.y_hat, raw_anomaly_score=self.raw_anomaly_score) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/vae.py b/solvers/vae.py index 75e66c9..9dfd400 100644 --- a/solvers/vae.py +++ b/solvers/vae.py @@ -1,9 +1,9 @@ from benchopt import BaseSolver import torch -import numpy as np from pyod.models.vae import VAE +from benchmark_utils.predictions import cutoff_scores from benchmark_utils.windowing import make_windows @@ -11,7 +11,7 @@ class Solver(BaseSolver): name = "VAE" install_cmd = "conda" - requirements = ["pip::pyod", "pip::torch"] + requirements = ["pyod", "pytorch"] sampling_strategy = "run_once" @@ -26,12 +26,11 @@ class Solver(BaseSolver): "latent_dim": [2, 5, 10], "batch_norm": [True], "dropout_rate": [0.1, 0.2, 0.5], + "cutoff": [None], } test_config = { - 'solver': { - "n_epochs": 1, - "window_size": 16, - } + "n_epochs": 1, + "window_size": 16, } def set_objective(self, X_train, X_test): @@ -70,7 +69,14 @@ def set_objective(self, X_train, X_test): def run(self, _): self.clf.fit(self.Xw_train) - self.y_pred = self.clf.predict(self.Xw_test) + self.anomaly_scores = self.clf.decision_function(self.Xw_test) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) def get_result(self): - return dict(y_hat=self.y_pred) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/vanilla-transformer.py b/solvers/vanilla-transformer.py index 60b06be..11f91dd 100644 --- a/solvers/vanilla-transformer.py +++ b/solvers/vanilla-transformer.py @@ -11,13 +11,14 @@ from benchmark_utils.models import TransformerModel from benchmark_utils.windowing import make_windowed_dataset from benchmark_utils.windowing import reconstruct_from_windows +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): name = "Transformer" install_cmd = "conda" - requirements = ["pip::torch", "tqdm"] + requirements = ["pytorch", "tqdm"] sampling_strategy = "run_once" @@ -31,13 +32,11 @@ class Solver(BaseSolver): "horizon": [1], "window_size": [256], "stride": [1], - "percentile": [97], + "cutoff": [None], } test_config = { - 'solver': { - "n_epochs": 1, - "window_size": 16, - } + "n_epochs": 1, + "window_size": 16, } def set_objective(self, X_train, X_test): @@ -113,7 +112,9 @@ def run(self, _): total_loss += loss.item() avg_loss = total_loss / (len(self.Xw_train) // self.batch_size) - ti.set_description(f"Epoch {epoch} (loss={avg_loss:.5e})") + ti.set_description( + f"Epoch {epoch} (loss={avg_loss:.5e})" + ) # Learning rate scheduling self.scheduler.step(avg_loss) @@ -122,7 +123,6 @@ def run(self, _): if avg_loss < best_loss: best_loss = avg_loss no_improve = 0 - torch.save(self.model.state_dict(), 'best_model.pth') else: no_improve += 1 if no_improve == patience: @@ -148,21 +148,22 @@ def run(self, _): n_features=self.X_test.shape[1] ) - # Calculating the percentile value for the threshold - percentile_value = np.percentile( - np.abs(self.X_test[..., self.window_size:] - - x_hat[..., self.window_size:]), - self.percentile + reconstruction_err = np.abs( + self.X_test[..., self.window_size:] - x_hat[..., self.window_size:] ) - - # Thresholding - predictions = np.zeros_like(self.X_test)-1 - predictions[..., self.window_size:] = np.where( - np.abs(self.X_test[..., self.window_size:] - - x_hat[..., self.window_size:]) > percentile_value, 1, 0 + self.anomaly_scores = np.full( + self.X_test.shape[:1] + self.X_test.shape[2:], + np.nan, + dtype=float, + ) + self.anomaly_scores[..., self.window_size:] = np.max( + reconstruction_err, axis=1 ) - self.predictions = np.max(predictions, axis=1) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) def skip(self, X_train, X_test): if X_train.shape[-1] < self.window_size + self.horizon: @@ -170,4 +171,7 @@ def skip(self, X_train, X_test): return False, None def get_result(self): - return dict(y_hat=self.predictions) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/test_config.py b/test_config.py index fd149be..3606a74 100644 --- a/test_config.py +++ b/test_config.py @@ -1,10 +1,21 @@ import sys # noqa: F401 +from importlib.util import find_spec import pytest # noqa: F401 from benchopt.utils.sys_info import get_cuda_version +OPTIONAL_BACKEND_INSTALL_XFAILS = { + "dagmm": "DAGMM depends on the optional salesforce-merlion package.", + "mp": "MP depends on the optional TSB-AD package.", + "rosecdl": "RoseCDL depends on an optional GitHub package.", + "tsb-chronos": "TSB-Chronos depends on the optional TSB-AD backend.", + "tsb-timesfm": "TSB-TimesFM depends on TSB-AD and timesfm.", + "tsb-timesnet": "TSB-TimesNet depends on the optional TSB-AD backend.", +} + + def check_test_solver_install(benchmark, solver_class): """Hook called in `test_solver_install`. @@ -12,11 +23,16 @@ def check_test_solver_install(benchmark, solver_class): particular architecture, call pytest.xfail when detecting the situation. """ - if solver_class.name.lower() == "dif": + solver_name = solver_class.name.lower() + + if solver_name in OPTIONAL_BACKEND_INSTALL_XFAILS: + pytest.xfail(OPTIONAL_BACKEND_INSTALL_XFAILS[solver_name]) + + if solver_name == "dif": if get_cuda_version() is None: pytest.xfail("Deep IsolationForest needs a working GPU hardware.") - if solver_class.name.lower() == "anomalybert": + if solver_name == "anomalybert": pytest.xfail("AnomalyBERT needs to be installed locally from repo" " at https://github.com/Jhryu30/AnomalyBERT.git") @@ -29,11 +45,20 @@ def check_test_solver_install(benchmark, solver_class): # pytest.xfail("Transformer needs a working GPU hardware.") -def check_test_dataset_get_data(benchmark, data_class): - if data_class.name.lower() in [ +def check_test_solver_run(benchmark, solver_class): + """Hook called in `test_solver_run`.""" + if solver_class.name.lower() == "tsb-timesfm": + if find_spec("timesfm") is None: + pytest.xfail( + "TSB-TimesFM needs the optional timesfm package." + ) + + +def check_test_dataset_get_data(benchmark, dataset_class): + if dataset_class.name.lower() in [ "daphnet", "dodgers", "ecg", "genesis", "ghl", "iops", "kdd21", "mgab", "mitdb", "nab", "occupancy", "opportunity", "sensorscope", "smd", "svdb", "yahoo" ]: - pytest.xfail(f"{data_class.name} dataset is not downloaded.") + pytest.xfail(f"{dataset_class.name} dataset is not downloaded.") diff --git a/tests/test_objective.py b/tests/test_objective.py new file mode 100644 index 0000000..6a50afc --- /dev/null +++ b/tests/test_objective.py @@ -0,0 +1,110 @@ +import numpy as np +import pytest + +from objective import Objective + + +def make_objective(score_metrics=("auc_pr", "auc_roc"), + prediction_metrics=None): + objective = Objective() + objective.score_metrics = score_metrics + objective.prediction_metrics = prediction_metrics + objective.set_data( + X_train=np.empty((1, 1, 6)), + y_test=np.array([0, 0, 1, 0, 1, 0]), + X_test=np.empty((1, 1, 6)), + ) + return objective + + +def test_default_evaluation_uses_score_metrics_only(): + objective = make_objective() + scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2]) + + result = objective.evaluate_result(anomaly_scores=scores) + + assert result["auc_pr"] == pytest.approx(1.0) + assert result["auc_roc"] == pytest.approx(1.0) + assert result["value"] == pytest.approx(0.0) + assert "precision" not in result + + +def test_score_and_prediction_metrics_use_canonical_keys(): + objective = make_objective( + score_metrics=("auc_pr",), + prediction_metrics=("precision",), + ) + scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2]) + predictions = np.array([0, 0, 1, 0, 1, 0]) + + result = objective.evaluate_result( + anomaly_scores=scores, + anomaly_predictions=predictions, + ) + + assert result["auc_pr"] == pytest.approx(1.0) + assert result["precision"] == pytest.approx(1.0) + + +def test_prediction_metrics_are_opt_in(): + objective = make_objective( + prediction_metrics=("precision", "recall", "f1", "zoloss"), + ) + scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2]) + predictions = np.array([0, 0, 1, 0, 1, 0]) + + result = objective.evaluate_result( + anomaly_scores=scores, + anomaly_predictions=predictions, + ) + + assert result["precision"] == pytest.approx(1.0) + assert result["recall"] == pytest.approx(1.0) + assert result["f1"] == pytest.approx(1.0) + assert result["zoloss"] == pytest.approx(0.0) + + +def test_prediction_metrics_require_prediction_array(): + objective = make_objective(prediction_metrics=("precision",)) + scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2]) + + with pytest.raises(ValueError, match="anomaly_predictions"): + objective.evaluate_result(anomaly_scores=scores) + + +def test_nan_score_padding_is_masked(): + objective = make_objective() + scores = np.array([np.nan, 0.2, 0.9, 0.1, 0.8, 0.2]) + + result = objective.evaluate_result(anomaly_scores=scores) + + assert result["auc_pr"] == pytest.approx(1.0) + assert result["auc_roc"] == pytest.approx(1.0) + + +def test_prediction_padding_is_masked(): + objective = make_objective( + score_metrics=None, + prediction_metrics=("precision", "recall", "f1"), + ) + predictions = np.array([-1, 0, 1, 0, 1, 0]) + + result = objective.evaluate_result(anomaly_predictions=predictions) + + assert result["precision"] == pytest.approx(1.0) + assert result["recall"] == pytest.approx(1.0) + assert result["f1"] == pytest.approx(1.0) + assert result["value"] == pytest.approx(0.0) + + +def test_prediction_only_metrics_without_primary_value_fallback_to_zero(): + objective = make_objective( + score_metrics=None, + prediction_metrics=("precision",), + ) + predictions = np.array([0, 0, 1, 0, 1, 0]) + + result = objective.evaluate_result(anomaly_predictions=predictions) + + assert result["precision"] == pytest.approx(1.0) + assert result["value"] == pytest.approx(0.0) diff --git a/tests/test_predictions.py b/tests/test_predictions.py new file mode 100644 index 0000000..1e8f9b7 --- /dev/null +++ b/tests/test_predictions.py @@ -0,0 +1,33 @@ +import numpy as np +import pytest + +from benchmark_utils.predictions import cutoff_scores + + +def test_cutoff_scores_returns_none_without_cutoff(): + scores = np.array([0.1, 0.8, 0.2]) + + assert cutoff_scores(scores) is None + + +def test_cutoff_scores_uses_top_score_fraction(): + scores = np.array([0.1, 0.8, 0.2, 0.9]) + + predictions = cutoff_scores(scores, cutoff=0.25) + + np.testing.assert_array_equal(predictions, np.array([0, 0, 0, 1])) + + +def test_cutoff_scores_preserves_nan_padding_as_ignore_label(): + scores = np.array([np.nan, 0.1, 0.8, 0.2, 0.9]) + + predictions = cutoff_scores(scores, cutoff=0.25) + + np.testing.assert_array_equal(predictions, np.array([-1, 0, 0, 0, 1])) + + +def test_cutoff_scores_rejects_invalid_cutoff(): + scores = np.array([0.1, 0.8, 0.2]) + + with pytest.raises(ValueError, match="must be in"): + cutoff_scores(scores, cutoff=1) From 0a12847c0770057ba4a0a22cb6e4ebe5715f4aae Mon Sep 17 00:00:00 2001 From: Jad Yehya Date: Tue, 19 May 2026 09:11:31 +0200 Subject: [PATCH 42/50] FIX Autoencoder device handling --- benchmark_utils/models.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/benchmark_utils/models.py b/benchmark_utils/models.py index ad38592..a2b4f2a 100644 --- a/benchmark_utils/models.py +++ b/benchmark_utils/models.py @@ -294,7 +294,7 @@ def fit( return losses - def predict(self, X_test, X_dirty=None, device="cuda"): + def predict(self, X_test, X_dirty=None, device=None): """ Predict anomaly scores for time series data. @@ -306,6 +306,10 @@ def predict(self, X_test, X_dirty=None, device="cuda"): Returns: Reconstructed data and sets decision_scores_ attribute """ + if device is None: + device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") + self.eval() self.to(device) @@ -349,7 +353,7 @@ def predict(self, X_test, X_dirty=None, device="cuda"): return test_predict - def encode_data(self, x, device="cuda"): + def encode_data(self, x, device=None): """ Encode input data to latent representation. @@ -360,6 +364,10 @@ def encode_data(self, x, device="cuda"): Returns: Encoded data as numpy array """ + if device is None: + device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") + self.eval() self.to(device) From 840b23ec7cda45742a27eec3f665285e1c36a937 Mon Sep 17 00:00:00 2001 From: Jad Yehya Date: Tue, 19 May 2026 17:40:14 +0200 Subject: [PATCH 43/50] Update pre-commit configuration --- .gitignore | 2 ++ .pre-commit-config.yaml | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 091a223..a90798c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__ __cache__ *.egg-info +*.pth .coverage **/outputs joblib/ @@ -19,3 +20,4 @@ coverage.xml # Data directories data/ exploratory/ +src/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a31f116..dca8836 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,13 +8,13 @@ repos: - id: check-json - repo: https://github.com/pycqa/flake8 - rev: 6.0.0 + rev: 7.3.0 hooks: - id: flake8 args: [--max-line-length=79] # Customize flake8 options here - - repo: https://github.com/pre-commit/mirrors-autopep8 - rev: v1.6.0 + - repo: https://github.com/hhatto/autopep8 + rev: v2.3.2 hooks: - id: autopep8 - args: [--max-line-length=79, --in-place] \ No newline at end of file + args: [--max-line-length=79, --in-place] From 092b5d7e07dc2cfe90c52599fb38e59258983fe7 Mon Sep 17 00:00:00 2001 From: Jad Yehya Date: Tue, 19 May 2026 17:44:46 +0200 Subject: [PATCH 44/50] ENH add download helper for TSB-UAD public dataset bundle --- benchmark_utils/download.py | 71 +++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 benchmark_utils/download.py diff --git a/benchmark_utils/download.py b/benchmark_utils/download.py new file mode 100644 index 0000000..e27af85 --- /dev/null +++ b/benchmark_utils/download.py @@ -0,0 +1,71 @@ +"""Shared download helper for the TSB-UAD public dataset bundle. +""" +from pathlib import Path + +from benchopt import config + + +_BUNDLE_URL = "https://www.thedatum.org/datasets/TSB-UAD-Public.zip" +_BUNDLE_SHA256 = ( + "ff4aa83a5a111835d410d962152e8dbebcda1039b778bae45b6b9c3f46dd49a1" +) +_BUNDLE_FILENAME = "TSB-UAD-Public.zip" +_BUNDLE_ROOT = "TSB-UAD-Public" + +# Map benchmark dataset name -> subdirectory inside the TSB-UAD bundle. +_SUBDIR = { + "DAPHNET": "Daphnet", + "DODGERS": "Dodgers", + "ECG": "ECG", + "GENESIS": "Genesis", + "GHL": "GHL", + "IOPS": "IOPS", + "KDD21": "KDD21", + "MGAB": "MGAB", + "MITDB": "MITDB", + "NAB": "NAB", + "OCCUPANCY": "Occupancy", + "OPPORTUNITY": "OPPORTUNITY", + "SENSORSCOPE": "SensorScope", + "SMD": "SMD", + "SVDB": "SVDB", + "YAHOO": "YAHOO", +} + + +def fetch_tsb_uad(name: str) -> Path: + """Return the local directory holding TSB-UAD's ``.out`` files for *name*. + + The bundle is downloaded once into + ``benchopt.config.get_data_path("TSB-UAD-Public")`` and extracted; + subsequent calls are cache hits. + """ + if name not in _SUBDIR: + raise KeyError( + f"{name!r} is not a TSB-UAD dataset name. " + f"Known names: {sorted(_SUBDIR)}" + ) + + import pooch # local import: only required when downloading + + cache_root = Path(config.get_data_path(key=_BUNDLE_ROOT)) + cache_root.mkdir(parents=True, exist_ok=True) + + registry = pooch.create( + path=cache_root, + base_url="https://www.thedatum.org/datasets/", + registry={_BUNDLE_FILENAME: f"sha256:{_BUNDLE_SHA256}"}, + urls={_BUNDLE_FILENAME: _BUNDLE_URL}, + ) + registry.fetch( + _BUNDLE_FILENAME, + processor=pooch.Unzip(extract_dir="."), + progressbar=True, + ) + + subdir = cache_root / _BUNDLE_ROOT / _SUBDIR[name] + if not subdir.exists(): + raise FileNotFoundError( + f"Expected {subdir} after extracting the TSB-UAD bundle." + ) + return subdir From ce6724f432aa3b1005ca269931336e58a95a6a89 Mon Sep 17 00:00:00 2001 From: Jad Yehya Date: Tue, 19 May 2026 17:45:09 +0200 Subject: [PATCH 45/50] ENH update dataset loading to use fetch_tsb_uad for DODGERS, MITDB, and NAB. --- datasets/dodgers.py | 10 +++++++--- datasets/mitdb.py | 10 +++++++--- datasets/nab.py | 10 +++++++--- test_config.py | 4 ++-- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/datasets/dodgers.py b/datasets/dodgers.py index f3c6879..8d3b7a2 100644 --- a/datasets/dodgers.py +++ b/datasets/dodgers.py @@ -1,10 +1,10 @@ -from benchopt import BaseDataset, config +from benchopt import BaseDataset from pathlib import Path import numpy as np import pandas as pd -PATH = config.get_data_path("DODGERS") +from benchmark_utils.download import fetch_tsb_uad def load_data(db_path, record_ids=None, verbose=False): @@ -90,6 +90,8 @@ def load_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "DODGERS" + requirements = ["pip:pooch"] + parameters = { # "recordings_id": [["101"]], "recordings_id": [None], @@ -99,11 +101,13 @@ class Dataset(BaseDataset): def get_data(self): """Load the DODGERS dataset.""" + path = fetch_tsb_uad("DODGERS") + # X shape (n_recordings, n_samples) # y shape (n_recordings, n_samples) if self.recordings_id in (["all"], "all"): self.recordings_id = None - X, y_true = load_data(PATH, self.recordings_id) + X, y_true = load_data(path, self.recordings_id) X_test = X.copy() y_test = y_true.copy() diff --git a/datasets/mitdb.py b/datasets/mitdb.py index 7f811d0..5cf8668 100644 --- a/datasets/mitdb.py +++ b/datasets/mitdb.py @@ -1,10 +1,10 @@ -from benchopt import BaseDataset, config +from benchopt import BaseDataset from pathlib import Path import numpy as np import pandas as pd -PATH = config.get_data_path("MITDB") +from benchmark_utils.download import fetch_tsb_uad def load_mitdb_data(db_path, record_ids=None, verbose=False): @@ -102,6 +102,8 @@ def load_mitdb_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "MITDB" + requirements = ["pip:pooch"] + parameters = { "recordings_id": [["100", "201", "109", "105", "111", "221"]], "debug": [False], @@ -110,11 +112,13 @@ class Dataset(BaseDataset): def get_data(self): """Load the MITDB dataset.""" + path = fetch_tsb_uad("MITDB") + # X shape (n_recordings, n_samples) # y shape (n_recordings, n_samples) if self.recordings_id in (["all"], "all"): self.recordings_id = None - X, y_true = load_mitdb_data(PATH, self.recordings_id) + X, y_true = load_mitdb_data(path, self.recordings_id) X_test = X.copy() y_test = y_true.copy() diff --git a/datasets/nab.py b/datasets/nab.py index 20a0960..88b1d0f 100644 --- a/datasets/nab.py +++ b/datasets/nab.py @@ -1,10 +1,10 @@ -from benchopt import BaseDataset, config +from benchopt import BaseDataset from pathlib import Path import numpy as np import pandas as pd -PATH = config.get_data_path("NAB") +from benchmark_utils.download import fetch_tsb_uad def load_data(db_path, record_ids=None, verbose=False): @@ -88,6 +88,8 @@ def load_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "NAB" + requirements = ["pip:pooch"] + parameters = { "recordings_id": [["art0"], ["art1"], ["CloudWatch"]], "debug": [False], @@ -96,9 +98,11 @@ class Dataset(BaseDataset): def get_data(self): """Load the NAB dataset.""" + path = fetch_tsb_uad("NAB") + # X shape (n_recordings, n_samples) # y shape (n_recordings, n_samples) - X, y_true = load_data(PATH, self.recordings_id) + X, y_true = load_data(path, self.recordings_id) X_test = X.copy() y_test = y_true.copy() diff --git a/test_config.py b/test_config.py index 3606a74..5076198 100644 --- a/test_config.py +++ b/test_config.py @@ -56,8 +56,8 @@ def check_test_solver_run(benchmark, solver_class): def check_test_dataset_get_data(benchmark, dataset_class): if dataset_class.name.lower() in [ - "daphnet", "dodgers", "ecg", "genesis", "ghl", - "iops", "kdd21", "mgab", "mitdb", "nab", + "daphnet", "ecg", "genesis", "ghl", + "iops", "kdd21", "mgab", "occupancy", "opportunity", "sensorscope", "smd", "svdb", "yahoo" ]: From 88639b61cc813fb0a6a920ec6af4abb681ae28c4 Mon Sep 17 00:00:00 2001 From: Jad Yehya Date: Tue, 19 May 2026 17:53:40 +0200 Subject: [PATCH 46/50] FIX "::" instead of ":" for pip channel --- datasets/dodgers.py | 2 +- datasets/mitdb.py | 2 +- datasets/nab.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datasets/dodgers.py b/datasets/dodgers.py index 8d3b7a2..4f41e37 100644 --- a/datasets/dodgers.py +++ b/datasets/dodgers.py @@ -90,7 +90,7 @@ def load_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "DODGERS" - requirements = ["pip:pooch"] + requirements = ["pip::pooch"] parameters = { # "recordings_id": [["101"]], diff --git a/datasets/mitdb.py b/datasets/mitdb.py index 5cf8668..b7abebd 100644 --- a/datasets/mitdb.py +++ b/datasets/mitdb.py @@ -102,7 +102,7 @@ def load_mitdb_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "MITDB" - requirements = ["pip:pooch"] + requirements = ["pip::pooch"] parameters = { "recordings_id": [["100", "201", "109", "105", "111", "221"]], diff --git a/datasets/nab.py b/datasets/nab.py index 88b1d0f..dc1e970 100644 --- a/datasets/nab.py +++ b/datasets/nab.py @@ -88,7 +88,7 @@ def load_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "NAB" - requirements = ["pip:pooch"] + requirements = ["pip::pooch"] parameters = { "recordings_id": [["art0"], ["art1"], ["CloudWatch"]], From f4705dd563799181f9f7ab53dc265f550a8506ce Mon Sep 17 00:00:00 2001 From: Jad Yehya Date: Tue, 19 May 2026 18:02:13 +0200 Subject: [PATCH 47/50] trying to install pooch without pip --- datasets/dodgers.py | 2 +- datasets/mitdb.py | 2 +- datasets/nab.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datasets/dodgers.py b/datasets/dodgers.py index 4f41e37..0dceb20 100644 --- a/datasets/dodgers.py +++ b/datasets/dodgers.py @@ -90,7 +90,7 @@ def load_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "DODGERS" - requirements = ["pip::pooch"] + requirements = ["pooch"] parameters = { # "recordings_id": [["101"]], diff --git a/datasets/mitdb.py b/datasets/mitdb.py index b7abebd..d0ce342 100644 --- a/datasets/mitdb.py +++ b/datasets/mitdb.py @@ -102,7 +102,7 @@ def load_mitdb_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "MITDB" - requirements = ["pip::pooch"] + requirements = ["pooch"] parameters = { "recordings_id": [["100", "201", "109", "105", "111", "221"]], diff --git a/datasets/nab.py b/datasets/nab.py index dc1e970..d56a291 100644 --- a/datasets/nab.py +++ b/datasets/nab.py @@ -88,7 +88,7 @@ def load_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "NAB" - requirements = ["pip::pooch"] + requirements = ["pooch"] parameters = { "recordings_id": [["art0"], ["art1"], ["CloudWatch"]], From 25ed19a92d918687f1bc92f94e714603ab34b8e1 Mon Sep 17 00:00:00 2001 From: Jad Yehya Date: Tue, 19 May 2026 18:13:16 +0200 Subject: [PATCH 48/50] Removed pooch requirements from solvers to objective. --- datasets/dodgers.py | 2 -- datasets/mitdb.py | 2 -- datasets/nab.py | 2 -- objective.py | 2 +- 4 files changed, 1 insertion(+), 7 deletions(-) diff --git a/datasets/dodgers.py b/datasets/dodgers.py index 0dceb20..1ff52dd 100644 --- a/datasets/dodgers.py +++ b/datasets/dodgers.py @@ -90,8 +90,6 @@ def load_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "DODGERS" - requirements = ["pooch"] - parameters = { # "recordings_id": [["101"]], "recordings_id": [None], diff --git a/datasets/mitdb.py b/datasets/mitdb.py index d0ce342..cce6810 100644 --- a/datasets/mitdb.py +++ b/datasets/mitdb.py @@ -102,8 +102,6 @@ def load_mitdb_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "MITDB" - requirements = ["pooch"] - parameters = { "recordings_id": [["100", "201", "109", "105", "111", "221"]], "debug": [False], diff --git a/datasets/nab.py b/datasets/nab.py index d56a291..b1de8c4 100644 --- a/datasets/nab.py +++ b/datasets/nab.py @@ -88,8 +88,6 @@ def load_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "NAB" - requirements = ["pooch"] - parameters = { "recordings_id": [["art0"], ["art1"], ["CloudWatch"]], "debug": [False], diff --git a/objective.py b/objective.py index 04dbde5..07165d5 100644 --- a/objective.py +++ b/objective.py @@ -26,7 +26,7 @@ class Objective(BaseObjective): name = "Anomaly detection" install_cmd = "conda" - requirements = ["scikit-learn"] + requirements = ["scikit-learn", "pip:pooch"] parameters = { "score_metrics": [("auc_pr", "auc_roc")], From 0d67dc56a237874a5d107f18dcd6e94cd7b67531 Mon Sep 17 00:00:00 2001 From: Jad Yehya Date: Tue, 19 May 2026 19:44:17 +0200 Subject: [PATCH 49/50] ENH add progress bar support for dataset downloads and update requirements for DODGERS, MITDB, and NAB datasets --- benchmark_utils/download.py | 8 +++++++- datasets/dodgers.py | 2 ++ datasets/mitdb.py | 2 ++ datasets/nab.py | 2 ++ objective.py | 2 +- 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/benchmark_utils/download.py b/benchmark_utils/download.py index e27af85..becb2c9 100644 --- a/benchmark_utils/download.py +++ b/benchmark_utils/download.py @@ -48,6 +48,12 @@ def fetch_tsb_uad(name: str) -> Path: import pooch # local import: only required when downloading + try: + import tqdm # noqa: F401 + progressbar = True + except ImportError: + progressbar = False + cache_root = Path(config.get_data_path(key=_BUNDLE_ROOT)) cache_root.mkdir(parents=True, exist_ok=True) @@ -60,7 +66,7 @@ def fetch_tsb_uad(name: str) -> Path: registry.fetch( _BUNDLE_FILENAME, processor=pooch.Unzip(extract_dir="."), - progressbar=True, + progressbar=progressbar, ) subdir = cache_root / _BUNDLE_ROOT / _SUBDIR[name] diff --git a/datasets/dodgers.py b/datasets/dodgers.py index 1ff52dd..8d3b7a2 100644 --- a/datasets/dodgers.py +++ b/datasets/dodgers.py @@ -90,6 +90,8 @@ def load_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "DODGERS" + requirements = ["pip:pooch"] + parameters = { # "recordings_id": [["101"]], "recordings_id": [None], diff --git a/datasets/mitdb.py b/datasets/mitdb.py index cce6810..5cf8668 100644 --- a/datasets/mitdb.py +++ b/datasets/mitdb.py @@ -102,6 +102,8 @@ def load_mitdb_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "MITDB" + requirements = ["pip:pooch"] + parameters = { "recordings_id": [["100", "201", "109", "105", "111", "221"]], "debug": [False], diff --git a/datasets/nab.py b/datasets/nab.py index b1de8c4..88b1d0f 100644 --- a/datasets/nab.py +++ b/datasets/nab.py @@ -88,6 +88,8 @@ def load_data(db_path, record_ids=None, verbose=False): class Dataset(BaseDataset): name = "NAB" + requirements = ["pip:pooch"] + parameters = { "recordings_id": [["art0"], ["art1"], ["CloudWatch"]], "debug": [False], diff --git a/objective.py b/objective.py index 07165d5..04dbde5 100644 --- a/objective.py +++ b/objective.py @@ -26,7 +26,7 @@ class Objective(BaseObjective): name = "Anomaly detection" install_cmd = "conda" - requirements = ["scikit-learn", "pip:pooch"] + requirements = ["scikit-learn"] parameters = { "score_metrics": [("auc_pr", "auc_roc")], From 48b238cbc97733ae5b40827f632108c163761adf Mon Sep 17 00:00:00 2001 From: Jad Yehya Date: Wed, 20 May 2026 18:09:25 +0200 Subject: [PATCH 50/50] ENH revert missing datasets to xfail check in dataset retrieval --- test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_config.py b/test_config.py index 5076198..cd15317 100644 --- a/test_config.py +++ b/test_config.py @@ -59,6 +59,6 @@ def check_test_dataset_get_data(benchmark, dataset_class): "daphnet", "ecg", "genesis", "ghl", "iops", "kdd21", "mgab", "occupancy", "opportunity", "sensorscope", "smd", - "svdb", "yahoo" + "svdb", "yahoo", "nab", "mitdb", "dodgers", ]: pytest.xfail(f"{dataset_class.name} dataset is not downloaded.")