diff --git a/benchmark_utils/__init__.py b/benchmark_utils/__init__.py index 5f8fa37..eaeac2f 100644 --- a/benchmark_utils/__init__.py +++ b/benchmark_utils/__init__.py @@ -3,11 +3,9 @@ # name `benchmark_utils`, and code defined inside will be importable using # the usual import syntax -from benchopt import safe_import_context from pathlib import Path -with safe_import_context() as import_ctx: - import numpy as np +import numpy as np def mean_overlaping_pred(predictions, stride): @@ -24,7 +22,9 @@ def mean_overlaping_pred(predictions, stride): np.ndarray: Averaged predictions for each feature. """ n_windows, H, n_features = predictions.shape - total_length = (n_windows-1) * stride + H - 1 + # The last window starts at (n_windows-1)*stride and covers H samples, so + # the reconstructed signal spans (n_windows-1)*stride + H positions. + total_length = (n_windows - 1) * stride + H # Array to store accumulated predictions for each feature accumulated = np.zeros((total_length, n_features)) diff --git a/benchmark_utils/metrics.py b/benchmark_utils/metrics.py index 4607670..9bcad02 100644 --- a/benchmark_utils/metrics.py +++ b/benchmark_utils/metrics.py @@ -1,7 +1,24 @@ -from benchopt import safe_import_context +import numpy as np -with safe_import_context() as import_ctx: - import numpy as np + +def _dilate(mask: np.ndarray, radius: int) -> np.ndarray: + """Binary dilation with a centered window of half-width ``radius``. + + ``out[i]`` is True iff any entry of ``mask`` in ``[i-radius, i+radius]`` + (clipped to the array) is truthy. Matches the half-open slice + ``mask[max(0, i-r):min(n, i+r+1)]`` used by the soft metrics. + """ + mask = np.asarray(mask) + n = mask.shape[0] + if n == 0: + return np.zeros(0, dtype=bool) + if radius <= 0: + return mask.astype(bool, copy=False) + cum = np.concatenate(([0], np.cumsum(mask.astype(np.int64)))) + idx = np.arange(n) + left = np.maximum(0, idx - radius) + right = np.minimum(n, idx + radius + 1) + return (cum[right] - cum[left]) > 0 def soft_precision(y_true: np.ndarray, @@ -35,47 +52,34 @@ def soft_precision(y_true: np.ndarray, fa : int Number of false anomalies """ - # EM : Exact Match - em = 0 - # DA : Detected Anomaly - da = 0 - # FA : False Anomaly - fa = 0 - - # TFDIR = (EM + DA) / (EM + DA + FA) + y_true = np.asarray(y_true) + y_pred = np.asarray(y_pred) - # Counting exact matches - for i in range(len(y_true)): - if y_true[i] == 1 and (y_true[i] == y_pred[i]): - em += 1 + true_mask = y_true == 1 + pred_mask = y_pred == 1 - # False anomaly and detected anomalies - for i in range(len(y_true)): + # TFDIR = (EM + DA) / (EM + DA + FA) - left = max(0, i-detection_range) - right = min(len(y_true), i+detection_range+1) + # EM : Exact Match + em = int(np.sum(true_mask & pred_mask)) - if y_pred[i] == 1 and ( - y_true[left:right] == 0).all(): - fa += 1 + true_dil = _dilate(true_mask, detection_range) + pred_dil = _dilate(pred_mask, detection_range) - if y_true[i] == 1 and ( - y_pred[left:right] == 1).any(): - da += 1 + # DA : Detected Anomaly + fa = int(np.sum(pred_mask & ~true_dil)) + # FA : False Anomaly # Removing exact matches from detected anomalies because they are # counted twice - da -= em + da = int(np.sum(true_mask & pred_dil)) - em - if return_counts: - if em + da + fa == 0: - return 0, em, da, fa - - return (em + da) / (em + da + fa), em, da, fa + total = em + da + fa + score = (em + da) / total if total else 0 - if em + da + fa == 0: - return 0 - return (em + da) / (em + da + fa) + if return_counts: + return score, em, da, fa + return score def soft_recall(y_true: np.ndarray, @@ -104,46 +108,25 @@ def soft_recall(y_true: np.ndarray, ma : int Number of missed anomalies """ - # EM : Exact Match - em = 0 - # DA : Detected Anomaly - da = 0 - # MA : Missed Anomaly - ma = 0 - # DAIR = (EM + DA) / (EM + DA + MA) + y_true = np.asarray(y_true) + y_pred = np.asarray(y_pred) - # Counting exact matches - for i in range(len(y_true)): - if y_true[i] == 1 and (y_true[i] == y_pred[i]): - em += 1 + true_mask = y_true == 1 + pred_mask = y_pred == 1 - # Missing values and detected anomalies - for i in range(len(y_true)): + em = int(np.sum(true_mask & pred_mask)) - left = max(0, i-detection_range) - right = min(len(y_true), i+detection_range+1) + pred_dil = _dilate(pred_mask, detection_range) - if y_true[i] == 1 and ( - y_pred[left:right] == 0).all(): - ma += 1 + ma = int(np.sum(true_mask & ~pred_dil)) + da = int(np.sum(true_mask & pred_dil)) - em - if y_true[i] == 1 and ( - y_pred[left:right] == 1).any(): - da += 1 - - # Removing exact matches from detected anomalies because they are - # counted twice - da -= em + total = em + da + ma + score = (em + da) / total if total else 0 if return_counts: - if em + da + ma == 0: - return 0, em, da, ma - - return (em + da) / (em + da + ma), em, da, ma - - if em + da + ma == 0: - return 0 - return (em + da) / (em + da + ma) + return score, em, da, ma + return score def ctt(y_true: np.ndarray, y_pred: np.ndarray, return_signed: bool = False): @@ -240,22 +223,34 @@ def ttc(y_true: np.ndarray, y_pred: np.ndarray, return_signed: bool = False): return tot_dist / np.sum(y_true) -def soft_f1(precision, recall): +def soft_f1(precision, recall, detection_range=None): """ Calculate the F1 score from precision and recall. Parameters ---------- - precision : float + precision : float or np.ndarray Precision score - recall : float + recall : float or np.ndarray Recall score + detection_range : int, optional + If provided, ``precision`` and ``recall`` are interpreted as the + true and predicted label arrays used by ``soft_precision`` and + ``soft_recall``. Returns ------- f1 : float F1 score """ + if detection_range is not None: + precision_score = soft_precision( + precision, recall, detection_range=detection_range + ) + recall_score = soft_recall( + precision, recall, detection_range=detection_range) + precision, recall = precision_score, recall_score + if precision + recall == 0: return 0 return 2 * (precision * recall) / (precision + recall) @@ -280,21 +275,15 @@ def extract_anomaly_ranges(labels: list[int]): Each tuple represents a range (start_index, end_index) where anomalies are present. """ - ranges = [] - start = None - - for i, label in enumerate(labels): - if label == 1 and start is None: - start = i # Start of a new anomaly range - elif label == 0 and start is not None: - ranges.append((start, i - 1)) # End of the current anomaly range - start = None - - # Handle the case where the series ends with an anomaly - if start is not None: - ranges.append((start, len(labels) - 1)) - - return ranges + arr = np.asarray(labels) + if arr.size == 0: + return [] + binary = (arr == 1).astype(np.int8) + padded = np.concatenate(([0], binary, [0])) + diff = np.diff(padded) + starts = np.where(diff == 1)[0] + ends = np.where(diff == -1)[0] - 1 + return list(zip(starts.tolist(), ends.tolist())) def existence_reward(real_range, predicted_ranges): diff --git a/benchmark_utils/models.py b/benchmark_utils/models.py index 7bcf100..a2b4f2a 100644 --- a/benchmark_utils/models.py +++ b/benchmark_utils/models.py @@ -1,4 +1,10 @@ from torch import nn +from sklearn.preprocessing import MinMaxScaler +import torch +import torch.optim as optim +from torch.utils.data import DataLoader, Dataset +import numpy as np +from tqdm import tqdm class ARModel(nn.Module): @@ -122,3 +128,253 @@ def forward(self, x): x, (_, _) = self.decoder(x) return x + + +class SlidingWindowDataset(Dataset): + def __init__(self, data, window_size): + self.data = data + self.window_size = window_size + + def __len__(self): + return len(self.data) - self.window_size + 1 + + def __getitem__(self, idx): + window = self.data[idx:idx + self.window_size] + return window # Input and target are the same for autoencoder + + +class Autoencoder(nn.Module): + def __init__( + self, + input_size=32, + hidden_size=32, + latent_size=16, + sliding_window=10 + ): + super(Autoencoder, self).__init__() + + self.sliding_window = sliding_window + self.decision_scores_ = None + + # Encoder + self.encoder = nn.Sequential( + nn.Linear(input_size, hidden_size), + nn.ReLU(), + nn.BatchNorm1d(hidden_size), + nn.Linear(hidden_size, latent_size), + nn.ReLU(), + nn.BatchNorm1d(latent_size), + ) + + # Decoder + self.decoder = nn.Sequential( + nn.Linear(latent_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, input_size), + nn.ReLU(), + ) + + def forward(self, x): + # Flatten input if needed + x = x.view(x.size(0), -1) + + # Encode + encoded = self.encoder(x) + + # Decode + decoded = self.decoder(encoded) + + return decoded + + def encode(self, x): + x = x.view(x.size(0), -1) + return self.encoder(x) + + def _create_sliding_windows(self, X): + """Create sliding windows from input data""" + if isinstance(X, np.ndarray): + X = torch.from_numpy(X).float() + + # If X is 1D, reshape to 2D + if X.dim() == 1: + X = X.unsqueeze(1) + + windows = [] + for i in range(len(X) - self.sliding_window + 1): + window = X[i:i + self.sliding_window].flatten() + windows.append(window) + + return torch.stack(windows) + + def fit( + self, + X, + num_epochs=50, + learning_rate=1e-3, + device=None, + batch_size=32 + ): + """ + Train the autoencoder on the provided data. + + Args: + X: Input data tensor or numpy array shape (n_samples, n_features) + num_epochs: Number of training epochs + learning_rate: Learning rate for optimizer + device: Device to train on ('cuda' or 'cpu') + batch_size: Batch size for training + + Returns: + List of training losses per epoch + """ + if device is None: + device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu" + ) + + # Convert to tensor if numpy array + if isinstance(X, np.ndarray): + X = torch.from_numpy(X).float() + + # Ensure X is 2D + if X.dim() == 1: + X = X.unsqueeze(1) + if X.dim() == 3: + # (n_samples, n_timesteps, n_features) + X = X.view(-1, 1) + + # Create sliding windows + windowed_data = self._create_sliding_windows(X) + + # Create dataset and dataloader + # window_size=1 since we already created windows + dataset = SlidingWindowDataset(windowed_data, window_size=1) + dataloader = DataLoader( + dataset, batch_size=batch_size, shuffle=True, drop_last=True) + + self.to(device) + criterion = nn.MSELoss() + optimizer = optim.Adam(self.parameters(), lr=learning_rate) + + self.train() + losses = [] + + # Progress bar for epochs + epoch_pbar = tqdm(range(num_epochs), desc="Training", unit="epoch") + + for epoch in epoch_pbar: + epoch_loss = 0.0 + + # Progress bar for batches + batch_pbar = tqdm( + dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False) + + for batch_idx, (data) in enumerate(batch_pbar): + data = data.to(device) + + # Forward pass + output = self(data) + loss = criterion(output, data) + + # Backward pass + optimizer.zero_grad() + loss.backward() + optimizer.step() + + epoch_loss += loss.item() + + # Update batch progress bar + batch_pbar.set_postfix({"Batch Loss": f"{loss.item():.4f}"}) + + avg_loss = epoch_loss / len(dataloader) + losses.append(avg_loss) + + # Update epoch progress bar + epoch_pbar.set_postfix({"Avg Loss": f"{avg_loss:.4f}"}) + + return losses + + def predict(self, X_test, X_dirty=None, device=None): + """ + Predict anomaly scores for time series data. + + Args: + X_test: Test data for reconstruction + X_dirty: Original dirty data (if None, uses X_test) + device: Device to run inference on + + Returns: + Reconstructed data and sets decision_scores_ attribute + """ + if device is None: + device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") + + self.eval() + self.to(device) + + # Create sliding windows for test data + if isinstance(X_test, np.ndarray): + X_test = torch.from_numpy(X_test).float() + + windowed_test = self._create_sliding_windows(X_test) + windowed_test = windowed_test.to(device) + + with torch.no_grad(): + test_predict = self(windowed_test).cpu().numpy() + + # Calculate MAE loss + test_mae_loss = np.mean( + np.abs(test_predict - windowed_test.cpu().numpy()), axis=1) + + # Normalize MAE loss + nor_test_mae_loss = MinMaxScaler().fit_transform( + test_mae_loss.reshape(-1, 1)).ravel() + + # Use X_dirty if provided, otherwise use original X_test + if X_dirty is None: + if isinstance(X_test, torch.Tensor): + X_dirty = X_test.cpu().numpy() + else: + X_dirty = X_test + + # Initialize score array + score = np.zeros(len(X_dirty)) + + # Fill the score array with sliding window approach + score[self.sliding_window // 2:self.sliding_window // + 2 + len(test_mae_loss)] = nor_test_mae_loss + score[:self.sliding_window // 2] = nor_test_mae_loss[0] + score[self.sliding_window // 2 + + len(test_mae_loss):] = nor_test_mae_loss[-1] + + # Store decision scores + self.decision_scores_ = score + + return test_predict + + def encode_data(self, x, device=None): + """ + Encode input data to latent representation. + + Args: + x: Input tensor or numpy array + device: Device to run inference on + + Returns: + Encoded data as numpy array + """ + if device is None: + device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") + + self.eval() + self.to(device) + + # Convert to tensor if numpy array + if isinstance(x, np.ndarray): + x = torch.from_numpy(x).float() + x = x.to(device) + with torch.no_grad(): + encoded = self.encode(x) + return encoded.cpu().numpy() diff --git a/benchmark_utils/predictions.py b/benchmark_utils/predictions.py new file mode 100644 index 0000000..7517dbc --- /dev/null +++ b/benchmark_utils/predictions.py @@ -0,0 +1,34 @@ +import numpy as np + + +def cutoff_scores(anomaly_scores, cutoff=None): + """Turn anomaly scores into binary predictions using a contamination rate. + + Larger scores are assumed to be more anomalous. NaN entries are preserved + as ``-1`` ignore labels so they are masked by the objective. + """ + if cutoff is None: + return None + + validate_cutoff(cutoff) + + scores = np.asarray(anomaly_scores) + predictions = np.full(scores.shape, -1, dtype=int) + valid = ~np.isnan(scores) + if not np.any(valid): + return predictions + + threshold = np.quantile(scores[valid], 1 - cutoff) + + predictions[valid] = (scores[valid] >= threshold).astype(int) + return predictions + + +def validate_cutoff(cutoff): + if cutoff is None: + raise ValueError("cutoff must be provided.") + if not 0 < cutoff < 1: + raise ValueError( + "cutoff must be in (0, 1), " + f"got {cutoff!r}." + ) diff --git a/benchmark_utils/windowing.py b/benchmark_utils/windowing.py new file mode 100644 index 0000000..362135b --- /dev/null +++ b/benchmark_utils/windowing.py @@ -0,0 +1,156 @@ +import numpy as np +import torch +from torch.utils.data import TensorDataset + + +def find_period_length(data, default=125): + """Estimate a reasonable period length from autocorrelation. + + This local helper replaces the small ``TSB_AD`` utility previously used by + several solvers, avoiding a heavy optional dependency for solvers that only + need automatic window sizing. + """ + data = np.asarray(data) + if data.ndim > 1: + return 0 + + data = data[: min(20_000, len(data))] + if len(data) < 6: + return 0 + + centered = data - data.mean() + norm = np.dot(centered, centered) + if norm == 0: + return default + + max_lag = min(400, len(centered) - 1) + autocorr = np.correlate(centered, centered, mode="full") + autocorr = autocorr[len(centered) - 1: len(centered) + max_lag] / norm + + base = 3 + values = autocorr[base:] + if len(values) < 3: + return default + + local_max = ( + np.where((values[1:-1] > values[:-2]) & + (values[1:-1] > values[2:]))[0] + 1 + ) + + if len(local_max) == 0: + return default + + lag = local_max[np.argmax(values[local_max])] + base + if lag < 3 or lag > 300: + return default + return int(lag) + + +def make_windows(X, window_size=32, stride=1, padding=False): + """Create a windowed view of the data. + + Parameters + ---------- + X : np.ndarray + Input data of shape (n_samples, n_features, n_times). + window_size : int + Size of the sliding window. + stride : int + Stride of the sliding window. + + Returns + ------- + windows : np.ndarray + A windowed view of the data in shape: + (n_eff_samples, window_size, n_features) + """ + + if padding: + n_samples, n_features, n_times = X.shape + n_pad = (window_size - stride + n_times % stride) % stride + pad_width = ((0, 0), (0, 0), (0, n_pad)) + X = np.pad(X, pad_width=pad_width, mode='constant') + + return np.lib.stride_tricks.sliding_window_view( + X, window_shape=window_size, axis=-1 + )[..., ::stride, :].transpose(0, 2, 1, 3).reshape( + -1, X.shape[1], window_size + ).transpose(0, 2, 1) + + +def make_windowed_dataset(X, y=None, window_size=32, stride=1): + """ + Create a DataLoader with windowed views of the data. + + Parameters + ---------- + X : np.ndarray + Input data of shape (n_samples, n_features, n_times). + y : np.ndarray, optional + Target data of shape (n_samples, n_times). + window_size : int + Size of the sliding window. + stride : int + Stride of the sliding window. + + Returns + ------- + Dataset + A PyTorch Dataset with windowed data in shape: + (n_eff_samples, window_size, n_features) + """ + + if window_size is not None: + X = make_windows(X, window_size, stride) + + X_tensor = torch.tensor(X, dtype=torch.float32) + + if y is not None: + if window_size is not None: + y = np.lib.stride_tricks.sliding_window_view( + y, window_shape=window_size, axis=-1 + )[..., ::stride, :].reshape(-1, window_size) + + y_tensor = torch.tensor(y, dtype=torch.float32) + dataset = TensorDataset(X_tensor, y_tensor) + else: + dataset = TensorDataset(X_tensor) + + return dataset + + +def reconstruct_from_windows(windows, stride, batch, n_features): + """Reconstruct the original signal from overlapping windows + + Parameters + ---------- + windows : np.ndarray + The overlapping windows of shape (batch*n_windows, window_size, n_feat) + stride : int + The stride used to create the windows + batch : int + The batch size used when creating the windows + n_features : int + The number of features in the original signal + """ + # windows: (batch*n_windows, window_size, n_features) + w = windows.shape[1] + windows = windows.reshape(batch, -1, w, n_features) + b, nw, ws, nf = windows.shape + nt = (nw - 1) * stride + ws + + # allocate accumulator + counts for correct overlap averaging + acc = np.zeros((b, nf, nt)) + cnt = np.zeros((nt,), dtype=int) + + # build index map for overlap positions + idx = np.arange(ws)[:, None] + stride * np.arange(nw) + + # add windows efficiently + np.add.at(acc, (slice(None), slice(None), idx.ravel()), + windows.transpose(0, 3, 1, 2).reshape(b, nf, -1)) + + # count contributions + np.add.at(cnt, idx.ravel(), 1) + + return acc / cnt diff --git a/datasets/daphnet.py b/datasets/daphnet.py new file mode 100644 index 0000000..e2d42bc --- /dev/null +++ b/datasets/daphnet.py @@ -0,0 +1,158 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +PATH = config.get_data_path("DAPHNET") + + +def load_data(db_path, record_ids=None, verbose=False, number=-1): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + verbose: If True, print loading progress information. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is not None and number > 0: + print( + "Warning: 'number' parameter is " + "ignored when 'record_ids' is provided." + ) + + if record_ids is None: + # Get all available record files with .test.csv@X.out pattern + record_files = list(db_path.glob("*.test.csv@*.out")) + record_ids = [f.name.split(".")[0] for f in record_files] + if number > 0: + record_ids = record_ids[:number] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Find all files matching the pattern for the given record_id + record_files = list(db_path.glob(f"{record_id}.test.csv@*.out")) + + if not record_files: + if verbose: + print(f"No record files found for ID: {record_id}") + continue + + for record_file in record_files: + if verbose: + print(f"Loading record file: {record_file}") + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print( + f"Insufficient columns " + f"for record file {record_file.name}" + ) + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "DAPHNET" + + parameters = { + # "recordings_id": [["S01R02E0"]], + "recordings_id": [None], # [["S01R02E0"]], + "number": [-1], + "debug": [False], + } + + test_parameters = { + "debug": [True], + } + + def get_data(self): + """Load the DAPHNET dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + if self.recordings_id in (["all"], "all"): + self.recordings_id = None + X, y_true = load_data(PATH, self.recordings_id, number=self.number) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + plt.figure(figsize=(6, 3)) + plt.plot(X_train[0, 0, :500], linewidth=1.2) + plt.plot(range(297, 305), + X_train[0, 0, 297:305], color="orange", linewidth=3) + plt.title("Daphnet dataset") + plt.tight_layout() + plt.savefig("daphnet_example.png") + plt.close() + + print("PLOT SAVED") + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/dodgers.py b/datasets/dodgers.py new file mode 100644 index 0000000..f3c6879 --- /dev/null +++ b/datasets/dodgers.py @@ -0,0 +1,128 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("DODGERS") + + +def load_data(db_path, record_ids=None, verbose=False): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + verbose: If True, print loading progress information. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with freeway-traffic pattern + record_files = list(db_path.glob("*-freeway-traffic.test.out")) + record_ids = [f.name for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Handle direct filename or construct pattern + if record_id.endswith('-freeway-traffic.test.out'): + record_file = db_path / record_id + else: + record_file = db_path / f"{record_id}-freeway-traffic.test.out" + if record_file.exists(): + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for record {record_id}") + else: + if verbose: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "DODGERS" + + parameters = { + # "recordings_id": [["101"]], + "recordings_id": [None], + "debug": [False], + } + + def get_data(self): + """Load the DODGERS dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + if self.recordings_id in (["all"], "all"): + self.recordings_id = None + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/ecg.py b/datasets/ecg.py new file mode 100644 index 0000000..81f1986 --- /dev/null +++ b/datasets/ecg.py @@ -0,0 +1,133 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("ECG") + + +def load_data(db_path, record_ids=None, verbose=False, number=-1): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + verbose: If True, print loading progress information. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is not None and number > 0: + print("Warning: 'number' parameter is " + "ignored when 'record_ids' is provided.") + + if record_ids is None: + # Get all available record files + record_files = list(db_path.glob("*.out")) + record_ids = [f.stem for f in record_files] + + if "MBA_ECG14046_data" in record_ids: + record_ids.remove("MBA_ECG14046_data") + if verbose: + print("Removed MBA_ECG14046_data from records due to issues") + + if number > 0: + record_ids = record_ids[:number] + print(record_ids) + + data_list = [] + labels_list = [] + for record_id in record_ids: + record_file = db_path / f"{record_id}.out" + if record_file.exists(): + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for record {record_id}") + else: + if verbose: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append(np.pad( + data, + (0, max_length - len(data)), + mode='constant', + constant_values=data[-1]) + ) + padded_labels.append(np.pad( + labels, + (0, max_length - len(labels)), + mode='constant', + constant_values=0), + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "ECG" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + "number": [-1], + } + + def get_data(self): + """Load the MITDB dataset.""" + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + if self.recordings_id in (["all"], "all"): + self.recordings_id = None + X, y_true = load_data(PATH, self.recordings_id, number=self.number) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + size = 5000 + X_train = X_train[:, :size] + X_test = X_test[:, :size] + y_test = y_test[:, :size] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/genesis.py b/datasets/genesis.py new file mode 100644 index 0000000..8425d89 --- /dev/null +++ b/datasets/genesis.py @@ -0,0 +1,126 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("GENESIS") + + +def load_data(db_path, record_ids=None, verbose=False): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + verbose: If True, print loading progress information. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with genesis pattern + record_files = list(db_path.glob("genesis-*.out")) + record_ids = [f.name for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Handle direct filename or construct genesis pattern + if record_id.startswith('genesis-') and record_id.endswith('.out'): + record_file = db_path / record_id + else: + record_file = db_path / \ + f"genesis-anomalies.test.csv@{record_id}.out" + if record_file.exists(): + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for record {record_id}") + else: + if verbose: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "GENESIS" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the GENESIS dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/ghl.py b/datasets/ghl.py new file mode 100644 index 0000000..dd102dd --- /dev/null +++ b/datasets/ghl.py @@ -0,0 +1,130 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("GHL") + + +def load_data(db_path, record_ids=None, verbose=False): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + verbose: If True, print loading progress information. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with GHL pattern + record_files = list(db_path.glob( + "*_Lev_fault_Temp_corr_*.test.csv@*.out")) + record_ids = [f.name for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Handle direct filename or construct pattern + if '_Lev_fault_Temp_corr_' in record_id and record_id.endswith('.out'): + record_file = db_path / record_id + else: + # Try to find matching files with pattern + matching_files = list(db_path.glob( + f"*{record_id}*_Lev_fault_Temp_corr_*.out")) + record_file = matching_files[0] if matching_files else db_path / \ + f"{record_id}.out" + if record_file.exists(): + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for record {record_id}") + else: + if verbose: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "GHL" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the GHL dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/iops.py b/datasets/iops.py new file mode 100644 index 0000000..12df7ef --- /dev/null +++ b/datasets/iops.py @@ -0,0 +1,139 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("IOPS") +PATH = "/data/parietal/store2/data/tsb-uad/TSB-UAD-Public/IOPS/" + + +def load_data(db_path, verbose=False): + """ + Load train and test data from the database path. + + Args: + db_path: Path to the database directory + verbose: If True, print loading progress information. + + Returns: + tuple: (X_train, X_test, y_test) where: + - X_train: nd.array of shape (num_records, num_samples) + - X_test: nd.array of shape (num_records, num_samples) + - y_test: nd.array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + # Get all train and test files + train_files = list(db_path.glob("KPI-*.train.out")) + test_files = list(db_path.glob("KPI-*.test.out")) + + if not train_files or not test_files: + raise ValueError("No train or test files found") + + # Load train data + train_data_list = [] + for train_file in train_files: + record_data = pd.read_csv(train_file, header=None).dropna().to_numpy() + if record_data.shape[1] >= 1: + train_data_list.append(record_data[:, 0].astype(float)) + else: + if verbose: + print(f"Insufficient columns for train file {train_file}") + + # Load test data and labels + test_data_list = [] + test_labels_list = [] + for test_file in test_files: + record_data = pd.read_csv(test_file, header=None).dropna().to_numpy() + if record_data.shape[1] >= 2: + test_data_list.append(record_data[:, 0].astype(float)) + test_labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for test file {test_file}") + + if not train_data_list or not test_data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_train_length = max(len(data) for data in train_data_list) + max_test_length = max(len(data) for data in test_data_list) + + # Pad train sequences + padded_train_data = [] + for data in train_data_list: + if len(data) < max_train_length: + padded_train_data.append( + np.pad( + data, + (0, max_train_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + else: + padded_train_data.append(data[:max_train_length]) + + # Pad test sequences and labels + padded_test_data = [] + padded_test_labels = [] + for data, labels in zip(test_data_list, test_labels_list): + if len(data) < max_test_length: + padded_test_data.append( + np.pad( + data, + (0, max_test_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_test_labels.append( + np.pad( + labels, + (0, max_test_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_test_data.append(data[:max_test_length]) + padded_test_labels.append(labels[:max_test_length]) + + return ( + np.array(padded_train_data), + np.array(padded_test_data), + np.array(padded_test_labels) + ) + + +class Dataset(BaseDataset): + name = "IOPS" + + parameters = { + "debug": [False], + } + + def get_data(self): + """Load the IOPS dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X_train, X_test, y_test = load_data(PATH) + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/kdd21.py b/datasets/kdd21.py new file mode 100644 index 0000000..3d0da0b --- /dev/null +++ b/datasets/kdd21.py @@ -0,0 +1,124 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("KDD21") + + +def load_data(db_path, record_ids=None, verbose=False): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + verbose: If True, print loading progress information. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + if record_ids is None: + # Get all available record files + record_files = list(db_path.glob("*.out")) + record_ids = [f.name.split('_')[0] for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Convert record_id to 3-digit format + formatted_id = str(record_id).zfill(3) + # Find file that starts with the formatted record_id + matching_files = list(db_path.glob(f"{formatted_id}_*.out")) + if matching_files: + record_file = matching_files[0] # Take the first matching file + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for record {record_id}") + else: + if verbose: + print(f"Record file not found for ID: {record_id}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "KDD21" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the KDD21 dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/mgab.py b/datasets/mgab.py new file mode 100644 index 0000000..ac00972 --- /dev/null +++ b/datasets/mgab.py @@ -0,0 +1,121 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("MGAB") + + +def load_data(db_path, record_ids=None, verbose=False): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + verbose: If True, print loading progress information. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files + record_files = list(db_path.glob("*.test.out")) + record_ids = [f.name.split(".")[0] for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + record_file = db_path / f"{record_id}.test.out" + if record_file.exists(): + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for record {record_id}") + else: + if verbose: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "MGAB" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the MITDB dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + n_recordings, _ = X.shape + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/mitdb.py b/datasets/mitdb.py new file mode 100644 index 0000000..7f811d0 --- /dev/null +++ b/datasets/mitdb.py @@ -0,0 +1,135 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("MITDB") + + +def load_mitdb_data(db_path, record_ids=None, verbose=False): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with format like 100.test.csv@1.out + record_files = list(db_path.glob("*.out")) + record_ids = [str(f.name).split(".")[0] for f in record_files] + + if verbose: + print(f"Loading records: {record_ids}") + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Find file starting with record_id and ending with .out + record_files = list(db_path.glob(f"{record_id}*.out")) + if record_files: + if len(record_files) > 1: + if verbose: + print( + f"Multiple files found for record ID {record_id}, " + f"using the first one: {record_files[0]}" + ) + record_file = record_files[0] + # Load the record data + record_data = pd.read_csv( + db_path / record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if verbose: + print( + f"Loaded record {record_id} " + f"with shape {record_data.shape}") + if record_data.shape[1] >= 2: + if verbose: + print(f"Record {record_id} has sufficient columns") + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for record {record_id}") + else: + if verbose: + print(f"Record file not found for ID: {db_path / record_id}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "MITDB" + + parameters = { + "recordings_id": [["100", "201", "109", "105", "111", "221"]], + "debug": [False], + } + + def get_data(self): + """Load the MITDB dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + if self.recordings_id in (["all"], "all"): + self.recordings_id = None + X, y_true = load_mitdb_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, : int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, -2000:] + X_test = X_test[:, -2000:] + y_test = y_test[:, -2000:] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict(X_train=X_train, y_test=y_test, X_test=X_test) diff --git a/datasets/msl.py b/datasets/msl.py index fe177ba..aaa2fe2 100644 --- a/datasets/msl.py +++ b/datasets/msl.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - import numpy as np - import requests +import numpy as np +import requests # Create global variables to store the urls URL_XTRAIN = ( @@ -58,7 +57,12 @@ def get_data(self): X_test = X_test[:1000] y_test = y_test[:1000] - print(X_train.shape, X_test.shape, y_test.shape) + # Reshaping data to (n_recordings, n_features, n_samples) + # For MSL, treat as single recording + n_features = X_train.shape[1] + X_train = X_train.T.reshape(1, n_features, -1) + X_test = X_test.T.reshape(1, n_features, -1) + y_test = y_test.reshape(1, -1) return dict( X_train=X_train, y_test=y_test, X_test=X_test diff --git a/datasets/nab.py b/datasets/nab.py new file mode 100644 index 0000000..20a0960 --- /dev/null +++ b/datasets/nab.py @@ -0,0 +1,123 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("NAB") + + +def load_data(db_path, record_ids=None, verbose=False): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + verbose: If True, print loading progress information. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files + record_files = list(db_path.glob("NAB_data_*.out")) + record_ids = [f.name.split('_')[2] for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + record_files = list(db_path.glob(f"NAB_data_{record_id}_*.out")) + if record_files: + # Take the first matching file + record_file = record_files[0] + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for record {record_id}") + else: + if verbose: + print(f"Record file not found for: {record_id}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "NAB" + + parameters = { + "recordings_id": [["art0"], ["art1"], ["CloudWatch"]], + "debug": [False], + } + + def get_data(self): + """Load the NAB dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/occupancy.py b/datasets/occupancy.py new file mode 100644 index 0000000..cddb6e5 --- /dev/null +++ b/datasets/occupancy.py @@ -0,0 +1,141 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("OCCUPANCY") + + +def load_data(db_path, record_ids=None, verbose=False): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load for testing. + verbose: If True, print loading progress information. + + Returns: + tuple: (X_train, X_test, y_test) where: + - X_train: numpy array of shape (num_records, num_samples) + - X_test: numpy array of shape (num_records, num_samples) + - y_test: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + # Load training data + train_files = sorted(list(db_path.glob("room-occupancy.train.csv@*.out"))) + if verbose: + print(train_files) + if not train_files: + raise FileNotFoundError("No training files found.") + train_data_list = [ + pd.read_csv(f, header=None).dropna().to_numpy()[:, 0].astype(float) + for f in train_files + ] + # Concatenate all training series into a single array + X_train = np.concatenate(train_data_list) + + # Load testing data + if record_ids is None: + record_ids = sorted( + list(set( + f.name.split('.')[0].split('-')[-1] + for f in db_path.glob("room-occupancy-*.test.csv@*.out") + )) + ) + + test_data_list = [] + labels_list = [] + for record_id in record_ids: + test_files = sorted( + list(db_path.glob(f"room-occupancy-{record_id}.test.csv@*.out")) + ) + if not test_files: + if verbose: + print(f"No test files found for record_id {record_id}") + continue + + for test_file in test_files: + record_data = pd.read_csv( + test_file, header=None).dropna().to_numpy() + if record_data.shape[1] >= 2: + test_data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print( + f"Insufficient columns " + f"for record file {test_file.name}") + + if not test_data_list: + raise ValueError("No valid test data found") + + # Find maximum length for padding test data + max_length = max(len(data) for data in test_data_list) + + # Pad all test sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(test_data_list, labels_list): + pad_width = max_length - len(data) + if pad_width > 0: + padded_data.append( + np.pad( + data, ( + 0, + pad_width), + mode="constant", + constant_values=data[-1] + ) + ) + padded_labels.append( + np.pad( + labels, (0, pad_width), mode="constant", constant_values=0 + ) + ) + else: + padded_data.append(data) + padded_labels.append(labels) + + X_test = np.array(padded_data) + y_test = np.array(padded_labels) + + # Reshape X_train to be 2D + X_train = X_train.reshape(1, -1) + + return X_train, X_test, y_test + + +class Dataset(BaseDataset): + name = "OCCUPANCY" + + parameters = { + "recordings_id": [None], + "debug": [False], + } + + def get_data(self): + """Load the OCCUPANCY dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X_train, X_test, y_test = load_data(PATH, self.recordings_id) + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/opportunity.py b/datasets/opportunity.py new file mode 100644 index 0000000..3968a2b --- /dev/null +++ b/datasets/opportunity.py @@ -0,0 +1,126 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("OPPORTUNITY") + + +def load_data(db_path, record_ids=None, verbose=False): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + verbose: If True, print loading progress information. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with S*-ADL*.test.csv@*.out pattern + record_files = list(db_path.glob("S*-ADL*.test.csv@*.out")) + # Extract record_id from filename + record_ids = [f.name.split('-')[0][1:] for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Find files matching the pattern S{record_id}-ADL*.test.csv@*.out + pattern = f"S{record_id}-ADL*.test.csv@*.out" + matching_files = list(db_path.glob(pattern)) + + if matching_files: + record_file = matching_files[0] # Take first match + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for record {record_id}") + else: + if verbose: + print(f"Record file not found for pattern: {pattern}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "OPPORTUNITY" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the OPPORTUNITY dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/pattern.py b/datasets/pattern.py new file mode 100644 index 0000000..42b4cd3 --- /dev/null +++ b/datasets/pattern.py @@ -0,0 +1,62 @@ +from benchopt import BaseDataset + +import numpy as np +from rosecdl.utils.utils_signal import generate_experiment + + +class Dataset(BaseDataset): + name = "Pattern" + + parameters = { + "n_samples": [10], + "n_times": [5000], + "debug": [False], + "random_state": [42], + "n_times_atom": [250], + } + + def get_data(self): + if self.debug: + self.n_samples = 2 + self.n_times = 1000 + + contamination_params = { + "n_atoms": 2, + "sparsity": 3, + "init_z": "constant", + "init_z_kwargs": {"value": 50}, + } + + simulation_params = { + "n_trials": self.n_samples * 2, + "n_channels": 2, + "n_times": self.n_times, + "n_atoms": 2, + "n_times_atom": self.n_times_atom, + "n_atoms_extra": 2, # extra atoms in the learned dictionary + "D_init": "random", + "window": True, + "contamination_params": contamination_params, + "init_d": "shapes", + "init_d_kwargs": {"shapes": ["sin", "gaussian"]}, + "init_z": "constant", + "init_z_kwargs": {"value": 1}, + "noise_std": 0.01, + "rng": self.random_state, + "sparsity": 20, + } + + X, _, _, _, info_contam = generate_experiment( + simulation_params=simulation_params, + return_info_contam=True, + ) + + X_train, X_test = X[: self.n_samples], X[self.n_samples:] + y_test = info_contam["outliers_mask"][self.n_samples:] + y_test = np.any(y_test, axis=1) + + print(f"X_train shape: {X_train.shape}") + print(f"X_test shape: {X_test.shape}") + print(f"y_test shape: {y_test.shape}") + + return dict(X_train=X_train, y_test=y_test, X_test=X_test) diff --git a/datasets/psm.py b/datasets/psm.py index bd5e60f..b5ce22c 100644 --- a/datasets/psm.py +++ b/datasets/psm.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - import requests - import pandas as pd +import requests +import pandas as pd URL_XTRAIN = ( "https://drive.google.com/uc?&id=1d3tAbYTj0CZLhB7z3IDTfTRg3E7qj_tw" diff --git a/datasets/sensorscope.py b/datasets/sensorscope.py new file mode 100644 index 0000000..7bcdb9a --- /dev/null +++ b/datasets/sensorscope.py @@ -0,0 +1,125 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("SENSORSCOPE") + + +def load_data(db_path, record_ids=None, verbose=False): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + verbose: If True, print loading progress information. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files with stb pattern + record_files = list(db_path.glob("stb-*.test.out")) + record_ids = [f.name for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Handle direct filename or construct pattern + if record_id.startswith('stb-') and record_id.endswith('.test.out'): + record_file = db_path / record_id + else: + record_file = db_path / f"stb-{record_id}.test.out" + if record_file.exists(): + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for record {record_id}") + else: + if verbose: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "SENSORSCOPE" + + parameters = { + "recordings_id": [["10", "11"]], + "debug": [False], + } + + def get_data(self): + """Load the SENSORSCOPE dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/simulated.py b/datasets/simulated.py index 7f48524..a91b101 100644 --- a/datasets/simulated.py +++ b/datasets/simulated.py @@ -1,8 +1,7 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset -with safe_import_context() as import_ctx: - from sklearn.datasets import make_regression - import numpy as np +from sklearn.datasets import make_regression +import numpy as np class Dataset(BaseDataset): @@ -12,17 +11,17 @@ class Dataset(BaseDataset): requirements = ["scikit-learn"] parameters = { - "n_samples": [10000], - "n_features": [5], + "n_samples": [100_000], + "n_features": [6], "noise": [0.1], - "n_anomaly": [90], + "n_anomaly": [15_000], } test_parameters = { - "n_samples": [500], - "n_features": [5], + "n_samples": [64], + "n_features": [2], "noise": [0.1], - "n_anomaly": [90], + "n_anomaly": [9], } def get_data(self): @@ -46,7 +45,7 @@ def get_data(self): # Adding anomalies y_test = np.zeros(self.n_samples) - for i in range(self.n_anomaly): + for _ in range(self.n_anomaly): idx = np.random.randint(self.n_samples) y_test[idx] = 1 @@ -57,4 +56,10 @@ def get_data(self): * 10 ) + # Reshaping data to (n_recordings, n_features, n_samples) + # For simulated data, treat as single recording + X_train = X_train.T.reshape(1, self.n_features, -1) + X_test = X_test.T.reshape(1, self.n_features, -1) + y_test = y_test.reshape(1, -1) + return dict(X_train=X_train, y_test=y_test, X_test=X_test) diff --git a/datasets/smap.py b/datasets/smap.py index 86dd691..756250e 100644 --- a/datasets/smap.py +++ b/datasets/smap.py @@ -1,9 +1,8 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, config -with safe_import_context() as import_ctx: - import numpy as np - import requests - # from sklearn.model_selection import TimeSeriesSplit +import numpy as np +import requests +# from sklearn.model_selection import TimeSeriesSplit URL_XTRAIN = ( "https://drive.google.com/uc?&id=1e_JhpIURD" @@ -63,6 +62,13 @@ def get_data(self): X_test = X_test[:1000] y_test = y_test[:1000] + # Reshaping data to (n_recordings, n_features, n_samples) + # For SMAP, treat as single recording + n_features = X_train.shape[1] + X_train = X_train.T.reshape(1, n_features, -1) + X_test = X_test.T.reshape(1, n_features, -1) + y_test = y_test.reshape(1, -1) + return dict( X_train=X_train, y_test=y_test, X_test=X_test ) diff --git a/datasets/smd.py b/datasets/smd.py new file mode 100644 index 0000000..d258391 --- /dev/null +++ b/datasets/smd.py @@ -0,0 +1,130 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("SMD") + + +def load_data(db_path, record_ids=None): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + # Get all available record files matching the pattern + record_files = list(db_path.glob("machine-*-*.test.csv*")) + # Extract record IDs from filenames + record_ids = [] + for f in record_files: + # Extract from machine-{record_id}-*.test.csv + parts = f.stem.split('-') + if len(parts) >= 3: + record_ids.append(parts[1]) + record_ids = list(set(record_ids)) # Remove duplicates + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Find files matching the pattern + pattern = f"machine-{record_id}-*.test.csv*" + record_files = list(db_path.glob(pattern)) + + for record_file in record_files: + if record_file.exists(): + # Load the record data + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + print(f"Insufficient columns for record {record_id}") + else: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + # Find maximum length for padding + max_length = max(len(data) for data in data_list) + + # Pad all sequences to the same length + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Pad with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "SMD" + + parameters = { + "recordings_id": [["1", "2"]], + "debug": [False], + } + + def get_data(self): + """Load the SMD dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + # For SMD, treat as single recording + n_features = X_train.shape[1] + X_train = X_train.T.reshape(1, n_features, -1) + X_test = X_test.T.reshape(1, n_features, -1) + y_test = y_test.reshape(1, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/svdb.py b/datasets/svdb.py new file mode 100644 index 0000000..ea127c6 --- /dev/null +++ b/datasets/svdb.py @@ -0,0 +1,151 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +PATH = config.get_data_path("SVDB") + + +def load_data(db_path, record_ids=None, verbose=False, number=-1): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is not None and number > 0: + print("Warning: 'number' parameter is " + "ignored when 'record_ids' is provided.") + + if record_ids is None: + record_files = list(db_path.glob("*.test.csv@*.out")) + record_ids = [f.name.split(".")[0] for f in record_files] + if number > 0: + record_ids = record_ids[:number] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Handle case where record_id already includes the pattern + record_files = list(db_path.glob(f"{record_id}*test.csv@*.out")) + if record_files: + if len(record_files) > 1: + if verbose: + print( + f"Multiple files found for record ID {record_id}, " + f"using the first one: {record_files[0]}" + ) + record_file = record_files[0] + # Load the record data + record_data = pd.read_csv( + db_path / record_file, header=None).dropna().to_numpy() + # Assuming first column is the data, second column is labels + if verbose: + print( + f"Loaded record {record_id} " + f"with shape {record_data.shape}") + if record_data.shape[1] >= 2: + if verbose: + print(f"Record {record_id} has sufficient columns") + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for record {record_id}") + else: + if verbose: + print(f"Record file not found for ID: {db_path / record_id}") + if not data_list: + raise ValueError("No valid data found") + + max_length = max(len(data) for data in data_list) + + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Padding with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "SVDB" + + parameters = { + "recordings_id": [["801"]], + "number": [-1], + "debug": [False], + } + + def get_data(self): + """Load the SVDB dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + if self.recordings_id in (["all"], "all"): + self.recordings_id = None + X, y_true = load_data(PATH, self.recordings_id, number=self.number) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + plt.figure(figsize=(6, 3)) + plt.plot(X_train[0, 0, :500], linewidth=1.2) + plt.plot(range(350, 360), + X_train[0, 0, 350:360], color="orange", linewidth=3) + plt.title("SVDB dataset") + plt.tight_layout() + plt.savefig("svdb_example.png") + plt.close() + + print("PLOT SAVED") + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/datasets/swat.py b/datasets/swat.py index 5400d4f..943aad6 100644 --- a/datasets/swat.py +++ b/datasets/swat.py @@ -1,14 +1,13 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset from benchopt.config import get_data_path from benchmark_utils import check_data -with safe_import_context() as import_ctx: - import pandas as pd +import pandas as pd - # Checking if the data is available - PATH = get_data_path(key="SWaT") - TRAIN_PATH = check_data(PATH, "SWaT", "train") - TEST_PATH = check_data(PATH, "SWaT", "test") +# Checking if the data is available +PATH = get_data_path(key="SWaT") +TRAIN_PATH = check_data(PATH, "SWaT", "train") +TEST_PATH = check_data(PATH, "SWaT", "test") class Dataset(BaseDataset): @@ -45,6 +44,13 @@ def get_data(self): X_test = X_test[:1000] y_test = y_test[:1000] + # Reshaping data to (n_recordings, n_features, n_samples) + # For SWaT, treat as single recording + n_features = X_train.shape[1] + X_train = X_train.T.reshape(1, n_features, -1) + X_test = X_test.T.reshape(1, n_features, -1) + y_test = y_test.reshape(1, -1) + return dict( X_train=X_train, y_test=y_test, X_test=X_test ) diff --git a/datasets/trend.py b/datasets/trend.py new file mode 100644 index 0000000..44db101 --- /dev/null +++ b/datasets/trend.py @@ -0,0 +1,79 @@ +from benchopt import BaseDataset + +import numpy as np +from rosecdl.utils.utils_signal import generate_experiment + + +class Dataset(BaseDataset): + name = "Trend" + + parameters = { + "n_samples": [10], + "n_times": [5000], + "debug": [False], + "random_state": [42], + "n_times_atom": [250], + "trend_scale": [9], + "freq": [4], # frequency multiplier for the trend + } + + def get_data(self): + if self.debug: + self.n_samples = 2 + self.n_times = 1000 + + contamination_params = { + "n_atoms": 2, + "sparsity": 3, + "init_z": "constant", + "init_z_kwargs": {"value": 50}, + } + + simulation_params = { + "n_trials": self.n_samples * 2, + "n_channels": 2, + "n_times": self.n_times, + "n_atoms": 2, + "n_times_atom": self.n_times_atom, + "n_atoms_extra": 2, # extra atoms in the learned dictionary + "D_init": "random", + "window": True, + "contamination_params": contamination_params, + "init_d": "shapes", + "init_d_kwargs": {"shapes": ["sin", "gaussian"]}, + "init_z": "constant", + "init_z_kwargs": {"value": 1}, + "noise_std": 0.01, + "rng": self.random_state, + "sparsity": 20, + } + + X, _, _, _, info_contam = generate_experiment( + simulation_params=simulation_params, + return_info_contam=True, + ) + + # Add low frequency sinusoidal trend + t = np.linspace(0, self.freq * np.pi, self.n_times) + trend = self.trend_scale * np.sin(t) + X += trend[None, None, :] + + X_train, X_test = X[: self.n_samples], X[self.n_samples:] + y_test = info_contam["outliers_mask"][self.n_samples:] + y_test = np.any(y_test, axis=1) + + import matplotlib.pyplot as plt + # Plot example time series with trend + plt.figure(figsize=(10, 4)) + plt.plot(X_train[0, 0, :]) + plt.title('Example Time Series with Added Trend') + plt.xlabel('Time') + plt.ylabel('Value') + plt.legend() + plt.show() + + print(f"X_train shape: {X_train.shape}") + print(f"X_test shape: {X_test.shape}") + print(f"y_test shape: {y_test.shape}") + + return dict(X_train=X_train, y_test=y_test, X_test=X_test) diff --git a/datasets/wadi.py b/datasets/wadi.py index d890ec6..1c5c502 100644 --- a/datasets/wadi.py +++ b/datasets/wadi.py @@ -1,14 +1,13 @@ -from benchopt import BaseDataset, safe_import_context +from benchopt import BaseDataset from benchopt.config import get_data_path from benchmark_utils import check_data -with safe_import_context() as import_ctx: - import pandas as pd +import pandas as pd - # Checking if the data is available - PATH = get_data_path(key="WADI") - TRAIN_PATH = check_data(PATH, "WADI", "train") - TEST_PATH = check_data(PATH, "WADI", "test") +# Checking if the data is available +PATH = get_data_path(key="WADI") +TRAIN_PATH = check_data(PATH, "WADI", "train") +TEST_PATH = check_data(PATH, "WADI", "test") class Dataset(BaseDataset): @@ -66,6 +65,13 @@ def get_data(self): X_test = X_test[:1000] y_test = y_test[:1000] + # Reshaping data to (n_recordings, n_features, n_samples) + # For WADI, treat as single recording + n_features = X_train.shape[1] + X_train = X_train.T.reshape(1, n_features, -1) + X_test = X_test.T.reshape(1, n_features, -1) + y_test = y_test.reshape(1, 1, -1) + return dict( X_train=X_train, y_test=y_test, X_test=X_test ) diff --git a/datasets/yahoo.py b/datasets/yahoo.py new file mode 100644 index 0000000..181ef0e --- /dev/null +++ b/datasets/yahoo.py @@ -0,0 +1,140 @@ +from benchopt import BaseDataset, config + +from pathlib import Path +import numpy as np +import pandas as pd + +PATH = config.get_data_path("YAHOO") + + +def load_data(db_path, record_ids=None, verbose=False): + """ + Load data from the database path for specified record IDs. + + Args: + db_path: Path to the database directory + record_ids: List of record IDs to load. + If None, loads all available records. + + Returns: + tuple: (X, y_true) where: + - X: numpy array of shape (num_records, num_samples) + - y_true: numpy array of shape (num_records, num_samples) + """ + db_path = Path(db_path) + + if record_ids is None: + record_files = list(db_path.glob("*.data.out")) + record_ids = [f.name for f in record_files] + + data_list = [] + labels_list = [] + for record_id in record_ids: + # Handle case where record_id already includes the pattern + if record_id.endswith('.data.out'): + pattern = record_id + else: + # Create pattern based on the A{record_id} format + patterns = [ + f"Yahoo_A{record_id}real_*_data.out", + f"Yahoo_A{record_id}synthetic_*_data.out", + f"YahooA{record_id}Benchmark-TS*_data.out" + ] + + # Find all matching files for this record_id + matching_files = [] + if record_id.endswith('.data.out'): + matching_files = list(db_path.glob(pattern)) + else: + for pattern in patterns: + matching_files.extend(list(db_path.glob(pattern))) + + if not matching_files: + if verbose: + print(f"No files found for record {record_id}") + continue + + for record_file in matching_files: + if record_file.exists(): + record_data = pd.read_csv( + record_file, header=None).dropna().to_numpy() + # First column is the data, second column is labels + if record_data.shape[1] >= 2: + data_list.append(record_data[:, 0].astype(float)) + labels_list.append(record_data[:, 1].astype(int)) + else: + if verbose: + print(f"Insufficient columns for file {record_file}") + else: + if verbose: + print(f"Record file not found: {record_file}") + + if not data_list: + raise ValueError("No valid data found") + + max_length = max(len(data) for data in data_list) + + padded_data = [] + padded_labels = [] + for data, labels in zip(data_list, labels_list): + if len(data) < max_length: + # Padding with last value for data and 0 for labels + padded_data.append( + np.pad( + data, + (0, max_length - len(data)), + mode="constant", + constant_values=data[-1], + ) + ) + padded_labels.append( + np.pad( + labels, + (0, max_length - len(labels)), + mode="constant", + constant_values=0, + ) + ) + else: + padded_data.append(data[:max_length]) + padded_labels.append(labels[:max_length]) + + return np.array(padded_data), np.array(padded_labels) + + +class Dataset(BaseDataset): + name = "YAHOO" + + parameters = { + "recordings_id": [["1"]], + "debug": [False], + } + + def get_data(self): + """Load the YAHOO dataset.""" + + # X shape (n_recordings, n_samples) + # y shape (n_recordings, n_samples) + X, y_true = load_data(PATH, self.recordings_id) + + X_test = X.copy() + y_test = y_true.copy() + + X_train = X[:, :int(X.shape[1] * 0.1)] + + if self.debug: + X_train = X_train[:, :1000] + X_test = X_test[:, :1000] + y_test = y_test[:, :1000] + + # Reshaping data to (n_recordings, n_features, n_samples) + n_recordings = X_train.shape[0] + X_train = X_train.reshape(n_recordings, 1, -1) + X_test = X_test.reshape(n_recordings, 1, -1) + y_test = y_test.reshape(n_recordings, -1) + + return dict( + X_train=X_train, + y_test=y_test, + X_test=X_test + ) diff --git a/objective.py b/objective.py index 3d34e42..04dbde5 100644 --- a/objective.py +++ b/objective.py @@ -1,20 +1,25 @@ -from benchopt import BaseObjective, safe_import_context +from benchopt import BaseObjective from benchmark_utils.metrics import ( soft_precision as soft_precision_score, soft_recall as soft_recall_score, soft_f1 as soft_f1_score, - ctt, ttc, + ctt, + ttc, extract_anomaly_ranges, precision_t as precision_t_score, recall_t as recall_t_score, - f1_t as f1_t_score + f1_t as f1_t_score, ) -with safe_import_context() as import_ctx: - import numpy as np - from sklearn.metrics import ( - precision_score, recall_score, f1_score, zero_one_loss - ) +import numpy as np +from sklearn.metrics import ( + average_precision_score, + precision_score, + recall_score, + f1_score, + zero_one_loss, + roc_auc_score, +) class Objective(BaseObjective): @@ -23,80 +28,301 @@ class Objective(BaseObjective): install_cmd = "conda" requirements = ["scikit-learn"] + parameters = { + "score_metrics": [("auc_pr", "auc_roc")], + "prediction_metrics": [None], + } + + detection_ranges = (1, 3, 5, 10, 20) + default_prediction_metrics = ( + "precision", + "recall", + "f1", + "precision_t", + "recall_t", + "f1_t", + "ctt", + "ttc", + "zoloss", + "soft_precision", + "soft_recall", + "soft_f1", + ) + def get_one_result(self): - """Return one solution for which the objective can be computed, - Used to get the shape of the result. - Our algorithms will return an array of labels of shape (n_samples,) - """ - return dict(y_hat=np.ones(self.X_test.shape[0])) + """Return one solution for which the objective can be computed.""" + score_metrics = self._normalize_metrics( + getattr(self, "score_metrics", ("auc_pr", "auc_roc")) + ) + prediction_metrics = self._expand_prediction_metrics( + getattr(self, "prediction_metrics", None) + ) + + result = {} + if score_metrics: + result["anomaly_scores"] = np.zeros_like( + self.y_test, dtype=float + ) + if prediction_metrics: + result["anomaly_predictions"] = np.zeros_like( + self.y_test, dtype=int + ) + return result def set_data(self, X_train, y_test, X_test): "Set the data to compute the objective." self.X_train = X_train self.X_test, self.y_test = X_test, y_test - def evaluate_result(self, y_hat): - """Evaluate the result provided by the solver.""" - to_discard = (y_hat == -1).sum() - self.y_test = self.y_test[to_discard:] - y_hat = y_hat[to_discard:] - - result = {} - detection_ranges = [1, 3, 5, 10, 20] - - # Standard metrics - precision = precision_score(self.y_test, y_hat, zero_division=0) - recall = recall_score(self.y_test, y_hat, zero_division=0) - f1 = f1_score(self.y_test, y_hat, zero_division=0) + def evaluate_result( + self, + anomaly_scores=None, + anomaly_predictions=None, + ): + """Evaluate the result provided by the solver. - anomaly_ranges = extract_anomaly_ranges(self.y_test) - prediction_ranges = extract_anomaly_ranges(y_hat) + anomaly_scores is the score-based solver output. + anomaly_predictions is optional and only needed when requesting + prediction-based metrics. + """ + score_metrics = self._normalize_metrics( + getattr(self, "score_metrics", ("auc_pr", "auc_roc")) + ) + prediction_metrics = self._expand_prediction_metrics( + getattr(self, "prediction_metrics", None) + ) - precision_t = precision_t_score(anomaly_ranges, prediction_ranges) - recall_t = recall_t_score(anomaly_ranges, prediction_ranges) - f1_t = f1_t_score(anomaly_ranges, prediction_ranges) + if score_metrics and anomaly_scores is None: + raise ValueError("score_metrics require an anomaly_scores array.") + if prediction_metrics and anomaly_predictions is None: + raise ValueError( + "prediction_metrics require an anomaly_predictions array.") - result.update({ - "precision": precision, - "recall": recall, - "f1": f1 - }) + y_true, scores, predictions = self._align_inputs( + anomaly_scores=anomaly_scores, + anomaly_predictions=anomaly_predictions, + ) - for range_value in detection_ranges: - soft_precision = soft_precision_score( - self.y_test, y_hat, detection_range=range_value + result = {} + if score_metrics: + result.update( + self._compute_score_metrics( + y_true=y_true, + anomaly_scores=scores, + metrics=score_metrics, + ) ) - soft_recall = soft_recall_score( - self.y_test, y_hat, detection_range=range_value + if prediction_metrics: + result.update( + self._compute_prediction_metrics( + y_true=y_true, + anomaly_predictions=predictions, + metrics=prediction_metrics, + ) ) - soft_f1 = soft_f1_score(soft_precision, soft_recall) - - result.update({ - f"soft_precision_{range_value}": soft_precision, - f"soft_recall_{range_value}": soft_recall, - f"soft_f1_{range_value}": soft_f1 - }) - - zoloss = zero_one_loss(self.y_test, y_hat) - - # Other metrics - cct_score = ctt(self.y_test, y_hat) - ttc_score = ttc(self.y_test, y_hat) - - # Add remaining metrics to the result dictionary - result.update({ - "precision_t": precision_t, - "recall_t": recall_t, - "f1_t": f1_t, - "cct": cct_score, - "ttc": ttc_score, - "zoloss": zoloss, - "value": zoloss # having zoloss twice for the API - }) + # Setting value to 0. The actual value is not used for ranking. + result["value"] = 0.0 return result def get_objective(self): - return dict( - X_train=self.X_train, y_test=None, X_test=self.X_test + return dict(X_train=self.X_train, X_test=self.X_test) + + def _normalize_metrics(self, metrics): + if metrics is None: + return () + if isinstance(metrics, str): + if metrics == "all": + return ("auc_pr", "auc_roc") + return (metrics,) + return tuple(metric for metric in metrics if metric is not None) + + def _expand_prediction_metrics(self, metrics): + metrics = self._normalize_prediction_metrics(metrics) + expanded = [] + + for metric in metrics: + if metric == "all": + metric = self.default_prediction_metrics + else: + metric = (metric,) + + for name in metric: + if name in { + "soft_precision", + "soft_recall", + "soft_f1", + }: + expanded.extend( + f"{name}_{detection_range}" + for detection_range in self.detection_ranges + ) + else: + expanded.append(name) + + return tuple(expanded) + + def _normalize_prediction_metrics(self, metrics): + if metrics is None: + return () + if isinstance(metrics, str): + return (metrics,) + return tuple(metric for metric in metrics if metric is not None) + + def _align_inputs(self, anomaly_scores, anomaly_predictions): + # flatten everything before aligning lengths. + y_true = np.asarray(self.y_test).reshape(-1) + scores = self._as_flat_array(anomaly_scores) + predictions = self._as_flat_array(anomaly_predictions) + + # Only align against arrays that were returned. This keeps + # score-only and prediction-only evaluations valid. + arrays = [array for array in ( + scores, predictions) if array is not None] + if not arrays: + return y_true, None, None + + # Windowed solvers return fewer outputs than y_test because the + # first timestamps have no full context window. Keep the last samples, + # which correspond to the part of y_test the solver scored. + length = min([len(y_true)] + [len(array) for array in arrays]) + y_true = y_true[-length:] + if scores is not None: + scores = scores[-length:] + if predictions is not None: + predictions = predictions[-length:] + + # Drop invalid positions. NaN score padding and -1 prediction padding + # When both scores and predictions are present, the same mask is + # applied to keep mixed metric requests on the same timestamps. + valid = np.ones(length, dtype=bool) + if scores is not None: + valid &= ~np.isnan(scores) + if predictions is not None: + valid &= ~np.isnan(predictions) + valid &= predictions != -1 + + y_true = y_true[valid] + if scores is not None: + scores = scores[valid] + if predictions is not None: + predictions = predictions[valid] + + return y_true, scores, predictions + + def _as_flat_array(self, array): + if array is None: + return None + return np.asarray(array).reshape(-1) + + def _compute_score_metrics(self, y_true, anomaly_scores, metrics): + if len(y_true) == 0: + return {metric: np.nan for metric in metrics} + + result = {} + for metric in metrics: + if metric == "auc_roc": + result[metric] = self._safe_auc_roc(y_true, anomaly_scores) + elif metric == "auc_pr": + result[metric] = self._auc_pr(y_true, anomaly_scores) + else: + raise ValueError(f"Unknown score metric: {metric}") + return result + + def _compute_prediction_metrics( + self, + y_true, + anomaly_predictions, + metrics, + ): + if len(y_true) == 0: + return {metric: np.nan for metric in metrics} + + result = {} + anomaly_ranges = None + prediction_ranges = None + + for metric in metrics: + if metric == "precision": + result[metric] = precision_score( + y_true, anomaly_predictions, zero_division=0 + ) + elif metric == "recall": + result[metric] = recall_score( + y_true, anomaly_predictions, zero_division=0 + ) + elif metric == "f1": + result[metric] = f1_score( + y_true, anomaly_predictions, zero_division=0) + elif metric == "zoloss": + result[metric] = zero_one_loss(y_true, anomaly_predictions) + elif metric in {"precision_t", "recall_t", "f1_t"}: + if anomaly_ranges is None: + anomaly_ranges, prediction_ranges = self._get_ranges( + y_true, anomaly_predictions + ) + if metric == "precision_t": + result[metric] = precision_t_score( + anomaly_ranges, prediction_ranges + ) + elif metric == "recall_t": + result[metric] = recall_t_score( + anomaly_ranges, prediction_ranges) + else: + result[metric] = f1_t_score( + anomaly_ranges, prediction_ranges) + elif metric == "ctt": + result[metric] = ctt(y_true, anomaly_predictions) + elif metric == "ttc": + result[metric] = ttc(y_true, anomaly_predictions) + elif metric.startswith("soft_precision_"): + detection_range = self._parse_detection_range( + metric, "soft_precision") + result[metric] = soft_precision_score( + y_true, + anomaly_predictions, + detection_range=detection_range, + ) + elif metric.startswith("soft_recall_"): + detection_range = self._parse_detection_range( + metric, "soft_recall") + result[metric] = soft_recall_score( + y_true, + anomaly_predictions, + detection_range=detection_range, + ) + elif metric.startswith("soft_f1_"): + detection_range = self._parse_detection_range( + metric, "soft_f1") + result[metric] = soft_f1_score( + y_true, + anomaly_predictions, + detection_range=detection_range, + ) + else: + raise ValueError(f"Unknown prediction metric: {metric}") + + return result + + def _get_ranges(self, y_true, anomaly_predictions): + return ( + extract_anomaly_ranges(y_true), + extract_anomaly_ranges(anomaly_predictions), ) + + def _parse_detection_range(self, metric, prefix): + suffix = metric.replace(f"{prefix}_", "", 1) + try: + return int(suffix) + except ValueError as exc: + raise ValueError( + f"Invalid detection range in prediction metric: {metric}" + ) from exc + + def _safe_auc_roc(self, y_true, anomaly_scores): + return roc_auc_score(y_true, anomaly_scores) + + def _auc_pr(self, y_true, anomaly_scores): + if len(np.unique(y_true)) == 1: + return np.nan + return average_precision_score(y_true, anomaly_scores) diff --git a/solvers/AR.py b/solvers/AR.py index ff2d547..50eb908 100644 --- a/solvers/AR.py +++ b/solvers/AR.py @@ -1,20 +1,21 @@ # AR model -from benchopt import BaseSolver, safe_import_context -from benchmark_utils import mean_overlaping_pred +from benchopt import BaseSolver + +import torch +from torch import optim, nn +import numpy as np +from tqdm import tqdm -with safe_import_context() as import_ctx: - import torch - from torch import optim, nn - import numpy as np - from tqdm import tqdm - from benchmark_utils.models import ARModel +from benchmark_utils.models import ARModel +from benchmark_utils import mean_overlaping_pred +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): name = "AR" # AutoRegressive Linear model install_cmd = "conda" - requirements = ["pip:torch", "tqdm"] + requirements = ["pytorch", "tqdm"] sampling_strategy = "run_once" @@ -23,29 +24,39 @@ class Solver(BaseSolver): "n_epochs": [50], "lr": [1e-5], "weight_decay": [1e-7], - "window_size": [256], + "window_size": [100], "horizon": [1], - "percentile": [99.4], + "cutoff": [None], + } + + test_config = { + "n_epochs": 1, + "window_size": 16, } - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) - self.X_train = X_train # (n_samples, n_features) - self.X_test, self.y_test = X_test, y_test # (n_samples, n_features) - self.n_features = X_train.shape[1] + # Receiving shapes of (n_recordings, n_features, n_samples) + + _, n_features, _ = X_train.shape + + # (n_samples, n_features) + self.X_train = X_train.reshape(-1, n_features) + # (n_samples, n_features) + self.X_test = X_test.reshape(-1, n_features) self.model = ARModel( - self.n_features, + n_features, self.window_size, self.horizon ) self.optimizer = optim.Adam( self.model.parameters(), - lr=self.lr, + lr=float(self.lr), # weight_decay=self.weight_decay ) self.criterion = nn.MSELoss() @@ -53,7 +64,7 @@ def set_objective(self, X_train, y_test, X_test): if self.X_train is not None: # (n_windows, window_size+horizon, n_features) self.Xw_train = np.lib.stride_tricks.sliding_window_view( - X_train, + self.X_train, window_shape=self.window_size+self.horizon, axis=0 ).transpose(0, 2, 1) @@ -61,7 +72,7 @@ def set_objective(self, X_train, y_test, X_test): if self.X_test is not None: # (n_windows, window_size+horizon, n_features) self.Xw_test = np.lib.stride_tricks.sliding_window_view( - X_test, + self.X_test, window_shape=self.window_size+self.horizon, axis=0 ).transpose(0, 2, 1) @@ -120,38 +131,36 @@ def run(self, _): xw_hat = xw_hat.detach().cpu().numpy() - # Reconstructing the prediction from the predicted windows - # Creating the prediction array with -1 for the unknown values - # Corresponding to the first window_size values - x_hat = np.zeros_like(self.X_test)-1 # (n_test_samples, n_features) - x_hat[self.window_size:self.window_size+self.horizon] = xw_hat[0] + # Reconstructing the prediction from the predicted windows. + # The first ``window_size`` positions have no forecast (no full input + # window precedes them); fill them with -1 as a sentinel. + x_hat = np.zeros_like(self.X_test) - 1 + x_hat[self.window_size:] = mean_overlaping_pred(xw_hat, 1) - x_hat[self.window_size+self.horizon:] = mean_overlaping_pred( - xw_hat, 1 + reconstruction_err = np.abs( + self.X_test[self.window_size:] - x_hat[self.window_size:] ) - - # Calculating the percentile value for the threshold - percentile_value = np.percentile( - np.abs(self.X_test[self.window_size:] - x_hat[self.window_size:]), - self.percentile + self.anomaly_scores = np.full( + self.X_test.shape, np.nan, dtype=float ) + self.anomaly_scores[self.window_size:] = reconstruction_err + self.anomaly_scores = np.max(self.anomaly_scores, axis=1) - # Thresholding - predictions = np.zeros_like(x_hat)-1 - predictions[self.window_size:] = np.where( - np.abs(self.X_test[self.window_size:] - - x_hat[self.window_size:]) > percentile_value, 1, 0 + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) - self.predictions = np.max(predictions, axis=1) - # Skipping the solver call if a condition is met - def skip(self, X_train, X_test, y_test): - if X_train.shape[0] < self.window_size + self.horizon: + def skip(self, X_train, X_test): + if X_train.shape[0]*X_train.shape[2] < self.window_size + self.horizon: return True, "No enough training samples" - if X_test.shape[0] < self.window_size + self.horizon: + if X_test.shape[0]*X_test.shape[2] < self.window_size + self.horizon: return True, "No enough testing samples" return False, None def get_result(self): - return dict(y_hat=self.predictions) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/anomalybert.py b/solvers/anomalybert.py new file mode 100644 index 0000000..8c9d1b4 --- /dev/null +++ b/solvers/anomalybert.py @@ -0,0 +1,295 @@ +from models.anomaly_transformer import get_anomaly_transformer +from benchopt import BaseSolver + +import sys +from pathlib import Path +import numpy as np +import torch +import torch.nn as nn +from torch.optim.lr_scheduler import CosineAnnealingLR +from tqdm import tqdm + +from benchmark_utils.predictions import cutoff_scores + +# Add AnomalyBERT to path +sys.path.append(str(Path(__file__).parent.parent / 'AnomalyBERT')) + + +class Solver(BaseSolver): + name = "AnomalyBERT" + sampling_strategy = "run_once" + + requirements = ["pip::timm", "pytorch", "numpy", "tqdm"] + + parameters = { + "patch_size": [1], + "d_embed": [512], + "n_layer": [6], + "batch_size": [128], + "lr": [0.0001], + "max_steps": [5000], + "n_patches": [512], + "seed": [548920], + "device": ["cuda:1"], + "window_sliding": [16], + "cutoff": [None], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, X_test): + # X_train shape: (n_series, n_features, n_samples) + if X_train.ndim == 3: + self.X_train = np.transpose( + X_train, (0, 2, 1)).reshape( + -1, X_train.shape[1] + ).astype(np.float32) + self.X_test = np.transpose( + X_test, (0, 2, 1)).reshape( + -1, X_test.shape[1] + ).astype(np.float32) + else: + self.X_train = X_train.astype(np.float32) + self.X_test = X_test.astype(np.float32) + + def run(self, _): + torch.manual_seed(self.seed) + np.random.seed(self.seed) + + device = torch.device( + self.device if torch.cuda.is_available() else 'cpu') + + train_data = self.X_train + d_data = train_data.shape[1] + + # Configuration + patch_size = self.patch_size + # This corresponds to n_features (max_seq_len) in AnomalyBERT + n_patches = self.n_patches + data_seq_len = n_patches * patch_size + + if len(train_data) <= data_seq_len: + raise ValueError( + f"Data length {len(train_data)} is smaller than " + f"sequence length {data_seq_len}") + self.model = get_anomaly_transformer( + input_d_data=d_data, + output_d_data=1, # BCE loss + patch_size=patch_size, + d_embed=self.d_embed, + hidden_dim_rate=4., + max_seq_len=n_patches, + positional_encoding=None, + relative_position_embedding=True, + transformer_n_layer=self.n_layer, + transformer_n_head=8, + dropout=0.1 + ).to(device) + + # Optimizer + optimizer = torch.optim.AdamW( + params=self.model.parameters(), lr=self.lr, weight_decay=1e-4) + scheduler = CosineAnnealingLR( + optimizer, T_max=self.max_steps, eta_min=self.lr*0.01) + + train_loss_fn = nn.BCELoss().to(device) + sigmoid = nn.Sigmoid().to(device) + + # Data Augmentation Parameters + replacing_rate = (0.015, 0.15) + + replacing_table = list(np.random.randint( + int(data_seq_len*replacing_rate[0]), + int(data_seq_len*replacing_rate[1]), + size=10000)) + replacing_table_index = 0 + replacing_table_length = 10000 + + soft_replacing_prob = 1 - 0.5 + uniform_replacing_prob = soft_replacing_prob - 0.15 + peak_noising_prob = uniform_replacing_prob - 0.15 + + replacing_weight = 0.7 + + def replacing_weights(interval_len): + warmup_len = interval_len // 10 + return np.concatenate(( + np.linspace(0, replacing_weight, num=warmup_len), + np.full(interval_len-2*warmup_len, replacing_weight), + np.linspace(replacing_weight, 0, num=warmup_len)), + axis=None) + + valid_index_list = np.arange(len(train_data) - data_seq_len) + numerical_column = np.arange(d_data) # Assume all numerical + + # Training Loop + self.model.train() + for i in tqdm(range(self.max_steps)): + first_index = np.random.choice( + valid_index_list, size=self.batch_size) + x = [] + for j in first_index: + x.append(torch.Tensor( + train_data[j:j+data_seq_len].copy()).to(device)) + + # Replace data logic + current_index = replacing_table_index + replacing_table_index += self.batch_size + + if replacing_table_index > replacing_table_length: + replacing_lengths = replacing_table[current_index:] + \ + replacing_table[:replacing_table_index - + replacing_table_length] + replacing_table_index -= replacing_table_length + else: + replacing_lengths = replacing_table[ + current_index:replacing_table_index + ] + if replacing_table_index == replacing_table_length: + replacing_table_index = 0 + + replacing_lengths = np.array(replacing_lengths) + + target_index = np.random.randint( + 0, data_seq_len-replacing_lengths+1) + + replacing_type = np.random.uniform(0., 1., size=(self.batch_size,)) + replacing_dim_numerical = np.random.uniform( + 0., 1., size=(self.batch_size, d_data)) + replacing_dim_numerical = replacing_dim_numerical - \ + np.maximum(replacing_dim_numerical.min( + axis=1, keepdims=True), 0.3) <= 0.001 + + x_anomaly = torch.zeros( + self.batch_size, data_seq_len, device=device) + + for j, tar, leng, typ, dim_num in zip( + range(self.batch_size), + target_index, replacing_lengths, + replacing_type, + replacing_dim_numerical): + if leng > 0: + _x = x[j].clone().transpose(0, 1) # (d_data, seq_len) + + # External interval replacing + if typ > soft_replacing_prob: + col_num = numerical_column[dim_num] + if len(col_num) > 0: + # Pick random interval from train_data + rep_start = np.random.randint( + 0, len(train_data) - leng) + random_interval = train_data[rep_start:rep_start + + leng, col_num].copy() + + # Random flip + if np.random.rand() > 0.5: # Horizontal + random_interval = random_interval[::-1].copy() + if np.random.rand() > 0.5: # Vertical + random_interval = 1 - random_interval + + _x_temp = torch.from_numpy(random_interval).to( + device).transpose(0, 1) # (n_cols, leng) + + weights = torch.from_numpy(replacing_weights( + leng)).float().unsqueeze(0).to(device) + _x[col_num, tar:tar+leng] = _x_temp * weights + \ + _x[col_num, tar:tar+leng] * (1 - weights) + + x_anomaly[j, tar:tar+leng] = 1 + x[j] = _x.transpose(0, 1) + + # Uniform replacing + elif typ > uniform_replacing_prob: + col_num = numerical_column[dim_num] + if len(col_num) > 0: + _x[col_num, tar:tar + + leng] = torch.rand( + len(col_num), leng, device=device + ) + x_anomaly[j, tar:tar+leng] = 1 + x[j] = _x.transpose(0, 1) + + # Peak noising + elif typ > peak_noising_prob: + col_num = numerical_column[dim_num] + if len(col_num) > 0: + peak_index = np.random.randint(0, leng) + peak_value = ( + _x[col_num, tar+peak_index] < 0.5 + ).float().to(device) + peak_value = peak_value + \ + (0.1 * (1 - 2 * peak_value)) * \ + torch.rand(len(col_num), device=device) + _x[col_num, tar+peak_index] = peak_value + + tar_first = np.maximum( + 0, tar + peak_index - patch_size) + tar_last = tar + peak_index + patch_size + 1 + x_anomaly[j, tar_first:tar_last] = 1 + x[j] = _x.transpose(0, 1) + + z = torch.stack(x) + y = self.model(z) + y = y.squeeze(-1) + loss = train_loss_fn(sigmoid(y), x_anomaly) + + optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(self.model.parameters(), 1.0) + optimizer.step() + scheduler.step() + + device = torch.device( + self.device if torch.cuda.is_available() else 'cpu') + self.model.eval() + + test_data = self.X_test + window_size = self.n_patches * self.patch_size + window_sliding = self.window_sliding # Default from estimate.py + batch_size = self.batch_size + + # We will just slide over the test data + + n_samples = len(test_data) + output_values = torch.zeros(n_samples, device=device) + n_overlap = torch.zeros(n_samples, device=device) + + sigmoid = nn.Sigmoid().to(device) + + with torch.no_grad(): + # Pad test data if needed or just handle boundaries + # estimate.py handles divisions. We assume 1 continuous sequence. + + # We need to batch the sliding windows + indices = list( + range(0, n_samples - window_size + 1, window_sliding)) + + for i in range(0, len(indices), batch_size): + batch_indices = indices[i:i+batch_size] + x_batch = [] + for idx in batch_indices: + x_batch.append(test_data[idx:idx+window_size]) + + if not x_batch: + continue + + x_batch = torch.Tensor(np.stack(x_batch)).to(device) + # (batch, window_size) + y_batch = sigmoid(self.model(x_batch)).squeeze(-1) + + for j, idx in enumerate(batch_indices): + output_values[idx:idx+window_size] += y_batch[j] + n_overlap[idx:idx+window_size] += 1 + + n_overlap[n_overlap == 0] = 1 + self.anomaly_scores = (output_values / n_overlap).cpu().numpy() + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + + def get_result(self): + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/autoencoder.py b/solvers/autoencoder.py new file mode 100644 index 0000000..a36fad0 --- /dev/null +++ b/solvers/autoencoder.py @@ -0,0 +1,85 @@ +from benchopt import BaseSolver + +from sklearn.preprocessing import MinMaxScaler + +from benchmark_utils.models import Autoencoder +from benchmark_utils.predictions import cutoff_scores +from benchmark_utils.windowing import find_period_length + + +class Solver(BaseSolver): + name = "AE" + + install_cmd = "conda" + requirements = ["pytorch", "scikit-learn", "tqdm"] + + parameters = { + "window_size": [10, "auto"], + "num_epochs": [100], + "batch_size": [1024], + "learning_rate": [1e-3], + "hidden_size": [64], + "latent_size": [32], + "cutoff": [None], + } + + test_config = { + "window_size": 10, + "num_epochs": 1, + "batch_size": 8, + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, X_test): + if self.window_size == "auto": + self.window_size = find_period_length(X_train.reshape(-1)) + + # Data received has shape (n_recordings, n_features, n_samples) + n_features = X_train.shape[1] + self.X_train = X_train.reshape(-1, n_features) + self.X_test = X_test.reshape(-1, n_features) + + # For multivariate data, input_size = window_size * n_features + self.clf = Autoencoder( + input_size=self.window_size * n_features, + sliding_window=self.window_size, + latent_size=self.latent_size, + hidden_size=self.hidden_size, + ) + + def run(self, _): + self.clf.fit( + self.X_train, + num_epochs=self.num_epochs, + batch_size=self.batch_size, + learning_rate=float(self.learning_rate), + ) + + self.clf.predict(self.X_test) + anomaly_scores = self.clf.decision_scores_ + + self.anomaly_scores = ( + MinMaxScaler(feature_range=(0, 1)) + .fit_transform(anomaly_scores.reshape(-1, 1)) + .ravel() + ) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + + def skip(self, X_train, X_test): + """Check if the solver can be skipped.""" + if find_period_length(X_train.reshape(-1)) == 0 and ( + self.window_size == "auto" + ): + return True, "Window size is 0" + return False, None + + def get_result(self): + """Return the result of the solver.""" + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/dagmm.py b/solvers/dagmm.py new file mode 100644 index 0000000..148fbc3 --- /dev/null +++ b/solvers/dagmm.py @@ -0,0 +1,76 @@ +from benchopt import BaseSolver + +import pandas as pd +from merlion.models.anomaly.dagmm import DAGMM, DAGMMConfig +from merlion.utils.time_series import TimeSeries + +from benchmark_utils.predictions import cutoff_scores + + +class Solver(BaseSolver): + name = "DAGMM" + + install_cmd = "conda" + requirements = ["pip::salesforce-merlion", "pip::scikit-learn"] + + parameters = { + "gmm_k": [3], + "hidden_size": [256], + "sequence_len": [10], + "num_epochs": [10], + "lr": [1e-3], + "batch_size": [8192], + "lambda_energy": [0.1], + "lambda_cov": [0.005], + "cutoff": [None], + # "device": ["cuda:3"] + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, X_test): + n_features = X_train.shape[1] + self.X_train = X_train.transpose(0, 2, 1).reshape(-1, n_features) + self.X_test = X_test.transpose(0, 2, 1).reshape(-1, n_features) + # Convert to Merlion TimeSeries + # We use a default index since we don't have timestamps + train_df = pd.DataFrame(self.X_train) + test_df = pd.DataFrame(self.X_test) + + # Merlion expects a time index or it will generate one + self.train_data = TimeSeries.from_pd(train_df) + self.test_data = TimeSeries.from_pd(test_df) + + # Configure DAGMM + config = DAGMMConfig( + gmm_k=self.gmm_k, + hidden_size=self.hidden_size, + sequence_len=self.sequence_len, + num_epochs=self.num_epochs, + lr=self.lr, + batch_size=self.batch_size, + lambda_energy=self.lambda_energy, + lambda_cov=self.lambda_cov, + # device=self.device + ) + + self.model = DAGMM(config) + + def run(self, _): + # Train + self.model.train(self.train_data) + + # Predict + # get_anomaly_score returns a TimeSeries of scores + scores_ts = self.model.get_anomaly_score(self.test_data) + self.anomaly_scores = scores_ts.to_pd().values.flatten() + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + + def get_result(self): + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/dif.py b/solvers/dif.py deleted file mode 100644 index 6aeef8e..0000000 --- a/solvers/dif.py +++ /dev/null @@ -1,93 +0,0 @@ -# Deep Isolation Forest -from benchopt import BaseSolver -from benchopt import safe_import_context - -with safe_import_context() as import_ctx: - from pyod.models.dif import DIF - import numpy as np - - -class Solver(BaseSolver): - name = "DIF" - - install_cmd = "conda" - requirements = ["pip:pyod"] - - parameters = { - "contamination": [0.05, 0.1, 0.2], - "window": [True], - "window_size": [20], - "stride": [1], - } - - sampling_strategy = "run_once" - - def set_objective(self, X_train, y_test, X_test): - self.X_train = X_train - self.X_test, self.y_test = X_test, y_test - # Device is automatically selected by the model - # if device=None - self.clf = DIF(contamination=self.contamination, device=None) - - def run(self, _): - # Using only windowed data, parameter used only for consistency - if self.window: - - # Transofrming the data into rolling windowed data - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - self.X_train, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - self.X_test, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - # Flattening the data for the model - flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) - flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) - - self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat - ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score - ) - - def skip(self, X_train, X_test, y_test): - if X_train.shape[0] < self.window_size: - return True, "Not enough samples to create a window" - return False, None - - def get_result(self): - # Anomaly : 1 - # Inlier : 0 - # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) diff --git a/solvers/isolation-forest.py b/solvers/isolation-forest.py deleted file mode 100644 index dac03e3..0000000 --- a/solvers/isolation-forest.py +++ /dev/null @@ -1,90 +0,0 @@ -# Isolation Forest solver - -from benchopt import BaseSolver -from benchopt import safe_import_context - -with safe_import_context() as import_ctx: - from sklearn.ensemble import IsolationForest - import numpy as np - - -class Solver(BaseSolver): - name = "IsolationForest" - - install_cmd = "conda" - requirements = ["scikit-learn"] - - parameters = { - "contamination": [5e-4, 5e-3, 5e-2, 0.1, 0.2, 0.4, 0.5], - "window": [True], - "window_size": [60, 120, 180], - "stride": [1], - } - - sampling_strategy = "run_once" - - def set_objective(self, X_train, y_test, X_test): - self.X_train = X_train - self.X_test, self.y_test = X_test, y_test - self.clf = IsolationForest(contamination=self.contamination) - - def run(self, _): - if self.window: - # We need to transform the data to have a rolling window - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - self.X_train, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - self.X_test, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) - flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) - - self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat - ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score - ) - - def skip(self, X_train, X_test, y_test): - # Skip if dataset size is smaller than window size - if X_train.shape[0] < self.window_size: - return True, "Window size is larger than dataset size. Skipping." - return False, None - - def get_result(self): - # Anomaly : 1 - # Inlier : 0 - # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) diff --git a/solvers/abod.py b/solvers/legacy/abod.py similarity index 52% rename from solvers/abod.py rename to solvers/legacy/abod.py index 6ff02ae..13bca75 100644 --- a/solvers/abod.py +++ b/solvers/legacy/abod.py @@ -1,18 +1,18 @@ # ABOD solver from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from pyod.models.abod import ABOD - import numpy as np +from pyod.models.abod import ABOD +import numpy as np + +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): name = "ABOD" # Angle-Based Outlier Detection install_cmd = "conda" - requirements = ["pip:pyod"] + requirements = ["pip::pyod"] parameters = { "contamination": [5e-4, 0.1, 0.2, 0.3], @@ -20,13 +20,14 @@ class Solver(BaseSolver): "window": [True], "window_size": [20], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test self.clf = ABOD( n_neighbors=self.n_neighbors, contamination=self.contamination, @@ -48,45 +49,27 @@ def run(self, _): self.X_test, window_shape=self.window_size, axis=0 )[::self.stride].transpose(0, 2, 1) - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - # Flattening the data for the model flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) self.clf.fit(flatrain) - - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = self.clf.decision_function(flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) # Function used to skip a solver call when n_neighbors >= window_size - def skip(self, X_train, X_test, y_test): + def skip(self, X_train, X_test): if self.n_neighbors >= self.window_size: return True, "Number of neighbors greater than number of samples." return False, None @@ -95,5 +78,7 @@ def get_result(self): # Anomaly : 1 # Inlier : 0 # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/cblof.py b/solvers/legacy/cblof.py similarity index 51% rename from solvers/cblof.py rename to solvers/legacy/cblof.py index 3e44432..1e65a7b 100644 --- a/solvers/cblof.py +++ b/solvers/legacy/cblof.py @@ -1,18 +1,18 @@ # Cluster Based Local Outlier Factor (CBLOF) solver from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from pyod.models.cblof import CBLOF - import numpy as np +from pyod.models.cblof import CBLOF +import numpy as np + +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): name = "CBLOF" install_cmd = "conda" - requirements = ["pip:pyod"] + requirements = ["pip::pyod"] parameters = { "contamination": [5e-4, 0.01, 0.02, 0.03, 0.04], @@ -20,13 +20,14 @@ class Solver(BaseSolver): "n_clusters": [10], "window_size": [20], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test self.clf = CBLOF( contamination=self.contamination, n_clusters=self.n_clusters @@ -47,44 +48,27 @@ def run(self, _): self.X_test, window_shape=self.window_size, axis=0 )[::self.stride].transpose(0, 2, 1) - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - # Flattening the data for the model flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = self.clf.decision_function(flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) # Skipping the solver call if a condition is met - def skip(self, X_train, X_test, y_test): + def skip(self, X_train, X_test): if X_train.shape[0] < self.window_size: return True, "No enough samples to create a window" return False, None @@ -93,5 +77,7 @@ def get_result(self): # Anomaly : 1 # Inlier : 0 # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/dif.py b/solvers/legacy/dif.py new file mode 100644 index 0000000..36de441 --- /dev/null +++ b/solvers/legacy/dif.py @@ -0,0 +1,79 @@ +# Deep Isolation Forest +from benchopt import BaseSolver + +from pyod.models.dif import DIF +import numpy as np + +from benchmark_utils.predictions import cutoff_scores + + +class Solver(BaseSolver): + name = "DIF" + + install_cmd = "conda" + requirements = ["pip::pyod"] + + parameters = { + "contamination": [0.05, 0.1, 0.2], + "window": [True], + "window_size": [20], + "stride": [1], + "cutoff": [None], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, X_test): + self.X_train = X_train + self.X_test = X_test + # Device is automatically selected by the model + # if device=None + self.clf = DIF(contamination=self.contamination, device=None) + + def run(self, _): + # Using only windowed data, parameter used only for consistency + if self.window: + + # Transofrming the data into rolling windowed data + if self.X_train is not None: + self.Xw_train = np.lib.stride_tricks.sliding_window_view( + self.X_train, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + if self.X_test is not None: + self.Xw_test = np.lib.stride_tricks.sliding_window_view( + self.X_test, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + # Flattening the data for the model + flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) + flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) + + self.clf.fit(flatrain) + anomaly_scores = self.clf.decision_function(flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, + ) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + + def skip(self, X_train, X_test): + if X_train.shape[0] < self.window_size: + return True, "Not enough samples to create a window" + return False, None + + def get_result(self): + # Anomaly : 1 + # Inlier : 0 + # To ignore : -1 + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/isolation-forest.py b/solvers/legacy/isolation-forest.py new file mode 100644 index 0000000..83e8839 --- /dev/null +++ b/solvers/legacy/isolation-forest.py @@ -0,0 +1,106 @@ +# Isolation Forest solver + +from benchopt import BaseSolver + +from sklearn.ensemble import IsolationForest +import numpy as np + +from benchmark_utils.predictions import cutoff_scores + + +class Solver(BaseSolver): + name = "IsolationForest" + + install_cmd = "conda" + requirements = ["scikit-learn"] + + parameters = { + "contamination": [5e-4, 5e-3, 5e-2, 0.1, 0.2, 0.4, 0.5], + "window": [True], + "window_size": [60, 120, 180], + "stride": [1], + "cutoff": [None], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, X_test): + self.X_train = X_train + self.X_test = X_test + n_recordings, n_features, n_samples = self.X_train.shape + self.clf = IsolationForest(contamination=self.contamination) + + def run(self, _): + if self.window: + # We need to transform the data to have a rolling window + if self.X_train is not None: + # Apply sliding window along the time dimension (axis=2) + n_recordings, n_features, n_samples = self.X_train.shape + self.Xw_train = np.lib.stride_tricks.sliding_window_view( + self.X_train, window_shape=self.window_size, axis=2 + )[:, :, ::self.stride].transpose(0, 1, 3, 2) + + if self.X_test is not None: + n_recordings, n_features, n_samples = self.X_test.shape + self.Xw_test = np.lib.stride_tricks.sliding_window_view( + self.X_test, window_shape=self.window_size, axis=2 + )[:, :, ::self.stride].transpose(0, 1, 3, 2) + + # Flatten for sklearn + flatrain = self.Xw_train.reshape( + self.Xw_train.shape[0] * self.Xw_train.shape[1], -1) + flatest = self.Xw_test.reshape( + self.Xw_test.shape[0] * self.Xw_test.shape[1], -1) + + self.clf.fit(flatrain) + anomaly_scores = -self.clf.decision_function(flatest) + + # The results we get has a shape of + n_recordings, n_features, n_windows, _ = self.Xw_test.shape + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + self.anomaly_scores = self.anomaly_scores.reshape( + n_recordings, n_features, n_windows) + else: + # No windowing case + # Flatten the data for sklearn + n_recordings, n_features, n_samples = self.X_train.shape + X_train_flat = self.X_train.reshape(-1, n_features) + X_test_flat = self.X_test.reshape(-1, n_features) + + self.clf.fit(X_train_flat) + self.anomaly_scores = -self.clf.decision_function(X_test_flat) + + # Reshape to (n_recordings, n_samples) for single feature case + # We assume we take the first feature or average across features + self.anomaly_scores = self.anomaly_scores.reshape( + n_recordings, n_samples) + + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + + def skip(self, X_train, X_test): + # Skip if dataset size is smaller than window size + _, _, n_samples = X_train.shape + if n_samples < self.window_size: + return True, "Window size is larger than dataset size. Skipping." + return False, None + + def get_result(self): + # Anomaly : 1 + # Inlier : 0 + # To ignore : -1 + # For now, take the first recording + anomaly_scores = self.anomaly_scores[0] if ( + self.anomaly_scores.ndim > 1 + ) else self.anomaly_scores + result = dict(anomaly_scores=anomaly_scores) + if self.anomaly_predictions is not None: + anomaly_predictions = self.anomaly_predictions[0] if ( + self.anomaly_predictions.ndim > 1 + ) else self.anomaly_predictions + result["anomaly_predictions"] = anomaly_predictions + return result diff --git a/solvers/lof.py b/solvers/legacy/lof.py similarity index 53% rename from solvers/lof.py rename to solvers/legacy/lof.py index 1ce2058..9075caa 100644 --- a/solvers/lof.py +++ b/solvers/legacy/lof.py @@ -1,11 +1,11 @@ # Local Outlier Factor from benchopt import BaseSolver -from benchopt import safe_import_context -with safe_import_context() as import_ctx: - from sklearn.neighbors import LocalOutlierFactor - import numpy as np +from sklearn.neighbors import LocalOutlierFactor +import numpy as np + +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): @@ -20,13 +20,14 @@ class Solver(BaseSolver): "window": [True], "window_size": [20], "stride": [1], + "cutoff": [None], } sampling_strategy = "run_once" - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test self.clf = LocalOutlierFactor( novelty=True, n_neighbors=self.n_neighbors, @@ -46,42 +47,25 @@ def run(self, _): self.X_test, window_shape=self.window_size, axis=0 )[::self.stride].transpose(0, 2, 1) - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) self.clf.fit(flatrain) - raw_y_hat = self.clf.predict(flatest) - raw_anomaly_score = self.clf.decision_function(flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat + anomaly_scores = -self.clf.decision_function(flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) - def skip(self, X_train, y_test, X_test): + def skip(self, X_train, X_test): if self.n_neighbors > self.window_size: return True, "Number of neighbors greater than number of samples." if self.n_neighbors > X_train.shape[0]: @@ -94,5 +78,7 @@ def get_result(self): # Anomaly : 1 # Inlier : 0 # To ignore : -1 - self.y_hat = self.raw_y_hat - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/legacy/ocsvm.py b/solvers/legacy/ocsvm.py new file mode 100644 index 0000000..1813763 --- /dev/null +++ b/solvers/legacy/ocsvm.py @@ -0,0 +1,76 @@ +from benchopt import BaseSolver + +from sklearn.svm import OneClassSVM +import numpy as np + +from benchmark_utils.predictions import cutoff_scores + + +class Solver(BaseSolver): + name = "OCSVM" + + install_cmd = "conda" + requirements = ["scikit-learn"] + + parameters = { + "nu": [0.001, 0.01, 0.05], + "gamma": [1e-5, 1e-2], + "kernel": ["rbf"], + "window": [True], + "window_size": [128], + "stride": [1], + "cutoff": [None], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, X_test): + self.X_train = X_train + self.X_test = X_test + self.clf = OneClassSVM( + nu=self.nu, + kernel=self.kernel, + gamma=self.gamma, + ) + + if self.window: + if self.X_train is not None: + self.Xw_train = np.lib.stride_tricks.sliding_window_view( + self.X_train, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + if self.X_test is not None: + self.Xw_test = np.lib.stride_tricks.sliding_window_view( + self.X_test, window_shape=self.window_size, axis=0 + )[::self.stride].transpose(0, 2, 1) + + self.flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) + self.flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) + + def run(self, _): + if self.window: + self.clf.fit(self.flatrain) + anomaly_scores = -self.clf.decision_function(self.flatest) + + # Anomaly scores + self.anomaly_scores = np.array(anomaly_scores) + padding = max(self.X_test.shape[0] - len(self.anomaly_scores), 0) + self.anomaly_scores = np.append( + np.full(padding, np.nan), + self.anomaly_scores, + ) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + + def skip(self, X_train, X_test): + if X_train.shape[0] < self.window_size: + return True, "Window size is larger than dataset size." + return False, None + + def get_result(self): + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/lstm.py b/solvers/lstm.py index b3e128f..ff4a975 100644 --- a/solvers/lstm.py +++ b/solvers/lstm.py @@ -1,21 +1,23 @@ # LSTM Autoencoder -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - import torch - import torch.nn as nn - import torch.optim as optim - import numpy as np - from torch.utils.data import DataLoader - from tqdm import tqdm - from benchmark_utils.models import AutoEncoderLSTM +import torch +import torch.nn as nn +import torch.optim as optim +import numpy as np +from torch.utils.data import DataLoader +from tqdm import tqdm +from benchmark_utils.models import AutoEncoderLSTM +from benchmark_utils.windowing import make_windowed_dataset +from benchmark_utils.windowing import reconstruct_from_windows +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): name = "LSTM" install_cmd = "conda" - requirements = ["pip:torch", "tqdm"] + requirements = ["pytorch", "tqdm"] sampling_strategy = "run_once" @@ -24,28 +26,28 @@ class Solver(BaseSolver): "batch_size": [32], "n_epochs": [50], "lr": [1e-5], - "window": [True], "window_size": [256], # window_size = seq_len "stride": [1], - "percentile": [97], + "cutoff": [None], "encoder_layers": [32], "decoder_layers": [32], } - def prepare_data(self, *data): - # return tensors on device - return (torch.tensor( - d, dtype=torch.float32, device=self.device) - for d in data) + test_config = { + "embedding_dim": 2, + "batch_size": 1, + "n_epochs": 1, + "window_size": 16, + } - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test self.n_features = X_train.shape[1] self.seq_len = self.window_size @@ -59,33 +61,15 @@ def set_objective(self, X_train, y_test, X_test): self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr) self.criterion = nn.MSELoss() - if self.window: - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - self.X_train, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - self.Xw_train = torch.tensor( - self.Xw_train, dtype=torch.float32 - ) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - self.X_test, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - self.Xw_test = torch.tensor( - self.Xw_test, dtype=torch.float32 - ) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] + self.Xw_train = make_windowed_dataset( + self.X_train, window_size=self.window_size, + stride=self.stride + ) - self.yw_test = torch.tensor( - self.yw_test, dtype=torch.float32 - ) + self.Xw_test = make_windowed_dataset( + self.X_test, window_size=self.window_size, + stride=self.stride + ) self.train_loader = DataLoader( self.Xw_train, batch_size=self.batch_size, shuffle=True, @@ -105,7 +89,7 @@ def run(self, _): for epoch in ti: self.model.train() train_loss = 0 - for i, x in enumerate(self.train_loader): + for x, in self.train_loader: x = x.to(self.device) @@ -120,38 +104,38 @@ def run(self, _): ti.set_postfix(train_loss=f"{train_loss:.5f}") - # Saving the model - torch.save(self.model.state_dict(), "model.pth") - # Test loop self.model.eval() raw_reconstruction = [] - for x in self.test_loader: + for x, in self.test_loader: x = x.to(self.device) - - x_hat = self.model(x) + with torch.no_grad(): + x_hat = self.model(x) raw_reconstruction.append(x_hat.detach().cpu().numpy()) - - raw_reconstruction = np.concatenate(raw_reconstruction, axis=0) - - reconstructed_data = np.concatenate( - [raw_reconstruction[0], raw_reconstruction[1:, -1, :]], axis=0 + reconstructed_data = np.concatenate(raw_reconstruction, axis=0) + reconstructed_data = reconstruct_from_windows( + reconstructed_data, stride=self.stride, + batch=len(self.X_test), n_features=self.n_features ) reconstruction_err = np.mean( np.abs(self.X_test - reconstructed_data), axis=1 ) + self.anomaly_scores = reconstruction_err - self.y_hat = np.where( - reconstruction_err > np.percentile( - reconstruction_err, self.percentile), 1, 0 + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, ) - def skip(self, X_train, X_test, y_test): - if X_train.shape[0] < self.window_size: + def skip(self, X_train, X_test): + if X_train.shape[-1] < self.window_size: return True, "Not enough samples to create a window." return False, None def get_result(self): - return dict(y_hat=self.y_hat) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/matrixprofile.py b/solvers/matrixprofile.py new file mode 100644 index 0000000..7b91d8d --- /dev/null +++ b/solvers/matrixprofile.py @@ -0,0 +1,74 @@ +from benchopt import BaseSolver +from sklearn.preprocessing import MinMaxScaler + +from benchmark_utils.predictions import cutoff_scores +from benchmark_utils.windowing import find_period_length +from TSB_AD.models.MatrixProfile import MatrixProfile + + +class Solver(BaseSolver): + name = "MP" + + install_cmd = "conda" + requirements = ["pip::tsb-ad", "scikit-learn"] + + parameters = { + "window_size": [128, "auto"], + "cutoff": [None], + } + + test_config = { + "dataset": { + "n_features": 1, + }, + "window_size": 8, + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, X_test): + # Shapes received: (n_recordings, n_features, n_samples) + self.X_train = X_train + self.X_test = X_test + + n_features = X_train.shape[1] + + self.X_train = self.X_train.reshape(-1, n_features) + self.X_test = self.X_test.reshape(-1, n_features) + + if self.window_size == "auto": + self.window_size = int(find_period_length(X_train.reshape(-1))) + + self.clf = MatrixProfile( + window=self.window_size, + ) + + def run(self, _): + # Special solver, fitting on X_test + self.clf.fit(self.X_test.reshape(-1)) + anomaly_scores = self.clf.decision_scores_ + self.anomaly_scores = ( + MinMaxScaler(feature_range=(0, 1)) + .fit_transform(anomaly_scores.reshape(-1, 1)) + .ravel() + ) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + + def skip(self, X_train, X_test): + """Check if the solver can be skipped.""" + if (find_period_length(X_train.reshape(-1)) == 0) and ( + self.window_size == "auto"): + return True, "Window size is 0" + if X_train.shape[1] != 1: + return True, "Matrix Profile only supports univariate data" + return False, None + + def get_result(self): + """Return the result of the solver.""" + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/ocsvm.py b/solvers/ocsvm.py deleted file mode 100644 index 268e57c..0000000 --- a/solvers/ocsvm.py +++ /dev/null @@ -1,88 +0,0 @@ -from benchopt import BaseSolver, safe_import_context - -with safe_import_context() as import_ctx: - from sklearn.svm import OneClassSVM - import numpy as np - - -class Solver(BaseSolver): - name = "OCSVM" - - install_cmd = "conda" - requirements = ["scikit-learn"] - - parameters = { - "nu": [0.001, 0.01, 0.05], - "gamma": [1e-5, 1e-2], - "kernel": ["rbf"], - "window": [True], - "window_size": [128], - "stride": [1], - } - - sampling_strategy = "run_once" - - def set_objective(self, X_train, y_test, X_test): - self.X_train = X_train - self.X_test, self.y_test = X_test, y_test - self.clf = OneClassSVM( - nu=self.nu, - kernel=self.kernel, - gamma=self.gamma, - ) - - if self.window: - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - self.X_train, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - self.X_test, window_shape=self.window_size, axis=0 - )[::self.stride].transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - self.flatrain = self.Xw_train.reshape(self.Xw_train.shape[0], -1) - self.flatest = self.Xw_test.reshape(self.Xw_test.shape[0], -1) - - def run(self, _): - if self.window: - self.clf.fit(self.flatrain) - raw_y_hat = self.clf.predict(self.flatest) - raw_anomaly_score = self.clf.decision_function(self.flatest) - - # The results we get has a shape of - result_shape = ( - (self.X_train.shape[0] - self.window_size) // self.stride - ) + 1 - - # Mapping the binary output from {-1, 1} to {1, 0} - # For consistency with the other solvers - self.raw_y_hat = np.array(raw_y_hat) - - # Adding -1 for the non predicted samples - # The first window_size samples are not predicted by the model - self.raw_y_hat = np.where(self.raw_y_hat == -1, 1, 0) - self.raw_y_hat = np.append( - np.full(self.X_train.shape[0] - - result_shape, -1), self.raw_y_hat - ) - - # Anomaly scores (Not used but allows finer thresholding) - self.raw_anomaly_score = np.array(raw_anomaly_score) - self.raw_anomaly_score = np.append( - np.full(result_shape, -1), self.raw_anomaly_score - ) - - def skip(self, X_train, X_test, y_test): - if X_train.shape[0] < self.window_size: - return True, "Window size is larger than dataset size." - return False, None - - def get_result(self): - return dict(y_hat=self.raw_y_hat) diff --git a/solvers/rosecdl.py b/solvers/rosecdl.py new file mode 100644 index 0000000..6ccf54c --- /dev/null +++ b/solvers/rosecdl.py @@ -0,0 +1,100 @@ +from benchopt import BaseSolver + +import torch +from benchmark_utils.predictions import cutoff_scores +from benchmark_utils.windowing import find_period_length +from rosecdl.rosecdl import RoseCDL + + +class Solver(BaseSolver): + name = "RoseCDL" + + install_cmd = "conda" + requirements = [ + "pytorch", "pip::git+https://github.com/tommoral/rosecdl.git" + ] + + parameters = { + "n_components": [1], + "kernel_size": ["auto"], + "lmbd": [0.8], + "scale_lmbd": [False], + "epochs": [70], + "max_batch": [None], + "mini_batch_size": [600], + "sample_window": [1_000], + "optimizer": ["linesearch"], + "n_iterations": [90], + "window": [False], + "outliers_kwargs": [ + { + "method": "mad", + "alpha": 3.5, + "moving_average": None, + "union_channels": True, + "opening_window": True, + }, + ], + "plot": [False], + "cutoff": [None], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, X_test): + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") + + # We receive data in shape (n_recordings, n_features, n_samples) + self.X_train = torch.tensor( + X_train, dtype=torch.float32, device=self.device) + self.X_test = X_test + + if self.kernel_size == "auto": + self.kernel_size = int(find_period_length(X_train.reshape(-1))) + + self.clf = RoseCDL( + n_components=self.n_components, + n_channels=X_train.shape[1], + kernel_size=self.kernel_size, + lmbd=self.lmbd, + scale_lmbd=self.scale_lmbd, + epochs=self.epochs, + max_batch=self.max_batch, + mini_batch_size=self.mini_batch_size, + sample_window=self.sample_window, + optimizer=self.optimizer, + n_iterations=self.n_iterations, + window=self.window, + device=self.device, + outliers_kwargs=self.outliers_kwargs, + ) + + def run(self, _): + self.clf.fit(self.X_train) + del self.X_train # Free GPU memory for X_train after fitting + + xh, zh = self.clf.csc( + torch.tensor(self.X_test, dtype=torch.float32, device=self.device) + ) + err = self.clf.loss_fn.compute_patch_error( + X_hat=xh, + z_hat=zh, + X=torch.tensor(self.X_test, dtype=torch.float32, + device=self.device), + ) + err = err.cpu().detach().numpy() + # Aggregate errors over channels + self.anomaly_scores = err.sum(axis=1).reshape(-1) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + del self.clf # Free GPU memory for the model + torch.cuda.empty_cache() # Release cached GPU memory + + def get_result(self): + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/sktime_lof.py b/solvers/sktime_lof.py deleted file mode 100644 index 31e1c94..0000000 --- a/solvers/sktime_lof.py +++ /dev/null @@ -1,56 +0,0 @@ -from benchopt import BaseSolver -from benchopt import safe_import_context - -with safe_import_context() as import_ctx: - from sktime.annotation.lof import SubLOF - import pandas as pd - import numpy as np - - -class Solver(BaseSolver): - name = "SubLOF" - - install_cmd = "conda" - requirements = ["sktime", "pandas"] - - parameters = { - "n_neighbors": [5, 10, 20, 25, 40], - "window_size": [20, 64, 128], - "leaf_size": [30, 40], - "contamination": ["auto", 0.1, 0.2, 0.3], - } - - sampling_strategy = "run_once" - - def set_objective(self, X_train, y_test, X_test): - self.X_train = pd.DataFrame(X_train) - self.X_test, self.y_test = pd.DataFrame(X_test), y_test - self.clf = SubLOF( - n_neighbors=self.n_neighbors, - window_size=self.window_size, - leaf_size=self.leaf_size, - contamination=self.contamination, - n_jobs=-1, - novelty=True, - ) - - def run(self, _): - self.clf.fit(self.X_train) - self.raw_y_hat = self.clf.predict(self.X_test) - # self.raw_anomaly_score = self.clf.predict_score(self.X_test) - - def skip(self, X_train, y_test, X_test): - if self.n_neighbors > self.window_size: - return True, "Number of neighbors greater than window size" - if self.n_neighbors > X_train.shape[0]: - return True, "Number of neighbors greater than number of samples" - if self.leaf_size > X_train.shape[0]: - return True, "Leaf size greater than number of samples" - if self.window_size > X_train.shape[0]: - return True, "Window size greater than number of samples" - return False, None - - def get_result(self): - self.y_hat = np.zeros(self.X_test.shape[0]) - self.y_hat[self.raw_y_hat] = 1 - return dict(y_hat=self.y_hat) diff --git a/solvers/tsb_chronos.py b/solvers/tsb_chronos.py new file mode 100644 index 0000000..855c842 --- /dev/null +++ b/solvers/tsb_chronos.py @@ -0,0 +1,59 @@ +from benchopt import BaseSolver + +import torch +import numpy as np +from TSB_AD.models.Chronos import Chronos +from TSB_AD.utils.slidingWindows import find_length + +from benchmark_utils.predictions import cutoff_scores + + +class Solver(BaseSolver): + name = "TSB-Chronos" + + install_cmd = "conda" + requirements = ["pip::tsb-ad"] + + parameters = { + "win_size": ["auto"], + "prediction_length": [1], + "model_size": ['base'], + "batch_size": [32], + "cutoff": [None], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, X_test): + _, n_features, _ = X_train.shape + self.data = np.append(X_train, X_test, axis=2) + self.data = self.data.reshape(-1, n_features) + self.X_test = X_test.reshape(-1, n_features) + + if self.win_size == "auto": + self.win_size = int(find_length(X_train.reshape(-1))) + + self.clf = Chronos( + win_size=self.win_size, + input_c=n_features, + prediction_length=self.prediction_length, + model_size=self.model_size, + batch_size=self.batch_size, + ) + + def run(self, _): + self.clf.fit(self.data) + self.anomaly_scores = self.clf.decision_scores_[-len(self.X_test):] + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + + del self.clf # Free memory for the model + torch.cuda.empty_cache() # Release cached GPU memory + + def get_result(self): + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/tsb_timesfm.py b/solvers/tsb_timesfm.py new file mode 100644 index 0000000..cfa591b --- /dev/null +++ b/solvers/tsb_timesfm.py @@ -0,0 +1,52 @@ +from benchopt import BaseSolver + +from importlib.util import find_spec + +import numpy as np +import torch +from TSB_AD.model_wrapper import run_TimesFM + +from benchmark_utils.predictions import cutoff_scores + + +class Solver(BaseSolver): + name = "TSB-TimesFM" + + install_cmd = "conda" + requirements = ["pip::tsb-ad", "pip::timesfm"] + + parameters = { + "win_size": [256], + "cutoff": [None], + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, X_test): + _, n_features, _ = X_train.shape + self.data = np.append(X_train, X_test, axis=2) + self.data = self.data.reshape(-1, n_features) + self.X_test = X_test.reshape(-1, n_features) + + def skip(self, X_train, X_test): + if find_spec("timesfm") is None: + return True, "TSB-TimesFM requires the optional timesfm package." + return False, None + + def run(self, _): + anomaly_scores = run_TimesFM( + data=self.data, + win_size=self.win_size, + ) + self.anomaly_scores = anomaly_scores[-len(self.X_test):] + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + torch.cuda.empty_cache() # Release cached GPU memory + + def get_result(self): + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/tsb_timesnet.py b/solvers/tsb_timesnet.py new file mode 100644 index 0000000..ed431ae --- /dev/null +++ b/solvers/tsb_timesnet.py @@ -0,0 +1,76 @@ +from benchopt import BaseSolver + +import torch +from TSB_AD.models.TimesNet import TimesNet + +from benchmark_utils.predictions import cutoff_scores + + +class Solver(BaseSolver): + name = "TSB-TimesNet" + + install_cmd = "conda" + requirements = ["pip::tsb-ad"] + + parameters = { + "window_size": [256], + "lr": [1e-4], + "epochs": [10], + "batch_size": [128], + "cutoff": [None], + } + + test_config = { + "dataset": { + "n_samples": 512, + "n_features": 2, + "n_anomaly": 32, + }, + "window_size": 32, + "epochs": 1, + "batch_size": 16, + } + + sampling_strategy = "run_once" + + def set_objective(self, X_train, X_test): + _, n_features, _ = X_train.shape + self.X_train = X_train.reshape(-1, n_features) + self.X_test = X_test.reshape(-1, n_features) + + self.clf = TimesNet( + win_size=self.window_size, + enc_in=n_features, + epochs=self.epochs, + batch_size=self.batch_size, + lr=self.lr, + patience=3, + features="M", + lradj="type1", + validation_size=0.2, + ) + + def run(self, _): + self.clf.fit(self.X_train) + self.anomaly_scores = self.clf.decision_function(self.X_test) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) + + del self.clf.model + del self.clf + torch.cuda.empty_cache() # Release cached GPU memory + + def skip(self, X_train, X_test): + if X_train.shape[-1] < self.window_size: + return True, "Not enough training samples to create a window." + if X_test.shape[-1] < self.window_size: + return True, "Not enough testing samples to create a window." + return False, None + + def get_result(self): + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/vae.py b/solvers/vae.py index 085f9af..9dfd400 100644 --- a/solvers/vae.py +++ b/solvers/vae.py @@ -1,81 +1,82 @@ -from benchopt import BaseSolver, safe_import_context +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - from pyod.models.vae import VAE - import numpy as np - import torch +import torch +from pyod.models.vae import VAE + +from benchmark_utils.predictions import cutoff_scores +from benchmark_utils.windowing import make_windows class Solver(BaseSolver): name = "VAE" install_cmd = "conda" - requirements = ["pyod", "tqdm", "pip:torch"] + requirements = ["pyod", "pytorch"] sampling_strategy = "run_once" parameters = { "contamination": [0.005, 0.05, 0.1, 0.2], "n_epochs": [50], - "window": [False], "window_size": [256], "horizon": [0], "stride": [1], "batch_size": [128], - "preprocessing": [True, False], + "preprocessing": [True], "latent_dim": [2, 5, 10], - "batch_norm": [True, False], + "batch_norm": [True], "dropout_rate": [0.1, 0.2, 0.5], + "cutoff": [None], + } + test_config = { + "n_epochs": 1, + "window_size": 16, } - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) self.X_train = X_train - self.X_test, self.y_test = X_test, y_test - - self.clf = VAE(contamination=self.contamination, - preprocessing=self.preprocessing, - batch_size=self.batch_size, - epoch_num=self.n_epochs, - device=self.device, - latent_dim=self.latent_dim, - batch_norm=self.batch_norm, - dropout_rate=self.dropout_rate, - ) - - if self.window: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - X_train, - window_shape=self.window_size+self.horizon, - axis=0 - ).transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - X_test, - window_shape=self.window_size+self.horizon, - axis=0 - ).transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - self.yw_test = torch.tensor( - self.yw_test, dtype=torch.float32 - ) - else: - self.Xw_train = X_train - self.Xw_test = X_test + self.X_test = X_test + + self.Xw_train = make_windows( + X_train, + window_size=self.window_size, + stride=self.stride + ).reshape(-1, self.window_size * X_train.shape[1]) + + self.Xw_test = make_windows( + X_test, + window_size=self.window_size+self.horizon, + stride=self.stride, + padding=True + ).reshape(-1, self.window_size * X_train.shape[1]) + + self.clf = VAE( + contamination=self.contamination, + preprocessing=self.preprocessing, + batch_size=min(self.batch_size, len(self.Xw_train)), + epoch_num=self.n_epochs, + device=self.device, + latent_dim=self.latent_dim, + batch_norm=self.batch_norm, + dropout_rate=self.dropout_rate, + lr=1e-5 + ) def run(self, _): self.clf.fit(self.Xw_train) - self.y_pred = self.clf.predict(self.Xw_test) + self.anomaly_scores = self.clf.decision_function(self.Xw_test) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) def get_result(self): - return dict(y_hat=self.y_pred) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/solvers/vanilla-transformer.py b/solvers/vanilla-transformer.py index 677cfca..11f91dd 100644 --- a/solvers/vanilla-transformer.py +++ b/solvers/vanilla-transformer.py @@ -1,21 +1,24 @@ # Vanilla Transformer -from benchopt import BaseSolver, safe_import_context -from benchmark_utils import mean_overlaping_pred +from benchopt import BaseSolver -with safe_import_context() as import_ctx: - import torch - import torch.nn as nn - import torch.optim as optim - import numpy as np - from tqdm import tqdm - from benchmark_utils.models import TransformerModel +import numpy as np +from tqdm import tqdm +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader + +from benchmark_utils.models import TransformerModel +from benchmark_utils.windowing import make_windowed_dataset +from benchmark_utils.windowing import reconstruct_from_windows +from benchmark_utils.predictions import cutoff_scores class Solver(BaseSolver): name = "Transformer" install_cmd = "conda" - requirements = ["pip:torch", "tqdm"] + requirements = ["pytorch", "tqdm"] sampling_strategy = "run_once" @@ -27,20 +30,23 @@ class Solver(BaseSolver): "n_epochs": [50], "lr": [1e-5], "horizon": [1], - "window": [True], "window_size": [256], "stride": [1], - "percentile": [97], + "cutoff": [None], + } + test_config = { + "n_epochs": 1, + "window_size": 16, } - def set_objective(self, X_train, y_test, X_test): + def set_objective(self, X_train, X_test): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) self.X_train = X_train - self.X_test, self.y_test = X_test, y_test + self.X_test = X_test self.model = TransformerModel( n_features=X_train.shape[1], @@ -57,30 +63,22 @@ def set_objective(self, X_train, y_test, X_test): self.optimizer, mode='min', factor=0.5, patience=5 ) - # Using only windowed data, parameter used only for consistency - if self.window: - if self.X_train is not None: - self.Xw_train = np.lib.stride_tricks.sliding_window_view( - X_train, - window_shape=self.window_size+self.horizon, - axis=0 - ).transpose(0, 2, 1) - - if self.X_test is not None: - self.Xw_test = np.lib.stride_tricks.sliding_window_view( - X_test, - window_shape=self.window_size+self.horizon, - axis=0 - ).transpose(0, 2, 1) - - if self.y_test is not None: - self.yw_test = np.lib.stride_tricks.sliding_window_view( - self.y_test, window_shape=self.window_size, axis=0 - )[::self.stride] - - self.yw_test = torch.tensor( - self.yw_test, dtype=torch.float32 - ) + self.Xw_train = make_windowed_dataset( + X_train, + window_size=self.window_size+self.horizon, + stride=self.stride + ) + self.Xw_test = make_windowed_dataset( + X_test, + window_size=self.window_size+self.horizon, + stride=self.stride + ) + self.train_loader = DataLoader( + self.Xw_train, batch_size=self.batch_size, shuffle=True, + ) + self.test_loader = DataLoader( + self.Xw_test, batch_size=self.batch_size, shuffle=False, + ) def run(self, _): self.model.to(self.device) @@ -96,13 +94,10 @@ def run(self, _): for epoch in ti: self.model.train() total_loss = 0 - for i in range(0, len(self.Xw_train), self.batch_size): - x = torch.tensor( - self.Xw_train[i:i+self.batch_size, :self.window_size, :], - dtype=torch.float32).to(self.device) - y = torch.tensor( - self.Xw_train[i:i+self.batch_size, -self.horizon:, :], - dtype=torch.float32).to(self.device) + for x, in self.train_loader: + x = x.to(self.device) + y = x[:, -self.horizon:] + x = x[:, :-self.horizon] self.optimizer.zero_grad() output = self.model(x) @@ -117,7 +112,9 @@ def run(self, _): total_loss += loss.item() avg_loss = total_loss / (len(self.Xw_train) // self.batch_size) - ti.set_description(f"Epoch {epoch} (loss={avg_loss:.5e})") + ti.set_description( + f"Epoch {epoch} (loss={avg_loss:.5e})" + ) # Learning rate scheduling self.scheduler.step(avg_loss) @@ -126,7 +123,6 @@ def run(self, _): if avg_loss < best_loss: best_loss = avg_loss no_improve = 0 - torch.save(self.model.state_dict(), 'best_model.pth') else: no_improve += 1 if no_improve == patience: @@ -134,52 +130,48 @@ def run(self, _): # Test loop self.model.eval() - batch_size = 1024 all_predictions = [] with torch.no_grad(): - for i in range(0, len(self.Xw_test), batch_size): - batch = torch.tensor( - self.Xw_test[i:i+batch_size, :self.window_size, :], - dtype=torch.float32 - ).to(self.device) - - batch_predictions = self.model(batch) - - if batch_predictions.is_cuda: - batch_predictions = batch_predictions.cpu().numpy() - else: - batch_predictions = batch_predictions.numpy() - - all_predictions.append(batch_predictions) + for x, in self.test_loader: + batch = x[:, :self.window_size].to(self.device) + with torch.no_grad(): + batch_predictions = self.model(batch) + all_predictions.append(batch_predictions.cpu().numpy()) xw_hat = np.concatenate(all_predictions, axis=0) # Continue with the rest of your code for reconstructing predictions x_hat = np.zeros_like(self.X_test) - 1 - x_hat[self.window_size:self.window_size+self.horizon] = xw_hat[0] - x_hat[self.window_size+self.horizon:] = mean_overlaping_pred( - xw_hat, 1) - - # Calculating the percentile value for the threshold - percentile_value = np.percentile( - np.abs(self.X_test[self.window_size:] - x_hat[self.window_size:]), - self.percentile + x_hat[..., self.window_size:] = reconstruct_from_windows( + xw_hat, stride=self.stride, batch=len(self.X_test), + n_features=self.X_test.shape[1] ) - # Thresholding - predictions = np.zeros_like(x_hat)-1 - predictions[self.window_size:] = np.where( - np.abs(self.X_test[self.window_size:] - - x_hat[self.window_size:]) > percentile_value, 1, 0 + reconstruction_err = np.abs( + self.X_test[..., self.window_size:] - x_hat[..., self.window_size:] + ) + self.anomaly_scores = np.full( + self.X_test.shape[:1] + self.X_test.shape[2:], + np.nan, + dtype=float, + ) + self.anomaly_scores[..., self.window_size:] = np.max( + reconstruction_err, axis=1 ) - self.predictions = np.max(predictions, axis=1) + self.anomaly_predictions = cutoff_scores( + self.anomaly_scores, + cutoff=self.cutoff, + ) - def skip(self, X_train, X_test, y_test): - if X_train.shape[0] < self.window_size + self.horizon: + def skip(self, X_train, X_test): + if X_train.shape[-1] < self.window_size + self.horizon: return True, "No enough training samples" return False, None def get_result(self): - return dict(y_hat=self.predictions) + result = dict(anomaly_scores=self.anomaly_scores) + if self.anomaly_predictions is not None: + result["anomaly_predictions"] = self.anomaly_predictions + return result diff --git a/test_config.py b/test_config.py index 92e34d5..3606a74 100644 --- a/test_config.py +++ b/test_config.py @@ -1,21 +1,41 @@ import sys # noqa: F401 +from importlib.util import find_spec import pytest # noqa: F401 from benchopt.utils.sys_info import get_cuda_version -def check_test_solver_install(solver_class): +OPTIONAL_BACKEND_INSTALL_XFAILS = { + "dagmm": "DAGMM depends on the optional salesforce-merlion package.", + "mp": "MP depends on the optional TSB-AD package.", + "rosecdl": "RoseCDL depends on an optional GitHub package.", + "tsb-chronos": "TSB-Chronos depends on the optional TSB-AD backend.", + "tsb-timesfm": "TSB-TimesFM depends on TSB-AD and timesfm.", + "tsb-timesnet": "TSB-TimesNet depends on the optional TSB-AD backend.", +} + + +def check_test_solver_install(benchmark, solver_class): """Hook called in `test_solver_install`. If one solver needs to be skip/xfailed on some particular architecture, call pytest.xfail when detecting the situation. """ - if solver_class.name.lower() == "dif": + solver_name = solver_class.name.lower() + + if solver_name in OPTIONAL_BACKEND_INSTALL_XFAILS: + pytest.xfail(OPTIONAL_BACKEND_INSTALL_XFAILS[solver_name]) + + if solver_name == "dif": if get_cuda_version() is None: pytest.xfail("Deep IsolationForest needs a working GPU hardware.") + if solver_name == "anomalybert": + pytest.xfail("AnomalyBERT needs to be installed locally from repo" + " at https://github.com/Jhryu30/AnomalyBERT.git") + # if solver_class.name.lower() == "lstm": # if get_cuda_version() is None: # pytest.xfail("LSTM needs a working GPU hardware.") @@ -23,3 +43,22 @@ def check_test_solver_install(solver_class): # if solver_class.name.lower() == "transformer": # if get_cuda_version() is None: # pytest.xfail("Transformer needs a working GPU hardware.") + + +def check_test_solver_run(benchmark, solver_class): + """Hook called in `test_solver_run`.""" + if solver_class.name.lower() == "tsb-timesfm": + if find_spec("timesfm") is None: + pytest.xfail( + "TSB-TimesFM needs the optional timesfm package." + ) + + +def check_test_dataset_get_data(benchmark, dataset_class): + if dataset_class.name.lower() in [ + "daphnet", "dodgers", "ecg", "genesis", "ghl", + "iops", "kdd21", "mgab", "mitdb", "nab", + "occupancy", "opportunity", "sensorscope", "smd", + "svdb", "yahoo" + ]: + pytest.xfail(f"{dataset_class.name} dataset is not downloaded.") diff --git a/tests/test_mean_overlaping_pred.py b/tests/test_mean_overlaping_pred.py new file mode 100644 index 0000000..4189fa4 --- /dev/null +++ b/tests/test_mean_overlaping_pred.py @@ -0,0 +1,47 @@ +import numpy as np + +from benchmark_utils import mean_overlaping_pred + + +def test_length_horizon_one_stride_one(): + # 5 windows, horizon=1, stride=1 → reconstructed signal length is 5 + preds = np.arange(5).reshape(5, 1, 1).astype(float) + out = mean_overlaping_pred(preds, stride=1) + assert out.shape == (5, 1) + assert np.allclose(out.ravel(), np.arange(5)) + + +def test_length_horizon_gt_one(): + # 4 windows, H=3, stride=1 → (4-1)*1 + 3 = 6 positions + preds = np.ones((4, 3, 2)) + out = mean_overlaping_pred(preds, stride=1) + assert out.shape == (6, 2) + # every position covered, averaged value is 1.0 + assert np.allclose(out, 1.0) + + +def test_overlap_averages_correctly(): + # H=2, stride=1, 3 windows. Index 1 is covered by windows 0 and 1, + # index 2 by windows 1 and 2. + preds = np.array( + [[[1.0], [2.0]], + [[3.0], [4.0]], + [[5.0], [6.0]]] + ) + out = mean_overlaping_pred(preds, stride=1) + # positions: 0 -> 1, 1 -> mean(2, 3) = 2.5, 2 -> mean(4, 5) = 4.5, 3 -> 6 + assert out.shape == (4, 1) + assert np.allclose(out.ravel(), [1.0, 2.5, 4.5, 6.0]) + + +def test_stride_gt_one_no_overlap(): + # H=2, stride=2 → windows tile end-to-end + preds = np.array( + [[[1.0], [2.0]], + [[3.0], [4.0]], + [[5.0], [6.0]]] + ) + out = mean_overlaping_pred(preds, stride=2) + # (3-1)*2 + 2 = 6 positions, no overlap + assert out.shape == (6, 1) + assert np.allclose(out.ravel(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) diff --git a/tests/test_objective.py b/tests/test_objective.py new file mode 100644 index 0000000..6a50afc --- /dev/null +++ b/tests/test_objective.py @@ -0,0 +1,110 @@ +import numpy as np +import pytest + +from objective import Objective + + +def make_objective(score_metrics=("auc_pr", "auc_roc"), + prediction_metrics=None): + objective = Objective() + objective.score_metrics = score_metrics + objective.prediction_metrics = prediction_metrics + objective.set_data( + X_train=np.empty((1, 1, 6)), + y_test=np.array([0, 0, 1, 0, 1, 0]), + X_test=np.empty((1, 1, 6)), + ) + return objective + + +def test_default_evaluation_uses_score_metrics_only(): + objective = make_objective() + scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2]) + + result = objective.evaluate_result(anomaly_scores=scores) + + assert result["auc_pr"] == pytest.approx(1.0) + assert result["auc_roc"] == pytest.approx(1.0) + assert result["value"] == pytest.approx(0.0) + assert "precision" not in result + + +def test_score_and_prediction_metrics_use_canonical_keys(): + objective = make_objective( + score_metrics=("auc_pr",), + prediction_metrics=("precision",), + ) + scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2]) + predictions = np.array([0, 0, 1, 0, 1, 0]) + + result = objective.evaluate_result( + anomaly_scores=scores, + anomaly_predictions=predictions, + ) + + assert result["auc_pr"] == pytest.approx(1.0) + assert result["precision"] == pytest.approx(1.0) + + +def test_prediction_metrics_are_opt_in(): + objective = make_objective( + prediction_metrics=("precision", "recall", "f1", "zoloss"), + ) + scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2]) + predictions = np.array([0, 0, 1, 0, 1, 0]) + + result = objective.evaluate_result( + anomaly_scores=scores, + anomaly_predictions=predictions, + ) + + assert result["precision"] == pytest.approx(1.0) + assert result["recall"] == pytest.approx(1.0) + assert result["f1"] == pytest.approx(1.0) + assert result["zoloss"] == pytest.approx(0.0) + + +def test_prediction_metrics_require_prediction_array(): + objective = make_objective(prediction_metrics=("precision",)) + scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.2]) + + with pytest.raises(ValueError, match="anomaly_predictions"): + objective.evaluate_result(anomaly_scores=scores) + + +def test_nan_score_padding_is_masked(): + objective = make_objective() + scores = np.array([np.nan, 0.2, 0.9, 0.1, 0.8, 0.2]) + + result = objective.evaluate_result(anomaly_scores=scores) + + assert result["auc_pr"] == pytest.approx(1.0) + assert result["auc_roc"] == pytest.approx(1.0) + + +def test_prediction_padding_is_masked(): + objective = make_objective( + score_metrics=None, + prediction_metrics=("precision", "recall", "f1"), + ) + predictions = np.array([-1, 0, 1, 0, 1, 0]) + + result = objective.evaluate_result(anomaly_predictions=predictions) + + assert result["precision"] == pytest.approx(1.0) + assert result["recall"] == pytest.approx(1.0) + assert result["f1"] == pytest.approx(1.0) + assert result["value"] == pytest.approx(0.0) + + +def test_prediction_only_metrics_without_primary_value_fallback_to_zero(): + objective = make_objective( + score_metrics=None, + prediction_metrics=("precision",), + ) + predictions = np.array([0, 0, 1, 0, 1, 0]) + + result = objective.evaluate_result(anomaly_predictions=predictions) + + assert result["precision"] == pytest.approx(1.0) + assert result["value"] == pytest.approx(0.0) diff --git a/tests/test_predictions.py b/tests/test_predictions.py new file mode 100644 index 0000000..1e8f9b7 --- /dev/null +++ b/tests/test_predictions.py @@ -0,0 +1,33 @@ +import numpy as np +import pytest + +from benchmark_utils.predictions import cutoff_scores + + +def test_cutoff_scores_returns_none_without_cutoff(): + scores = np.array([0.1, 0.8, 0.2]) + + assert cutoff_scores(scores) is None + + +def test_cutoff_scores_uses_top_score_fraction(): + scores = np.array([0.1, 0.8, 0.2, 0.9]) + + predictions = cutoff_scores(scores, cutoff=0.25) + + np.testing.assert_array_equal(predictions, np.array([0, 0, 0, 1])) + + +def test_cutoff_scores_preserves_nan_padding_as_ignore_label(): + scores = np.array([np.nan, 0.1, 0.8, 0.2, 0.9]) + + predictions = cutoff_scores(scores, cutoff=0.25) + + np.testing.assert_array_equal(predictions, np.array([-1, 0, 0, 0, 1])) + + +def test_cutoff_scores_rejects_invalid_cutoff(): + scores = np.array([0.1, 0.8, 0.2]) + + with pytest.raises(ValueError, match="must be in"): + cutoff_scores(scores, cutoff=1)