From 89ef5dbe26e337a265727738771e6514d42fa6a3 Mon Sep 17 00:00:00 2001 From: MihirT906 Date: Wed, 4 Jan 2023 12:35:09 +0530 Subject: [PATCH 1/8] Add primitive --- orion/primitives/timeseries_postprocessing.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 orion/primitives/timeseries_postprocessing.py diff --git a/orion/primitives/timeseries_postprocessing.py b/orion/primitives/timeseries_postprocessing.py new file mode 100644 index 00000000..e69de29b From be5f9f5eb07a1e3a3f98614c09b6f47c4910100a Mon Sep 17 00:00:00 2001 From: MihirT906 Date: Sat, 21 Jan 2023 09:59:11 +0530 Subject: [PATCH 2/8] Update TS Postprocessing primitive --- 2' | 0 Makefile | 2 +- orion/primitives/timeseries_postprocessing.py | 191 ++++++++++++++++ tests/readme_test/README.md | 185 +++++++++++++++ tests/readme_test/README_evaluate.md | 216 ++++++++++++++++++ 5 files changed, 593 insertions(+), 1 deletion(-) create mode 100644 2' create mode 100644 tests/readme_test/README.md create mode 100644 tests/readme_test/README_evaluate.md diff --git a/2' b/2' new file mode 100644 index 00000000..e69de29b diff --git a/Makefile b/Makefile index 44ef6a0e..6c43929b 100644 --- a/Makefile +++ b/Makefile @@ -308,4 +308,4 @@ docker-jupyter-package: docker-jupyter-save ## Build the docker-jupyter image an .PHONY: checkdeps checkdeps: # Save the currently installed versions of the dependencies as the latest versions - invoke checkdeps $(OUTPUT_PATH) + invoke checkdeps $(OUTPUT_PATH) \ No newline at end of file diff --git a/orion/primitives/timeseries_postprocessing.py b/orion/primitives/timeseries_postprocessing.py index e69de29b..6f5004ba 100644 --- a/orion/primitives/timeseries_postprocessing.py +++ b/orion/primitives/timeseries_postprocessing.py @@ -0,0 +1,191 @@ +import operator +import numpy as np +import pandas as pd + +def extract_dimension(X, dim=None): + """ + The function checks if the dataset being used is valid i.e has a length greater than 0 and contains the dimension required + Args: + X (ndarray): + N-dimensional value sequence to iterate over + dim (int): + Integer indicating the dimension number for a multi-dimensional dataset + Returns: + ndarray: + Returns an nd array that contains a dataset with 2 columns ['timestamp', 'value'] + + """ + + if (len(X) == 0): + return [] + + columns = X.columns.values + + if 'timestamp' not in columns: + X['timestamp'] = X.index.values + + if dim != None: + if dim in columns: + X['value'] = X[dim] + X = pd.DataFrame().assign(timestamp=X['timestamp'], value=X[dim]) + + if 'value' not in X.columns.values: + return [] + + + return X[['timestamp', 'value']] + + + +def rolling_std_thres(X, thres, op = ">", window_size=5): + """ + The function detects anomalies that are flagged through moving standard deviation thresholding + Args: + X (ndarray): + N-dimensional value sequence to iterate over. + thres (float): + Integer used to indicate the threshold of the function + op (str): + String indicating the operator used to compare with the threshold. 
Possible values are '<', '>', '<=', '>=', '==' + window_size (int): + Integer indicating the number of observations used for each window + + Returns: + ndarray: + Dataframe containing the timestamp and value of the flagged indices + + """ + + ops = {'>': operator.gt, + '<': operator.lt, + '>=': operator.ge, + '<=': operator.le, + '==': operator.eq} + + a = X['value'].rolling(window=window_size).std().values + idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)] + return X.loc[idx_arr] + + + +def diff_thres(X, thres = "0.1", op = ">"): + """ + The function detects anomalies that are flagged through moving standard deviation thresholding + Args: + X (ndarray): + N-dimensional value sequence to iterate over. + thres (float): + Integer used to indicate the threshold of the function + op (str): + String indicating the operator used to compare with the threshold. Possible values are '<', '>', '<=', '>=', '==' + + Returns: + ndarray: + Dataframe containing the timestamp and value of the flagged indices + + """ + + ops = {'>': operator.gt, + '<': operator.lt, + '>=': operator.ge, + '<=': operator.le, + '==': operator.eq} + + a = np.diff(X['value']) + idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)] + return X.loc[idx_arr] + + +def thresholding(X, thres, op): + """ + The function detects anomalies that are flagged through moving standard deviation thresholding + Args: + X (ndarray): + N-dimensional value sequence to iterate over. + thres (float): + Integer used to indicate the threshold of the function + op (str): + String indicating the operator used to compare with the threshold. Possible values are '<', '>', '<=', '>=', '==' + window_size (int): + Integer indicating the number of observations used for each window + + Returns: + list: + integers indicating the timestamps that were flagged + + """ + + ops = {'>': operator.gt, + '<': operator.lt, + '>=': operator.ge, + '<=': operator.le, + '==': operator.eq} + + a = X['value'] + idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)] + return X.loc[idx_arr] + + + + +def get_intervals(y, severity=True): + intervals = np.split(y, np.where(np.diff(y.index.values) > 1)[0] + 1) + if(severity): + return [(interval['timestamp'].values[0], interval['timestamp'].values[-1], np.mean(interval['value'])) for interval in intervals] + else: + return [(interval['timestamp'].values[0], interval['timestamp'].values[-1]) for interval in intervals] + + +def build_anomaly_intervals(y, severity=True, indices=False): + """Group together consecutive anomalous samples in anomaly intervals. + + This is a dummy boundary detection function that groups together + samples that have been consecutively flagged as anomalous and + returns boundaries of anomalous intervals. + + Optionally, it computes the average severity of each interval. + + This detector is here only to serve as reference of what + an boundary detection primitive looks like, and is not intended + to be used in real scenarios. 
+ """ + + timestamps = y['timestamp'] + v = y['value'] + start = None + start_ts = None + intervals = list() + values = list() + for index, (value, timestamp) in enumerate(zip(v, timestamps)): + #if value != 0: + if start_ts is None: + start = index + start_ts = timestamp + if severity: + values.append(value) + + elif start_ts is not None: + interval = [start_ts, timestamp] + if indices: + interval.extend([start, index]) + if severity: + interval.append(np.mean(values)) + values = list() + + intervals.append(tuple(interval)) + + start = None + start_ts = None + + # We might have an open interval at the end + if start_ts is not None: + interval = [start_ts, timestamp] + if indices: + interval.extend([start, index]) + if severity: + interval.append(np.mean(values)) + + intervals.append(tuple(interval)) + + return np.array(intervals) + \ No newline at end of file diff --git a/tests/readme_test/README.md b/tests/readme_test/README.md new file mode 100644 index 00000000..fabfd67d --- /dev/null +++ b/tests/readme_test/README.md @@ -0,0 +1,185 @@ +

+“DAI-Lab”
+An open source project from Data to AI Lab at MIT.
+
+“Orion”
+

+
+[![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
+[![Python](https://img.shields.io/badge/Python-3.6%20%7C%203.7%20%7C%203.8-blue)](https://badge.fury.io/py/orion-ml)
+[![PyPi Shield](https://img.shields.io/pypi/v/orion-ml.svg)](https://pypi.python.org/pypi/orion-ml)
+[![Tests](https://github.com/sintel-dev/Orion/workflows/Run%20Tests/badge.svg)](https://github.com/sintel-dev/Orion/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster)
+[![Downloads](https://pepy.tech/badge/orion-ml)](https://pepy.tech/project/orion-ml)
+[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sintel-dev/Orion/master?filepath=tutorials)
+
+# Orion
+
+A machine learning library for unsupervised time series anomaly detection.
+
+| Important Links                     |                                                                       |
+| ----------------------------------- | --------------------------------------------------------------------- |
+| :computer: **[Website]**            | Check out the Sintel Website for more information about the project.  |
+| :book: **[Documentation]**          | Quickstarts, User and Development Guides, and API Reference.          |
+| :star: **[Tutorials]**              | Check out our notebooks.                                              |
+| :octocat: **[Repository]**          | The link to the Github Repository of this library.                    |
+| :scroll: **[License]**              | The repository is published under the MIT License.                    |
+| :keyboard: **[Development Status]** | This software is in its Pre-Alpha stage.                              |
+| [![][Slack Logo] **Community**][Community] | Join our Slack Workspace for announcements and discussions.    |
+
+[Website]: https://sintel.dev/
+[Documentation]: https://sintel-dev.github.io/Orion
+[Tutorials]: https://github.com/sintel-dev/Orion/tree/master/tutorials
+[Repository]: https://github.com/sintel-dev/Orion
+[License]: https://github.com/sintel-dev/Orion/blob/master/LICENSE
+[Development Status]: https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha
+[Community]: https://join.slack.com/t/sintel-space/shared_invite/zt-q147oimb-4HcphcxPfDAM0O9_4PaUtw
+[Slack Logo]: https://github.com/sintel-dev/Orion/blob/master/docs/images/slack.png
+
+# Overview
+
+Orion is a machine learning library built for *unsupervised time series anomaly detection*. Given time series data, we provide a number of “verified” ML pipelines (a.k.a. Orion pipelines) that identify rare patterns and flag them for expert review.
+
+The library makes use of a number of **automated machine learning** tools developed under the [Data to AI Lab at MIT](https://dai.lids.mit.edu/).
+
+Read about using an Orion pipeline on the NYC taxi dataset in a blog series:
+
+[Part 1: Learn about unsupervised time series anomaly detection](https://t.co/yIFVM1oRwQ?amp=1) | [Part 2: Learn how we use GANs to solve the problem](https://link.medium.com/cGsBD0Fevbb) | [Part 3: How does one evaluate anomaly detection pipelines?](https://link.medium.com/FqCrFXMevbb)
+:--------------------------------------:|:---------------------------------------------:|:--------------------------------------------:
+![](docs/images/tulog-part-1.png) | ![](docs/images/tulog-part-2.png) | ![](docs/images/tulog-part-3.png)
+
+**Notebooks:** Discover *Orion* through Colab by launching our [notebooks](https://drive.google.com/drive/folders/1FAcCEiE1JDsqaMjGcmiw5a5XuGh13c9Q?usp=sharing)!
+
+# Quickstart
+
+## Install with pip
+
+The easiest and recommended way to install **Orion** is using [pip](https://pip.pypa.io/en/stable/):
+
+```bash
+pip install orion-ml
+```
+
+This will pull and install the latest stable release from [PyPi](https://pypi.org/).
+
+
+In the following example we show how to use one of the **Orion Pipelines**.
+
+## Fit an Orion pipeline
+
+We will load demo data for this example:
+
+```python3
+from orion.data import load_signal
+
+train_data = load_signal('S-1-train')
+train_data.head()
+```
+
+which should show a signal with `timestamp` and `value`.
+```
+    timestamp     value
+0  1222819200 -0.366359
+1  1222840800 -0.394108
+2  1222862400  0.403625
+3  1222884000 -0.362759
+4  1222905600 -0.370746
+```
+
+In this example we use the `lstm_dynamic_threshold` pipeline and set some hyperparameters (in this case, setting the number of training epochs to 5).
+
+```python3
+from orion import Orion
+
+hyperparameters = {
+    'keras.Sequential.LSTMTimeSeriesRegressor#1': {
+        'epochs': 5,
+        'verbose': True
+    }
+}
+
+orion = Orion(
+    pipeline='lstm_dynamic_threshold',
+    hyperparameters=hyperparameters
+)
+
+orion.fit(train_data)
+```
+
+## Detect anomalies using the fitted pipeline
+Once it is fitted, we are ready to use it to detect anomalies in our incoming time series:
+
+```python3
+new_data = load_signal('S-1-new')
+anomalies = orion.detect(new_data)
+```
+> :warning: Depending on your system and the exact versions that you have installed, some *WARNINGS* may be printed. These can be safely ignored as they do not interfere with the proper behavior of the pipeline.
+
+The output of the previous command will be a ``pandas.DataFrame`` containing a table of detected anomalies:
+
+```
+        start         end  severity
+0  1394323200  1399701600  0.673494
+```
+
+# Leaderboard
+In every release, we run the Orion benchmark. We maintain an up-to-date leaderboard with the current scoring of the verified pipelines according to the benchmarking procedure.
+
+We run the benchmark on **11** datasets with their known ground truth. We record the score of the pipelines on each dataset. To compute the leaderboard table, we showcase the number of wins each pipeline has over the ARIMA pipeline.
+
+| Pipeline                  | Outperforms ARIMA |
+|---------------------------|--------------------|
+| AER                       | 10                 |
+| TadGAN                    | 7                  |
+| LSTM Dynamic Thresholding | 8                  |
+| LSTM Autoencoder          | 6                  |
+| Dense Autoencoder         | 6                  |
+| VAE                       | 6                  |
+| Azure                     | 0                  |
+
+
+You can find the scores of each pipeline on every signal recorded in the [details Google Sheets document](https://docs.google.com/spreadsheets/d/1HaYDjY-BEXEObbi65fwG0om5d8kbRarhpK4mvOZVmqU/edit?usp=sharing). The summarized results can also be browsed in the following [summary Google Sheets document](https://docs.google.com/spreadsheets/d/1ZPUwYH8LhDovVeuJhKYGXYny7472HXVCzhX6D6PObmg/edit?usp=sharing).
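+
+For illustration, the wins count above could be derived from a per-signal score table along the following lines. This is only a sketch: the `scores` frame and its column names are hypothetical and do not reflect the benchmark's exact output format.
+
+```python3
+import pandas as pd
+
+# Hypothetical per-dataset F1 scores; the real benchmark output has more columns.
+scores = pd.DataFrame({
+    'dataset': ['NAB', 'NASA', 'YAHOO'],
+    'ARIMA':   [0.60, 0.55, 0.70],
+    'AER':     [0.75, 0.68, 0.81],
+    'TadGAN':  [0.66, 0.52, 0.74],
+})
+
+# A pipeline "wins" on a dataset when its score is strictly higher than ARIMA's.
+pipelines = [col for col in scores.columns if col not in ('dataset', 'ARIMA')]
+wins = {name: int((scores[name] > scores['ARIMA']).sum()) for name in pipelines}
+print(wins)  # {'AER': 3, 'TadGAN': 2}
+```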
+``` +@inproceedings{alnegheimish2022sintel, + title={Sintel: A Machine Learning Framework to Extract Insights from Signals}, + author={Alnegheimish, Sarah and Liu, Dongyu and Sala, Carles and Berti-Equille, Laure and Veeramachaneni, Kalyan}, + booktitle={Proceedings of the 2022 International Conference on Management of Data}, + pages = {1855–1865}, + numpages = {11}, + publisher={Association for Computing Machinery}, + doi = {10.1145/3514221.3517910}, + series = {SIGMOD '22}, + year={2022} +} +``` + + +If you use **TadGAN** for your research, please consider citing the following paper: + +Alexander Geiger, Dongyu Liu, Sarah Alnegheimish, Alfredo Cuesta-Infante, Kalyan Veeramachaneni. [TadGAN - Time Series Anomaly Detection Using Generative Adversarial Networks](https://arxiv.org/pdf/2009.07769v3.pdf). + +``` +@inproceedings{geiger2020tadgan, + title={TadGAN: Time Series Anomaly Detection Using Generative Adversarial Networks}, + author={Geiger, Alexander and Liu, Dongyu and Alnegheimish, Sarah and Cuesta-Infante, Alfredo and Veeramachaneni, Kalyan}, + booktitle={2020 IEEE International Conference on Big Data (IEEE BigData)}, + pages={33-43}, + doi={10.1109/BigData50022.2020.9378139}, + organization={IEEE}, + year={2020} +} +``` diff --git a/tests/readme_test/README_evaluate.md b/tests/readme_test/README_evaluate.md new file mode 100644 index 00000000..375d8115 --- /dev/null +++ b/tests/readme_test/README_evaluate.md @@ -0,0 +1,216 @@ +# Evaluation + +This document explains the evaluation subpackage accompanied with Orion. It is used in order to evaluate how good a pipeline is at detecting anomalies. +In order to use this framework, we require two main arguments: known anomalies, and detected anomalies. + +## Anomaly types + +There are two approaches to defined anomalies: +- _Point anomalies_ which are identified by a single value in the time series. +- _Contextual anomalies_ which are identified by an anomalous interval, specifically the start/end timestamps. + +```python3 +# Example + +point_anomaly = [1222819200, 1222828100, 1223881200] + +contextual_anomaly = [(1222819200, 1392768000), + (1392768000, 1398729600), + (1398729600, 1399356000)] +``` + +We have created an evaluator for both types. +We also provide a suite of transformation functions in `utils.py` to help with converting one type to another. + + +## Calculating a Score + +Here we describe how we compute a score of how close a set of previously known anomalies and a set of detected anomalies are. + +### Point Scoring + +In point anomalies, we perform a point-wise comparison at each timestamp; this is done on a second (s) based frequency. + +#### Scoring Input + +The information that we have is: + +* The time series start (min) and end (max) timestamps. +* A list of timestamps for the known anomalies. +* A list of timestamps for the detected anomalies. + +An example of this would be: + +* Timeseries start, end + +```python3 +data_span = (1222819200, 1222819205) +``` + +* Known anomalies: + +```python3 +ground_truth = [ + 1222819200, + 1222819201, + 1222819202 +] +``` + +* Detected anomalies: + +```python3 +anomalies = [ + 1222819201, + 1222819202, + 1222819203 +] +``` + +#### Scoring process: Reformat as labels + +The solution implemented for point anomalies is to compute a list of labels, 1s and 0s, and then use the scikit-learn confusion matrix function as an intermediate to finding the accuracy, precision, recall, and f1 scores. 
+
+Continuing on the previous example, we obtain the following:
+
+```python3
+truth = [1, 1, 1, 0, 0, 0]
+detected = [0, 1, 1, 1, 0, 0]
+```
+
+This results in the following true negative (tn), false positive (fp), false negative (fn), and true positive (tp) counts:
+
+```python3
+from sklearn.metrics import confusion_matrix
+
+tn, fp, fn, tp = confusion_matrix(truth, detected).ravel()
+```
+
+Since we have the result of the confusion matrix, we can now compute the accuracy, precision, recall, and f1 score to evaluate the performance of the model.
+
+```python3
+# accuracy score
+(tn + tp) / (tn + fp + fn + tp)  # 0.667
+```
+
+This entire process is implemented within the point metrics:
+```python3
+from orion.evaluation.point import point_accuracy, point_f1_score
+
+start, end = data_span
+
+point_accuracy(ground_truth, anomalies, start=start, end=end)  # 0.667
+point_f1_score(ground_truth, anomalies, start=start, end=end)  # 0.667
+```
+
+### Contextual Scoring
+
+In contextual anomalies, we can compare the detected anomalies to the ground truth in two approaches: weighted segment and overlap segment.
+
+#### Scoring Input
+
+The information that we have is:
+
+* The time series start (min) and end (max) timestamps.
+* A list of start/stop pairs of timestamps for the known anomalies.
+* A list of start/stop pairs of timestamps for the detected anomalies.
+
+An example of this would be:
+
+* Timeseries start, end
+
+```python3
+data_span = (1222819200, 1442016000)
+```
+
+* Known anomalies (in this case only one):
+
+```python3
+ground_truth = [
+    (1392768000, 1402423200)
+]
+```
+
+* Detected anomalies (in this case only one):
+
+```python3
+anomalies = [
+    (1398729600, 1399356000)
+]
+```
+
+#### Scoring process: Reformat as labels with weights (weighted segment)
+
+The solution implemented in Orion has been to use all the previous information to compute a list of labels, 1s and 0s, and then use the scikit-learn confusion matrix function, passing a weights array, as an intermediate step toward the accuracy, precision, recall, and f1 scores.
+
+Continuing on the previous example, we do the following:
+
+1. Make a sorted set of all the timestamps and compute consecutive intervals:
+
+```python3
+intervals = [
+    (1222819200, 1392768000),
+    (1392768000, 1398729600),
+    (1398729600, 1399356001),
+    (1399356001, 1402423201),
+    (1402423201, 1442016000)
+]
+```
+
+2. For both the known and detected anomaly sequences, compute a label for each interval, using 1 if the interval intersects one of the anomaly intervals in the sequence:
+
+```python3
+truth = [0, 1, 1, 1, 0]
+detected = [0, 0, 1, 0, 0]
+```
+
+3. Compute a vector of weights using the lengths of the intervals:
+
+```python3
+weights = [169948800, 5961600, 626401, 3067200, 39592799]
+```
+
+4. Compute the confusion matrix using labels and weights:
+
+```python3
+from sklearn.metrics import confusion_matrix
+
+tn, fp, fn, tp = confusion_matrix(
+    truth, detected, sample_weight=weights, labels=[0, 1]).ravel()
+```
+
+5. Compute a score:
+
+```python3
+# accuracy score
+(tn + tp) / (tn + fp + fn + tp)  # 0.959
+```
+
+This entire process is implemented within the contextual metrics:
+```python3
+from orion.evaluation import contextual_accuracy, contextual_f1_score
+
+
+start, end = data_span
+
+contextual_accuracy(ground_truth, anomalies, start=start, end=end)  # 0.959
+contextual_f1_score(ground_truth, anomalies, start=start, end=end)  # 0.122
+```
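+
+For illustration, steps 1-3 above can be sketched in plain Python. This is not Orion's exact code; in particular, the real implementation applies the one-second boundary adjustments visible in the interval listing above, which this sketch omits:
+
+```python3
+start, end = data_span
+
+# Step 1: partition the span at every anomaly boundary.
+edges = sorted({start, end} | {t for pair in ground_truth + anomalies for t in pair})
+intervals = list(zip(edges[:-1], edges[1:]))
+
+# Step 2: label an interval 1 if it intersects an anomalous interval.
+def labels(sequence):
+    return [int(any(lo < a_end and a_start < hi for a_start, a_end in sequence))
+            for lo, hi in intervals]
+
+truth = labels(ground_truth)    # [0, 1, 1, 1, 0]
+detected = labels(anomalies)    # [0, 0, 1, 0, 0]
+
+# Step 3: weight each interval by its length in seconds.
+weights = [hi - lo for lo, hi in intervals]
+```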
+
+#### Scoring process: Look for overlap between anomalies (overlap segment)
+
+In this methodology, we are more concerned with whether or not we were able to find an anomaly, even if only part of it. It records:
+
+* a true positive if a known anomalous window overlaps any detected windows.
+* a false negative if a known anomalous window does not overlap any detected windows.
+* a false positive if a detected window does not overlap any known anomalous region.
+
+To use this objective, we pass ``weighted=False`` in the metric method of choice.
+
+```python3
+start, end = data_span
+
+contextual_f1_score(ground_truth, anomalies, start=start, end=end, weighted=False)  # 1.0
+```
From 62b9a84fedab71c9911a0c9f274eb8ff6d8d6f71 Mon Sep 17 00:00:00 2001
From: MihirT906
Date: Sat, 21 Jan 2023 10:21:01 +0530
Subject: [PATCH 3/8] Add TS postprocessing primitive JSONs
---
 ...ostprocessing.build_anomaly_intervals.json | 31 ++++++++++++++++
 ....timeseries_postprocessing.diff_thres.json | 32 ++++++++++++++++
 ...ries_postprocessing.extract_dimension.json | 27 ++++++++++++++
 ...meseries_postprocessing.get_intervals.json | 27 ++++++++++++++
 ...ries_postprocessing.rolling_std_thres.json | 37 +++++++++++++++++++
 ...imeseries_postprocessing.thresholding.json | 33 +++++++++++++++++
 6 files changed, 187 insertions(+)
 create mode 100644 orion/primitives/jsons/orion.primitives.timeseries_postprocessing.build_anomaly_intervals.json
 create mode 100644 orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thres.json
 create mode 100644 orion/primitives/jsons/orion.primitives.timeseries_postprocessing.extract_dimension.json
 create mode 100644 orion/primitives/jsons/orion.primitives.timeseries_postprocessing.get_intervals.json
 create mode 100644 orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thres.json
 create mode 100644 orion/primitives/jsons/orion.primitives.timeseries_postprocessing.thresholding.json

diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.build_anomaly_intervals.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.build_anomaly_intervals.json
new file mode 100644
index 00000000..cd6f1f10
--- /dev/null
+++ b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.build_anomaly_intervals.json
@@ -0,0 +1,31 @@
+{
+    "name": "build_anomaly_intervals",
+    "primitive": "orion.primitives.timeseries_postprocessing.build_anomaly_intervals",
+    "description": "Builds anomaly intervals",
+    "produce": {
+        "args": [
+            {
+                "name": "y",
+                "type": "ndarray"
+            }
+        ],
+        "output": [
+            {
+                "name": "y",
+                "type": "ndarray"
+            }
+        ]
+    },
+    "hyperparameters": {
+        "tunable": {
+            "severity": {
+                "type": "bool",
+                "default": true
+            },
+            "indices": {
+                "type": "bool",
+                "default": false
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thres.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thres.json
new file mode 100644
index 00000000..4e0c1a6b
--- /dev/null
+++ 
b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thres.json @@ -0,0 +1,32 @@ +{ + "name": "diff_thres", + "primitive": "orion.primitives.timeseries_postprocessing.diff_thres", + "description": "Detects anomalies by calculating n-th order discrete difference", + "produce": { + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "tunable": { + "thres": { + "type": "float", + "default": "0.1" + }, + "op": { + "type": "str", + "default": ">", + "options": ["<", ">", "<=", ">=", "=="] + } + } + } +} \ No newline at end of file diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.extract_dimension.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.extract_dimension.json new file mode 100644 index 00000000..414dc417 --- /dev/null +++ b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.extract_dimension.json @@ -0,0 +1,27 @@ +{ + "name": "extract_dimension", + "primitive": "orion.primitives.timeseries_postprocessing.extract_dimension", + "description": "Dataset checks and slicing dimension", + "produce": { + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "tunable": { + "dim": { + "type": "int", + "default": null + } + } + } +} \ No newline at end of file diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.get_intervals.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.get_intervals.json new file mode 100644 index 00000000..1d2e2742 --- /dev/null +++ b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.get_intervals.json @@ -0,0 +1,27 @@ +{ + "name": "get_intervals", + "primitive": "orion.primitives.timeseries_postprocessing.get_intervals", + "description": "Builds anomaly intervals", + "produce": { + "args": [ + { + "name": "y", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "tunable": { + "severity": { + "type": "bool", + "default": true + } + } + } +} \ No newline at end of file diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thres.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thres.json new file mode 100644 index 00000000..e411a5f1 --- /dev/null +++ b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thres.json @@ -0,0 +1,37 @@ +{ + "name": "rolling_std_thres", + "primitive": "orion.primitives.timeseries_postprocessing.rolling_std_thres", + "description": "Detects anomalies through moving standard deviation thresholding", + "produce": { + "args": [ + { + "name": "X", + "type": "ndarray" + } + + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "tunable": { + "thres": { + "type": "float", + "default": 10 + }, + "op": { + "type": "str", + "default": ">", + "options": ["<", ">", "<=", ">=", "=="] + }, + "window_size": { + "type": "int", + "default": "5" + } + } + } +} \ No newline at end of file diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.thresholding.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.thresholding.json new file mode 100644 index 00000000..b8880a1c --- /dev/null +++ b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.thresholding.json @@ -0,0 
+1,33 @@ +{ + "name": "thresholding", + "primitive": "orion.primitives.timeseries_postprocessing.thresholding", + "description": "Detects anomalies through simple thresholding", + "produce": { + "args": [ + { + "name": "X", + "type": "ndarray" + } + + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "tunable": { + "thres": { + "type": "float", + "default": 10 + }, + "op": { + "type": "str", + "default": ">", + "options": ["<", ">", "<=", ">=", "=="] + } + } + } +} \ No newline at end of file From 239a1e1092c6e442502d72b544826537ca571c15 Mon Sep 17 00:00:00 2001 From: MihirT906 Date: Sat, 21 Jan 2023 12:17:17 +0530 Subject: [PATCH 4/8] Add SOL Pipelines --- .../sandbox/SOL/diff_OMNI_SDM_3_11.json | 16 ++++++++++++++++ .../sandbox/SOL/movstd_OMNI_SDM_3_11.json | 17 +++++++++++++++++ .../sandbox/SOL/movstd_numenta_AISD.json | 17 +++++++++++++++++ .../sandbox/SOL/thres_OMNI_SDM_3_11.json | 16 ++++++++++++++++ .../sandbox/SOL/thres_yahoo_real1.json | 16 ++++++++++++++++ 5 files changed, 82 insertions(+) create mode 100644 orion/pipelines/sandbox/SOL/diff_OMNI_SDM_3_11.json create mode 100644 orion/pipelines/sandbox/SOL/movstd_OMNI_SDM_3_11.json create mode 100644 orion/pipelines/sandbox/SOL/movstd_numenta_AISD.json create mode 100644 orion/pipelines/sandbox/SOL/thres_OMNI_SDM_3_11.json create mode 100644 orion/pipelines/sandbox/SOL/thres_yahoo_real1.json diff --git a/orion/pipelines/sandbox/SOL/diff_OMNI_SDM_3_11.json b/orion/pipelines/sandbox/SOL/diff_OMNI_SDM_3_11.json new file mode 100644 index 00000000..79ee6307 --- /dev/null +++ b/orion/pipelines/sandbox/SOL/diff_OMNI_SDM_3_11.json @@ -0,0 +1,16 @@ +{ + "primitives": [ + "orion.primitives.timeseries_postprocessing.extract_dimension", + "orion.primitives.timeseries_postprocessing.diff_thres", + "orion.primitives.timeseries_postprocessing.get_intervals" + ], + "init_params": { + "orion.primitives.timeseries_postprocessing.diff_thres#1": { + "thres": 0.1, + "op": ">" + }, + "orion.primitives.intervals.get_intervals#1": { + "severity": true + } + } +} diff --git a/orion/pipelines/sandbox/SOL/movstd_OMNI_SDM_3_11.json b/orion/pipelines/sandbox/SOL/movstd_OMNI_SDM_3_11.json new file mode 100644 index 00000000..3f9e3497 --- /dev/null +++ b/orion/pipelines/sandbox/SOL/movstd_OMNI_SDM_3_11.json @@ -0,0 +1,17 @@ +{ + "primitives": [ + "orion.primitives.timeseries_postprocessing.extract_dimension", + "orion.primitives.timeseries_postprocessing.rolling_std_thres", + "orion.primitives.timeseries_postprocessing.get_intervals" + ], + "init_params": { + "orion.primitives.timeseries_postprocessing.rolling_std_thres#1": { + "thres": 10, + "op": ">", + "window_size": 5 + }, + "orion.primitives.intervals.get_intervals#1": { + "severity": false + } + } +} diff --git a/orion/pipelines/sandbox/SOL/movstd_numenta_AISD.json b/orion/pipelines/sandbox/SOL/movstd_numenta_AISD.json new file mode 100644 index 00000000..d2ac9ae2 --- /dev/null +++ b/orion/pipelines/sandbox/SOL/movstd_numenta_AISD.json @@ -0,0 +1,17 @@ +{ + "primitives": [ + "orion.primitives.timeseries_postprocessing.extract_dimension", + "orion.primitives.timeseries_postprocessing.rolling_std_thres", + "orion.primitives.timeseries_postprocessing.get_intervals" + ], + "init_params": { + "orion.primitives.timeseries_postprocessing.rolling_std_thres#1": { + "thres": 10, + "op": ">", + "window_size": 5 + }, + "orion.primitives.intervals.get_intervals#1": { + "severity": true + } + } +} diff --git a/orion/pipelines/sandbox/SOL/thres_OMNI_SDM_3_11.json 
b/orion/pipelines/sandbox/SOL/thres_OMNI_SDM_3_11.json new file mode 100644 index 00000000..9c97bcac --- /dev/null +++ b/orion/pipelines/sandbox/SOL/thres_OMNI_SDM_3_11.json @@ -0,0 +1,16 @@ +{ + "primitives": [ + "orion.primitives.timeseries_postprocessing.extract_dimension", + "orion.primitives.timeseries_postprocessing.thresholding", + "orion.primitives.timeseries_postprocessing.get_intervals" + ], + "init_params": { + "orion.primitives.timeseries_postprocessing.thresholding#1": { + "thres": 0.01, + "op": "<" + }, + "orion.primitives.intervals.get_intervals#1": { + "severity": true + } + } +} diff --git a/orion/pipelines/sandbox/SOL/thres_yahoo_real1.json b/orion/pipelines/sandbox/SOL/thres_yahoo_real1.json new file mode 100644 index 00000000..42dabbc2 --- /dev/null +++ b/orion/pipelines/sandbox/SOL/thres_yahoo_real1.json @@ -0,0 +1,16 @@ +{ + "primitives": [ + "orion.primitives.timeseries_postprocessing.extract_dimension", + "orion.primitives.timeseries_postprocessing.thresholding", + "orion.primitives.timeseries_postprocessing.get_intervals" + ], + "init_params": { + "orion.primitives.timeseries_postprocessing.thresholding#1": { + "thres": 0.45, + "op": ">" + }, + "orion.primitives.intervals.get_intervals#1": { + "severity": true + } + } +} From 680f6fdc3ea1724eb9ef207a28a3cee26b3d50dc Mon Sep 17 00:00:00 2001 From: MihirT906 Date: Sun, 12 Feb 2023 15:18:01 +0530 Subject: [PATCH 5/8] Update OPs to global variable --- orion/primitives/timeseries_postprocessing.py | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/orion/primitives/timeseries_postprocessing.py b/orion/primitives/timeseries_postprocessing.py index 6f5004ba..e6b6a934 100644 --- a/orion/primitives/timeseries_postprocessing.py +++ b/orion/primitives/timeseries_postprocessing.py @@ -2,6 +2,12 @@ import numpy as np import pandas as pd +ops = {'>': operator.gt, + '<': operator.lt, + '>=': operator.ge, + '<=': operator.le, + '==': operator.eq} + def extract_dimension(X, dim=None): """ The function checks if the dataset being used is valid i.e has a length greater than 0 and contains the dimension required @@ -56,12 +62,6 @@ def rolling_std_thres(X, thres, op = ">", window_size=5): """ - ops = {'>': operator.gt, - '<': operator.lt, - '>=': operator.ge, - '<=': operator.le, - '==': operator.eq} - a = X['value'].rolling(window=window_size).std().values idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)] return X.loc[idx_arr] @@ -84,12 +84,6 @@ def diff_thres(X, thres = "0.1", op = ">"): Dataframe containing the timestamp and value of the flagged indices """ - - ops = {'>': operator.gt, - '<': operator.lt, - '>=': operator.ge, - '<=': operator.le, - '==': operator.eq} a = np.diff(X['value']) idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)] @@ -115,11 +109,6 @@ def thresholding(X, thres, op): """ - ops = {'>': operator.gt, - '<': operator.lt, - '>=': operator.ge, - '<=': operator.le, - '==': operator.eq} a = X['value'] idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)] From 3d8596f3565f97952dfef8d7bfdcade151e07280 Mon Sep 17 00:00:00 2001 From: MihirT906 Date: Sat, 25 Feb 2023 15:34:21 +0530 Subject: [PATCH 6/8] Update Docstrings --- ...ostprocessing.build_anomaly_intervals.json | 31 ----- orion/primitives/timeseries_postprocessing.py | 125 ++++++------------ 2 files changed, 42 insertions(+), 114 deletions(-) delete mode 100644 orion/primitives/jsons/orion.primitives.timeseries_postprocessing.build_anomaly_intervals.json diff --git 
a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.build_anomaly_intervals.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.build_anomaly_intervals.json deleted file mode 100644 index cd6f1f10..00000000 --- a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.build_anomaly_intervals.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "name": "build_anomaly_intervals", - "primitive": "orion.primitives.timeseries_postprocessing.build_anomaly_intervals", - "description": "Builds anomaly intervals", - "produce": { - "args": [ - { - "name": "y", - "type": "ndarray" - } - ], - "output": [ - { - "name": "y", - "type": "ndarray" - } - ] - }, - "hyperparameters": { - "tunable": { - "severity": { - "type": "bool", - "default": true - }, - "indices": { - "type": "bool", - "default": false - } - } - } -} \ No newline at end of file diff --git a/orion/primitives/timeseries_postprocessing.py b/orion/primitives/timeseries_postprocessing.py index e6b6a934..a0fb9b5e 100644 --- a/orion/primitives/timeseries_postprocessing.py +++ b/orion/primitives/timeseries_postprocessing.py @@ -9,19 +9,21 @@ '==': operator.eq} def extract_dimension(X, dim=None): - """ - The function checks if the dataset being used is valid i.e has a length greater than 0 and contains the dimension required + """Validate data dimension. + + The function checks if the dataset being used is valid i.e has a length + greater than 0 and contains the dimension required + Args: X (ndarray): N-dimensional value sequence to iterate over dim (int): Integer indicating the dimension number for a multi-dimensional dataset + Returns: ndarray: Returns an nd array that contains a dataset with 2 columns ['timestamp', 'value'] - """ - if (len(X) == 0): return [] @@ -38,143 +40,100 @@ def extract_dimension(X, dim=None): if 'value' not in X.columns.values: return [] - return X[['timestamp', 'value']] - def rolling_std_thres(X, thres, op = ">", window_size=5): - """ - The function detects anomalies that are flagged through moving standard deviation thresholding + """Apply moving standard deviation thesholding. + + The function flags anomalies based on moving standard deviation thresholding + Args: X (ndarray): N-dimensional value sequence to iterate over. thres (float): Integer used to indicate the threshold of the function op (str): - String indicating the operator used to compare with the threshold. Possible values are '<', '>', '<=', '>=', '==' + String indicating the operator used to compare with the threshold. + Possible values are '<', '>', '<=', '>=', '==' window_size (int): Integer indicating the number of observations used for each window Returns: ndarray: Dataframe containing the timestamp and value of the flagged indices - - """ - + """ a = X['value'].rolling(window=window_size).std().values idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)] return X.loc[idx_arr] - def diff_thres(X, thres = "0.1", op = ">"): - """ - The function detects anomalies that are flagged through moving standard deviation thresholding + """Apply discrete difference thresholding. + + The function flags anomalies based on n-th discrete difference thresholding + Args: X (ndarray): N-dimensional value sequence to iterate over. thres (float): Integer used to indicate the threshold of the function op (str): - String indicating the operator used to compare with the threshold. Possible values are '<', '>', '<=', '>=', '==' + String indicating the operator used to compare with the threshold. 
+            Possible values are '<', '>', '<=', '>=', '=='
         window_size (int):
             Integer indicating the number of observations used for each window
 
     Returns:
         ndarray:
             Dataframe containing the timestamp and value of the flagged indices
-
-    """
-
+    """
     a = X['value'].rolling(window=window_size).std().values
     idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)]
     return X.loc[idx_arr]
 
-
-
 def diff_thres(X, thres = "0.1", op = ">"):
-    """
-    The function detects anomalies that are flagged through moving standard deviation thresholding
+    """Apply discrete difference thresholding.
+
+    The function flags anomalies based on n-th discrete difference thresholding
+
     Args:
         X (ndarray):
             N-dimensional value sequence to iterate over.
         thres (float):
             Integer used to indicate the threshold of the function
         op (str):
-            String indicating the operator used to compare with the threshold. Possible values are '<', '>', '<=', '>=', '=='
+            String indicating the operator used to compare with the threshold.
+            Possible values are '<', '>', '<=', '>=', '=='
 
     Returns:
         ndarray:
             Dataframe containing the timestamp and value of the flagged indices
-
     """
-
     a = np.diff(X['value'])
     idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)]
     return X.loc[idx_arr]
 
 
 def thresholding(X, thres, op):
-    """
-    The function detects anomalies that are flagged through moving standard deviation thresholding
+    """Apply simple thresholding.
+
+    The function flags anomalies based on simple thresholding
+
     Args:
         X (ndarray):
             N-dimensional value sequence to iterate over.
         thres (float):
             Integer used to indicate the threshold of the function
         op (str):
-            String indicating the operator used to compare with the threshold. Possible values are '<', '>', '<=', '>=', '=='
-        window_size (int):
-            Integer indicating the number of observations used for each window
+            String indicating the operator used to compare with the threshold.
+            Possible values are '<', '>', '<=', '>=', '=='
 
     Returns:
         list:
             integers indicating the timestamps that were flagged
-
     """
-
-
     a = X['value']
     idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)]
     return X.loc[idx_arr]
 
+def get_intervals(y, severity=True):
+    """Group together consecutive anomalies in anomaly intervals.
 
+    The function groups together samples that have been consecutively flagged
+    as an anomaly and returns the lower and upper bounds of each interval.
 
+    Optionally, it computes the average severity of each interval.
 
-def get_intervals(y, severity=True):
+    Args:
+        y (ndarray):
+            N-dimensional array containing the flagged anomalies of the dataset
+        severity (bool):
+            Optional. Indicates whether the average severity of each interval
+            should be calculated
+
+    Returns:
+        ndarray:
+            Array containing the anomaly intervals
+    """
     intervals = np.split(y, np.where(np.diff(y.index.values) > 1)[0] + 1)
     if(severity):
         return [(interval['timestamp'].values[0], interval['timestamp'].values[-1], np.mean(interval['value'])) for interval in intervals]
     else:
-        return [(interval['timestamp'].values[0], interval['timestamp'].values[-1]) for interval in intervals]
-
-
-def build_anomaly_intervals(y, severity=True, indices=False):
-    """Group together consecutive anomalous samples in anomaly intervals.
-
-    This is a dummy boundary detection function that groups together
-    samples that have been consecutively flagged as anomalous and
-    returns boundaries of anomalous intervals.
-
-    Optionally, it computes the average severity of each interval.
-
-    This detector is here only to serve as reference of what
-    an boundary detection primitive looks like, and is not intended
-    to be used in real scenarios.
- """ - - timestamps = y['timestamp'] - v = y['value'] - start = None - start_ts = None - intervals = list() - values = list() - for index, (value, timestamp) in enumerate(zip(v, timestamps)): - #if value != 0: - if start_ts is None: - start = index - start_ts = timestamp - if severity: - values.append(value) - - elif start_ts is not None: - interval = [start_ts, timestamp] - if indices: - interval.extend([start, index]) - if severity: - interval.append(np.mean(values)) - values = list() - - intervals.append(tuple(interval)) - - start = None - start_ts = None - - # We might have an open interval at the end - if start_ts is not None: - interval = [start_ts, timestamp] - if indices: - interval.extend([start, index]) - if severity: - interval.append(np.mean(values)) - - intervals.append(tuple(interval)) - - return np.array(intervals) - \ No newline at end of file + return [(interval['timestamp'].values[0], interval['timestamp'].values[-1]) for interval in intervals] \ No newline at end of file From 06e5dbb2d3233dbd182d8ccd049d99ec2436fa45 Mon Sep 17 00:00:00 2001 From: MihirT906 Date: Sat, 25 Feb 2023 16:16:15 +0530 Subject: [PATCH 7/8] Revert "Update TS Postprocessing primitive" This reverts commit be5f9f5eb07a1e3a3f98614c09b6f47c4910100a. --- 2' | 0 Makefile | 2 +- tests/readme_test/README.md | 185 ----------------------- tests/readme_test/README_evaluate.md | 216 --------------------------- 4 files changed, 1 insertion(+), 402 deletions(-) delete mode 100644 2' delete mode 100644 tests/readme_test/README.md delete mode 100644 tests/readme_test/README_evaluate.md diff --git a/2' b/2' deleted file mode 100644 index e69de29b..00000000 diff --git a/Makefile b/Makefile index 6c43929b..44ef6a0e 100644 --- a/Makefile +++ b/Makefile @@ -308,4 +308,4 @@ docker-jupyter-package: docker-jupyter-save ## Build the docker-jupyter image an .PHONY: checkdeps checkdeps: # Save the currently installed versions of the dependencies as the latest versions - invoke checkdeps $(OUTPUT_PATH) \ No newline at end of file + invoke checkdeps $(OUTPUT_PATH) diff --git a/tests/readme_test/README.md b/tests/readme_test/README.md deleted file mode 100644 index fabfd67d..00000000 --- a/tests/readme_test/README.md +++ /dev/null @@ -1,185 +0,0 @@ -

-“DAI-Lab”
-An open source project from Data to AI Lab at MIT.
-
-“Orion”
-

- -[![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) -[![Python](https://img.shields.io/badge/Python-3.6%20%7C%203.7%20%7C%203.8-blue)](https://badge.fury.io/py/orion-ml) -[![PyPi Shield](https://img.shields.io/pypi/v/orion-ml.svg)](https://pypi.python.org/pypi/orion-ml) -[![Tests](https://github.com/sintel-dev/Orion/workflows/Run%20Tests/badge.svg)](https://github.com/sintel-dev/Orion/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster) -[![Downloads](https://pepy.tech/badge/orion-ml)](https://pepy.tech/project/orion-ml) -[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sintel-dev/Orion/master?filepath=tutorials) - -# Orion - -A machine learning library for unsupervised time series anomaly detection. - -| Important Links | | -| ----------------------------------- | -------------------------------------------------------------------- | -| :computer: **[Website]** | Check out the Sintel Website for more information about the project. | -| :book: **[Documentation]** | Quickstarts, User and Development Guides, and API Reference. | -| :star: **[Tutorials]** | Checkout our notebooks | -| :octocat: **[Repository]** | The link to the Github Repository of this library. | -| :scroll: **[License]** | The repository is published under the MIT License. | -| :keyboard: **[Development Status]** | This software is in its Pre-Alpha stage. | -| [![][Slack Logo] **Community**][Community] | Join our Slack Workspace for announcements and discussions. | - -[Website]: https://sintel.dev/ -[Documentation]: https://sintel-dev.github.io/Orion -[Tutorials]: https://github.com/sintel-dev/Orion/tree/master/tutorials -[Repository]: https://github.com/sintel-dev/Orion -[License]: https://github.com/sintel-dev/Orion/blob/master/LICENSE -[Development Status]: https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha -[Community]: https://join.slack.com/t/sintel-space/shared_invite/zt-q147oimb-4HcphcxPfDAM0O9_4PaUtw -[Slack Logo]: https://github.com/sintel-dev/Orion/blob/master/docs/images/slack.png - -# Overview - -Orion is a machine learning library built for *unsupervised time series anomaly detection*. With a given time series data, we provide a number of “verified” ML pipelines (a.k.a Orion pipelines) that identify rare patterns and flag them for expert review. - -The library makes use of a number of **automated machine learning** tools developed under [Data to AI Lab at MIT](https://dai.lids.mit.edu/). - -Read about using an Orion pipeline on NYC taxi dataset in a blog series: - -[Part 1: Learn about unsupervised time series anomaly detection](https://t.co/yIFVM1oRwQ?amp=1) | [Part 2: Learn how we use GANs to solving the problem? ](https://link.medium.com/cGsBD0Fevbb) | [Part 3: How does one evaluate anomaly detection pipelines?](https://link.medium.com/FqCrFXMevbb) -:--------------------------------------:|:---------------------------------------------:|:--------------------------------------------: -![](docs/images/tulog-part-1.png) | ![](docs/images/tulog-part-2.png) | ![](docs/images/tulog-part-3.png) - -**Notebooks:** Discover *Orion* through colab by launching our [notebooks](https://drive.google.com/drive/folders/1FAcCEiE1JDsqaMjGcmiw5a5XuGh13c9Q?usp=sharing)! 
- -# Quickstart - -## Install with pip - -The easiest and recommended way to install **Orion** is using [pip](https://pip.pypa.io/en/stable/): - -```bash -pip install orion-ml -``` - -This will pull and install the latest stable release from [PyPi](https://pypi.org/). - - -In the following example we show how to use one of the **Orion Pipelines**. - -## Fit an Orion pipeline - -We will load a demo data for this example: - -```python3 -from orion.data import load_signal - -train_data = load_signal('S-1-train') -train_data.head() -``` - -which should show a signal with `timestamp` and `value`. -``` - timestamp value -0 1222819200 -0.366359 -1 1222840800 -0.394108 -2 1222862400 0.403625 -3 1222884000 -0.362759 -4 1222905600 -0.370746 -``` - -In this example we use `lstm_dynamic_threshold` pipeline and set some hyperparameters (in this case training epochs as 5). - -```python3 -from orion import Orion - -hyperparameters = { - 'keras.Sequential.LSTMTimeSeriesRegressor#1': { - 'epochs': 5, - 'verbose': True - } -} - -orion = Orion( - pipeline='lstm_dynamic_threshold', - hyperparameters=hyperparameters -) - -orion.fit(train_data) -``` - -## Detect anomalies using the fitted pipeline -Once it is fitted, we are ready to use it to detect anomalies in our incoming time series: - -```python3 -new_data = load_signal('S-1-new') -anomalies = orion.detect(new_data) -``` -> :warning: Depending on your system and the exact versions that you might have installed some *WARNINGS* may be printed. These can be safely ignored as they do not interfere with the proper behavior of the pipeline. - -The output of the previous command will be a ``pandas.DataFrame`` containing a table of detected anomalies: - -``` - start end severity -0 1394323200 1399701600 0.673494 -``` - -# Leaderboard -In every release, we run Orion benchmark. We maintain an up-to-date leaderboard with the current scoring of the verified pipelines according to the benchmarking procedure. - -We run the benchmark on **11** datasets with their known grounth truth. We record the score of the pipelines on each datasets. To compute the leaderboard table, we showcase the number of wins each pipeline has over the ARIMA pipeline. - -| Pipeline | Outperforms ARIMA | -|---------------------------|--------------------| -| AER | 10 | -| TadGAN | 7 | -| LSTM Dynamic Thresholding | 8 | -| LSTM Autoencoder | 6 | -| Dense Autoencoder | 6 | -| VAE | 6 | -| Azure | 0 | - - -You can find the scores of each pipeline on every signal recorded in the [details Google Sheets document](https://docs.google.com/spreadsheets/d/1HaYDjY-BEXEObbi65fwG0om5d8kbRarhpK4mvOZVmqU/edit?usp=sharing). The summarized results can also be browsed in the following [summary Google Sheets document](https://docs.google.com/spreadsheets/d/1ZPUwYH8LhDovVeuJhKYGXYny7472HXVCzhX6D6PObmg/edit?usp=sharing). - -# Resources - -Additional resources that might be of interest: -* Learn about [benchmarking pipelines](BENCHMARK.md). -* Read about [pipeline evaluation](orion/evaluation/README.md). -* Find out more about [TadGAN](https://arxiv.org/pdf/2009.07769v3.pdf). - -# Citation - -If you use **Orion** which is part of the **Sintel** ecosystem for your research, please consider citing the following paper: - -Sarah Alnegheimish, Dongyu Liu, Carles Sala, Laure Berti-Equille, Kalyan Veeramachaneni. [Sintel: A Machine Learning Framework to Extract Insights from Signals](https://dl.acm.org/doi/pdf/10.1145/3514221.3517910). 
-``` -@inproceedings{alnegheimish2022sintel, - title={Sintel: A Machine Learning Framework to Extract Insights from Signals}, - author={Alnegheimish, Sarah and Liu, Dongyu and Sala, Carles and Berti-Equille, Laure and Veeramachaneni, Kalyan}, - booktitle={Proceedings of the 2022 International Conference on Management of Data}, - pages = {1855–1865}, - numpages = {11}, - publisher={Association for Computing Machinery}, - doi = {10.1145/3514221.3517910}, - series = {SIGMOD '22}, - year={2022} -} -``` - - -If you use **TadGAN** for your research, please consider citing the following paper: - -Alexander Geiger, Dongyu Liu, Sarah Alnegheimish, Alfredo Cuesta-Infante, Kalyan Veeramachaneni. [TadGAN - Time Series Anomaly Detection Using Generative Adversarial Networks](https://arxiv.org/pdf/2009.07769v3.pdf). - -``` -@inproceedings{geiger2020tadgan, - title={TadGAN: Time Series Anomaly Detection Using Generative Adversarial Networks}, - author={Geiger, Alexander and Liu, Dongyu and Alnegheimish, Sarah and Cuesta-Infante, Alfredo and Veeramachaneni, Kalyan}, - booktitle={2020 IEEE International Conference on Big Data (IEEE BigData)}, - pages={33-43}, - doi={10.1109/BigData50022.2020.9378139}, - organization={IEEE}, - year={2020} -} -``` diff --git a/tests/readme_test/README_evaluate.md b/tests/readme_test/README_evaluate.md deleted file mode 100644 index 375d8115..00000000 --- a/tests/readme_test/README_evaluate.md +++ /dev/null @@ -1,216 +0,0 @@ -# Evaluation - -This document explains the evaluation subpackage accompanied with Orion. It is used in order to evaluate how good a pipeline is at detecting anomalies. -In order to use this framework, we require two main arguments: known anomalies, and detected anomalies. - -## Anomaly types - -There are two approaches to defined anomalies: -- _Point anomalies_ which are identified by a single value in the time series. -- _Contextual anomalies_ which are identified by an anomalous interval, specifically the start/end timestamps. - -```python3 -# Example - -point_anomaly = [1222819200, 1222828100, 1223881200] - -contextual_anomaly = [(1222819200, 1392768000), - (1392768000, 1398729600), - (1398729600, 1399356000)] -``` - -We have created an evaluator for both types. -We also provide a suite of transformation functions in `utils.py` to help with converting one type to another. - - -## Calculating a Score - -Here we describe how we compute a score of how close a set of previously known anomalies and a set of detected anomalies are. - -### Point Scoring - -In point anomalies, we perform a point-wise comparison at each timestamp; this is done on a second (s) based frequency. - -#### Scoring Input - -The information that we have is: - -* The time series start (min) and end (max) timestamps. -* A list of timestamps for the known anomalies. -* A list of timestamps for the detected anomalies. - -An example of this would be: - -* Timeseries start, end - -```python3 -data_span = (1222819200, 1222819205) -``` - -* Known anomalies: - -```python3 -ground_truth = [ - 1222819200, - 1222819201, - 1222819202 -] -``` - -* Detected anomalies: - -```python3 -anomalies = [ - 1222819201, - 1222819202, - 1222819203 -] -``` - -#### Scoring process: Reformat as labels - -The solution implemented for point anomalies is to compute a list of labels, 1s and 0s, and then use the scikit-learn confusion matrix function as an intermediate to finding the accuracy, precision, recall, and f1 scores. 
- -For this we generate a sequence of the same length as `data_span` and fill the corresponding anomalies within the correct placement. - -Continuing on the previous example, we obtain the following: - -```python3 -truth = [1, 1, 1, 0, 0, 0] -detected = [0, 1, 1, 1, 0, 0] -``` - -This results with the following true negative (tn), false positive (fp), false negative (fn), true positive (tp): - -```python3 -from sklearn.metrics import confusion_matrix - -tn, fp, fn, tp = confusion_matrix(truth, detected).ravel() -``` - -Since we have the result of the confusion matrix, we can now compute the accuracy, precision, recall, and f1 score to evaluate the performance of the model. - -```python3 -# accuracy score -tn + tp / (tn + fp + fn + tp) # 0.667 -``` - -This entire process is implemented within the point metrics -```python3 -from orion.evaluation.point import point_accuracy, point_f1_score - -start, end = data_span - -point_accuracy(ground_truth, anomalies, start=start, end=end) # 0.667 -point_f1_score(ground_truth, anomalies, start=start, end=end) # 0.667 -``` - -### Contextual Scoring - -In contextual anomalies, we can compare the detected anomalies to the ground truth in two approaches: weighted segment, and overlap segment. - -#### Scoring Input - -The information that we have is: - -* The time series start (min) and end (max) timestamps. -* A list of start/stop pairs of timestamps for the known anomalies. -* A list of start/stop pairs of timestamps for the detected anomalies. - -An example of this would be: - -* Timeseries start, end - -```python3 -data_span = (1222819200, 1442016000) -``` - -* Known anomalies (in this case only one): - -```python3 -ground_truth = [ - (1392768000, 1402423200) -] -``` - -* Detected anomalies (in this case only one): - -```python3 -anomalies = [ - (1398729600, 1399356000) -] -``` - -#### Scoring process: Reformat as labels with weights (weighted segment) - -The solution implemented in Orion has been to use all the previous information to compute a list of labels, 1s and 0s, and then use the scikit-learn confusion matrix function passing a weights array as an intermediate to finding the accuracy, precision, recall, and f1 scores. - -Continuing on the previous example, we do the following: - -1. Make a sorted set of all the timestamps and compute consecutive intervals: - -```python3 -intervals = [ - (1222819200, 1392768000), - (1392768000, 1398729600), - (1398729600, 1399356001), - (1399356001, 1402423201), - (1402423201, 1442016000) -] -``` - -2. For both the known and detected anomalies sequences, compute a label for each interval using 1 if the interval intersects with one of the anomaly intervals in the sequence: - -```python3 -truth = [0, 1, 1, 1, 0] -detected = [0, 0, 1, 0, 0] -``` - -3. Compute a vector of weights using the lengths of the intervals: - -```python3 -weights = [169948800, 5961600, 626401, 3067200, 39592799] -``` - -4. Compute the confusion matrix using labels and weights: - -```python3 -from sklearn.metrics import confusion_matrix - -tn, fp, fn, tp = confusion_matrix( - truth, detected, sample_weight=weights, labels=[0, 1]).ravel() -``` - -5. 
Compute a score: - -```python3 -# accuracy score -tn + tp / (tn + fp + fn + tp) # 0.959 -``` - -This entire process is implemented within the contextual metrics -```python3 -from orion.evaluation import contextual_accuracy, contextual_f1_score - - -start, end = data_span - -contextual_accuracy(ground_truth, anomalies, start=start, end=end) # 0.959 -contextual_f1_score(ground_truth, anomalies, start=start, end=end) # 0.122 -``` - -#### Scoring process: Look for overlap between anomalies (overlap segment) - -In this methodology, we are more concerned with whether or not we were able to find an anomaly; even just a part of it. It records: - -* a true positive if a known anomalous window overlaps any detected windows. -* a false negative if a known anomalous window does not overlap any detected windows. -* a false positive if a detected window does not overlap any known anomalous region. - -To use this objective, we pass ``weighted=False`` in the metric method of choice. - -```python3 -start, end = data_span - -contextual_f1_score(ground_truth, anomalies, start=start, end=end, weighted=False) # 1.0 -``` From 1a8d2031ad2781c8eba74ee99ab31f61b6dc43d6 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Thu, 16 Mar 2023 00:49:42 +0000 Subject: [PATCH 8/8] add tests + fix lint --- .../sandbox/SOL/diff_OMNI_SDM_3_11.json | 6 +- .../sandbox/SOL/movstd_OMNI_SDM_3_11.json | 6 +- .../sandbox/SOL/movstd_numenta_AISD.json | 6 +- ...imeseries_postprocessing.diff_thresh.json} | 6 +- ...es_postprocessing.rolling_std_thresh.json} | 6 +- orion/primitives/timeseries_postprocessing.py | 134 +++++++------ .../test_timeseries_postprocessing.py | 186 ++++++++++++++++++ 7 files changed, 273 insertions(+), 77 deletions(-) rename orion/primitives/jsons/{orion.primitives.timeseries_postprocessing.diff_thres.json => orion.primitives.timeseries_postprocessing.diff_thresh.json} (92%) rename orion/primitives/jsons/{orion.primitives.timeseries_postprocessing.rolling_std_thres.json => orion.primitives.timeseries_postprocessing.rolling_std_thresh.json} (91%) create mode 100644 tests/primitives/test_timeseries_postprocessing.py diff --git a/orion/pipelines/sandbox/SOL/diff_OMNI_SDM_3_11.json b/orion/pipelines/sandbox/SOL/diff_OMNI_SDM_3_11.json index 79ee6307..b700ecb7 100644 --- a/orion/pipelines/sandbox/SOL/diff_OMNI_SDM_3_11.json +++ b/orion/pipelines/sandbox/SOL/diff_OMNI_SDM_3_11.json @@ -1,12 +1,12 @@ { "primitives": [ "orion.primitives.timeseries_postprocessing.extract_dimension", - "orion.primitives.timeseries_postprocessing.diff_thres", + "orion.primitives.timeseries_postprocessing.diff_thresh", "orion.primitives.timeseries_postprocessing.get_intervals" ], "init_params": { - "orion.primitives.timeseries_postprocessing.diff_thres#1": { - "thres": 0.1, + "orion.primitives.timeseries_postprocessing.diff_thresh#1": { + "thresh": 0.1, "op": ">" }, "orion.primitives.intervals.get_intervals#1": { diff --git a/orion/pipelines/sandbox/SOL/movstd_OMNI_SDM_3_11.json b/orion/pipelines/sandbox/SOL/movstd_OMNI_SDM_3_11.json index 3f9e3497..c221a4bc 100644 --- a/orion/pipelines/sandbox/SOL/movstd_OMNI_SDM_3_11.json +++ b/orion/pipelines/sandbox/SOL/movstd_OMNI_SDM_3_11.json @@ -1,12 +1,12 @@ { "primitives": [ "orion.primitives.timeseries_postprocessing.extract_dimension", - "orion.primitives.timeseries_postprocessing.rolling_std_thres", + "orion.primitives.timeseries_postprocessing.rolling_std_thresh", "orion.primitives.timeseries_postprocessing.get_intervals" ], "init_params": { - 
"orion.primitives.timeseries_postprocessing.rolling_std_thres#1": { - "thres": 10, + "orion.primitives.timeseries_postprocessing.rolling_std_thresh#1": { + "thresh": 10, "op": ">", "window_size": 5 }, diff --git a/orion/pipelines/sandbox/SOL/movstd_numenta_AISD.json b/orion/pipelines/sandbox/SOL/movstd_numenta_AISD.json index d2ac9ae2..84a52ce8 100644 --- a/orion/pipelines/sandbox/SOL/movstd_numenta_AISD.json +++ b/orion/pipelines/sandbox/SOL/movstd_numenta_AISD.json @@ -1,12 +1,12 @@ { "primitives": [ "orion.primitives.timeseries_postprocessing.extract_dimension", - "orion.primitives.timeseries_postprocessing.rolling_std_thres", + "orion.primitives.timeseries_postprocessing.rolling_std_thresh", "orion.primitives.timeseries_postprocessing.get_intervals" ], "init_params": { - "orion.primitives.timeseries_postprocessing.rolling_std_thres#1": { - "thres": 10, + "orion.primitives.timeseries_postprocessing.rolling_std_thresh#1": { + "thresh": 10, "op": ">", "window_size": 5 }, diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thres.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thresh.json similarity index 92% rename from orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thres.json rename to orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thresh.json index 4e0c1a6b..3fa33da3 100644 --- a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thres.json +++ b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thresh.json @@ -1,6 +1,6 @@ { - "name": "diff_thres", - "primitive": "orion.primitives.timeseries_postprocessing.diff_thres", + "name": "diff_thresh", + "primitive": "orion.primitives.timeseries_postprocessing.diff_thresh", "description": "Detects anomalies by calculating n-th order discrete difference", "produce": { "args": [ @@ -18,7 +18,7 @@ }, "hyperparameters": { "tunable": { - "thres": { + "thresh": { "type": "float", "default": "0.1" }, diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thres.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thresh.json similarity index 91% rename from orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thres.json rename to orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thresh.json index e411a5f1..a1621b4a 100644 --- a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thres.json +++ b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thresh.json @@ -1,6 +1,6 @@ { - "name": "rolling_std_thres", - "primitive": "orion.primitives.timeseries_postprocessing.rolling_std_thres", + "name": "rolling_std_thresh", + "primitive": "orion.primitives.timeseries_postprocessing.rolling_std_thresh", "description": "Detects anomalies through moving standard deviation thresholding", "produce": { "args": [ @@ -19,7 +19,7 @@ }, "hyperparameters": { "tunable": { - "thres": { + "thresh": { "type": "float", "default": 10 }, diff --git a/orion/primitives/timeseries_postprocessing.py b/orion/primitives/timeseries_postprocessing.py index a0fb9b5e..dc1acc83 100644 --- a/orion/primitives/timeseries_postprocessing.py +++ b/orion/primitives/timeseries_postprocessing.py @@ -1,4 +1,5 @@ import operator + import numpy as np import pandas as pd @@ -8,112 +9,113 @@ '<=': operator.le, '==': operator.eq} + def extract_dimension(X, dim=None): """Validate data 
diff --git a/orion/primitives/timeseries_postprocessing.py b/orion/primitives/timeseries_postprocessing.py
index a0fb9b5e..dc1acc83 100644
--- a/orion/primitives/timeseries_postprocessing.py
+++ b/orion/primitives/timeseries_postprocessing.py
@@ -1,4 +1,5 @@
 import operator
+
 import numpy as np
 import pandas as pd
 
@@ -8,112 +9,113 @@
        '<=': operator.le,
        '==': operator.eq}
 
+
 def extract_dimension(X, dim=None):
     """Validate data dimension.
 
-    The function checks if the dataset being used is valid i.e has a length
-    greater than 0 and contains the dimension required
-
+    The function checks if the dataset being used is valid, i.e. it has a length
+    greater than 0 and contains the required dimension.
+
     Args:
-        X (ndarray):
-            N-dimensional value sequence to iterate over
-        dim (int):
-            Integer indicating the dimension number for a multi-dimensional dataset
-
+        X (pd.DataFrame):
+            Data to validate and extract the dimension from.
+        dim (str):
+            Name of the column to extract from a multi-dimensional dataset.
+
     Returns:
-        ndarray:
-            Returns an nd array that contains a dataset with 2 columns ['timestamp', 'value']
+        pd.DataFrame:
+            Returns a dataframe that contains a dataset with 2 columns ['timestamp', 'value'].
     """
-    if (len(X) == 0):
+    if len(X) == 0:
         return []
-
+
     columns = X.columns.values
-
+
     if 'timestamp' not in columns:
         X['timestamp'] = X.index.values
-
-    if dim != None:
-        if dim in columns:
-            X['value'] = X[dim]
-            X = pd.DataFrame().assign(timestamp=X['timestamp'], value=X[dim])
-
-    if 'value' not in X.columns.values:
-        return []
-
+        X = X.reset_index(drop=True)
+
+    if dim is not None and dim in columns:
+        X['value'] = X[dim]
+        X = pd.DataFrame().assign(timestamp=X['timestamp'], value=X[dim])
+
+    if 'value' not in X.columns:
+        return []
+
     return X[['timestamp', 'value']]
 
 
-def rolling_std_thres(X, thres, op = ">", window_size=5):
+def rolling_std_thresh(X, thresh, op=">", window_size=5):
     """Apply moving standard deviation thresholding.
 
-    The function flags anomalies based on moving standard deviation thresholding
+    The function flags anomalies based on moving standard deviation thresholding.
 
     Args:
-        X (ndarray):
+        X (pd.DataFrame):
             N-dimensional value sequence to iterate over.
-        thres (float):
-            Integer used to indicate the threshold of the function
+        thresh (float):
+            Float used to indicate the threshold of the function.
         op (str):
             String indicating the operator used to compare with the threshold.
-            Possible values are '<', '>', '<=', '>=', '=='
+            Possible values are '<', '>', '<=', '>=', '=='.
         window_size (int):
-            Integer indicating the number of observations used for each window
+            Integer indicating the number of observations used for each window.
 
     Returns:
-        ndarray:
-            Dataframe containing the timestamp and value of the flagged indices
-    """
+        pd.DataFrame:
+            Dataframe containing the timestamps and values of the flagged rows.
+    """
     a = X['value'].rolling(window=window_size).std().values
-    idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)]
-    return X.loc[idx_arr]
+    idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thresh)]
+    return X.iloc[idx_arr]
 
 
-def diff_thres(X, thres = "0.1", op = ">"):
+def diff_thresh(X, thresh=0.1, op=">"):
     """Apply discrete difference thresholding.
 
-    The function flags anomalies based on n-th discrete difference thresholding
+    The function flags anomalies based on n-th discrete difference thresholding.
 
     Args:
-        X (ndarray):
+        X (pd.DataFrame):
             N-dimensional value sequence to iterate over.
-        thres (float):
-            Integer used to indicate the threshold of the function
+        thresh (float):
+            Float used to indicate the threshold of the function.
         op (str):
             String indicating the operator used to compare with the threshold.
-            Possible values are '<', '>', '<=', '>=', '=='
+            Possible values are '<', '>', '<=', '>=', '=='.
 
     Returns:
-        ndarray:
-            Dataframe containing the timestamp and value of the flagged indices
+        pd.DataFrame:
+            Dataframe containing the timestamps and values of the flagged rows.
     """
     a = np.diff(X['value'])
-    idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)]
-    return X.loc[idx_arr]
+    idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thresh)]
+    return X.iloc[idx_arr]
""" a = np.diff(X['value']) - idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)] - return X.loc[idx_arr] + idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thresh)] + return X.iloc[idx_arr] -def thresholding(X, thres, op): +def thresholding(X, thresh, op): """Apply simple thresholding. The function flags anomalies based on simple thresholding Args: - X (ndarray): + X (ndarray): N-dimensional value sequence to iterate over. - thres (float): - Integer used to indicate the threshold of the function + thresh (float): + Integer used to indicate the threshold of the function. op (str): String indicating the operator used to compare with the threshold. - Possible values are '<', '>', '<=', '>=', '==' + Possible values are '<', '>', '<=', '>=', '=='. Returns: list: - integers indicating the timestamps that were flagged + List of indices indicating the timestamps that were flagged. """ a = X['value'] - idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thres)] - return X.loc[idx_arr] - - + idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thresh)] + return X.iloc[idx_arr] + + def get_intervals(y, severity=True): """Group together consecutive anomalies in anomaly internals. @@ -122,18 +124,26 @@ def get_intervals(y, severity=True): Optionally, it computes the average severity of each interval. Args: - y (ndarray): - N-dimensional array containing the flagged anomalies of the dataset - thres (bool): + y (ndarray): + N-dimensional array containing the flagged anomalies of the dataset. + severity (bool): Optional. Indicates whether the average severity of each interval - should be calculated + should be calculated. Returns: ndarray: Array containing the anomaly intervals """ intervals = np.split(y, np.where(np.diff(y.index.values) > 1)[0] + 1) - if(severity): - return [(interval['timestamp'].values[0], interval['timestamp'].values[-1], np.mean(interval['value'])) for interval in intervals] - else: - return [(interval['timestamp'].values[0], interval['timestamp'].values[-1]) for interval in intervals] \ No newline at end of file + + anomalies = list() + for interval in intervals: + timestamp = interval['timestamp'].values + + if severity: + anomalies.append((timestamp[0], timestamp[-1], np.mean(interval['value']))) + + else: + anomalies.append((timestamp[0], timestamp[-1])) + + return anomalies diff --git a/tests/primitives/test_timeseries_postprocessing.py b/tests/primitives/test_timeseries_postprocessing.py new file mode 100644 index 00000000..5f3e64e8 --- /dev/null +++ b/tests/primitives/test_timeseries_postprocessing.py @@ -0,0 +1,186 @@ +import pandas as pd +import pytest + +from orion.primitives.timeseries_postprocessing import ( + diff_thresh, extract_dimension, get_intervals, rolling_std_thresh, thresholding) + + +@pytest.fixture +def data(): + return pd.DataFrame({ + "timestamp": list(range(1, 11)), + "value": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + "test": [1, 2] * 5, + }) + + +@pytest.fixture +def signal(): + return pd.DataFrame({ + "timestamp": list(range(1, 11)), + "value": [0, 0, 1, 1, 5, 5, 10, 10, 2, 1], + }) + + +def test_extract_dimension(data): + # Setup + expected = data[['timestamp', 'value']] + + # Run + output = extract_dimension(data) + + # Assert + pd.testing.assert_frame_equal(output, expected) + + +def test_extract_dimension_dim(data): + # Setup + expected = data[['timestamp', 'test']] + expected.columns = ['timestamp', 'value'] + + # Run + output = extract_dimension(data, dim='test') + + # Assert + pd.testing.assert_frame_equal(output, 
diff --git a/tests/primitives/test_timeseries_postprocessing.py b/tests/primitives/test_timeseries_postprocessing.py
new file mode 100644
index 00000000..5f3e64e8
--- /dev/null
+++ b/tests/primitives/test_timeseries_postprocessing.py
@@ -0,0 +1,186 @@
+import pandas as pd
+import pytest
+
+from orion.primitives.timeseries_postprocessing import (
+    diff_thresh, extract_dimension, get_intervals, rolling_std_thresh, thresholding)
+
+
+@pytest.fixture
+def data():
+    return pd.DataFrame({
+        "timestamp": list(range(1, 11)),
+        "value": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+        "test": [1, 2] * 5,
+    })
+
+
+@pytest.fixture
+def signal():
+    return pd.DataFrame({
+        "timestamp": list(range(1, 11)),
+        "value": [0, 0, 1, 1, 5, 5, 10, 10, 2, 1],
+    })
+
+
+def test_extract_dimension(data):
+    # Setup
+    expected = data[['timestamp', 'value']]
+
+    # Run
+    output = extract_dimension(data)
+
+    # Assert
+    pd.testing.assert_frame_equal(output, expected)
+
+
+def test_extract_dimension_dim(data):
+    # Setup
+    expected = data[['timestamp', 'test']]
+    expected.columns = ['timestamp', 'value']
+
+    # Run
+    output = extract_dimension(data, dim='test')
+
+    # Assert
+    pd.testing.assert_frame_equal(output, expected)
+
+
+def test_extract_dimension_empty():
+    # Setup
+    data = pd.DataFrame()
+
+    # Run
+    output = extract_dimension(data)
+
+    # Assert
+    assert output == []
+
+
+def test_extract_dimension_timestamp_index(data):
+    # Setup
+    expected = data[['timestamp', 'value']]
+    data = data.set_index('timestamp')
+
+    # Run
+    output = extract_dimension(data)
+
+    # Assert
+    pd.testing.assert_frame_equal(output, expected)
+
+
+def test_extract_dimension_no_value(data):
+    # Setup
+    data = data[['timestamp', 'test']]
+
+    # Run
+    output = extract_dimension(data)
+
+    # Assert
+    assert output == []
+
+
+def test_rolling_std_thresh(signal):
+    # Setup
+    expected = signal.iloc[[6, 7, 8, 9]]
+
+    # Run
+    output = rolling_std_thresh(signal, 3)
+
+    # Assert
+    pd.testing.assert_frame_equal(output, expected)
+
+
+def test_rolling_std_thresh_one(signal):
+    # Setup
+    expected = signal.iloc[[9]]
+
+    # Run
+    output = rolling_std_thresh(signal, 4)
+
+    # Assert
+    pd.testing.assert_frame_equal(output, expected)
+
+
+def test_rolling_std_thresh_empty(signal):
+    # Run
+    output = rolling_std_thresh(signal, 10)
+
+    # Assert
+    assert len(output) == 0
+
+
+def test_diff_thresh(signal):
+    # Setup
+    expected = signal.iloc[[3, 5]]
+
+    # Run
+    output = diff_thresh(signal, 1)
+
+    # Assert
+    pd.testing.assert_frame_equal(output, expected)
+
+
+def test_diff_thresh_equal(signal):
+    # Setup
+    expected = signal.iloc[[1, 3, 5]]
+
+    # Run
+    output = diff_thresh(signal, 1, ">=")
+
+    # Assert
+    pd.testing.assert_frame_equal(output, expected)
+
+
+def test_thresholding(signal):
+    # Setup
+    expected = signal.iloc[[6, 7]]
+
+    # Run
+    output = thresholding(signal, 5, ">")
+
+    # Assert
+    pd.testing.assert_frame_equal(output, expected)
+
+
+def test_thresholding_less(signal):
+    # Setup
+    expected = signal.iloc[[0, 1, 2, 3, 8, 9]]
+
+    # Run
+    output = thresholding(signal, 5, "<")
+
+    # Assert
+    pd.testing.assert_frame_equal(output, expected)
+
+
+def test_get_intervals(signal):
+    # Setup
+    indices = signal.iloc[[6, 7]]
+    expected = (7, 8)
+
+    # Run
+    output = get_intervals(indices, False)
+
+    # Assert
+    assert isinstance(output, list)
+    assert len(output[0]) == 2
+
+    assert output[0][0] == expected[0]
+    assert output[0][1] == expected[1]
+
+
+def test_get_intervals_severity(signal):
+    # Setup
+    indices = signal.iloc[[6, 7]]
+    expected = (7, 8, 10)
+
+    # Run
+    output = get_intervals(indices)
+
+    # Assert
+    assert isinstance(output, list)
+    assert len(output[0]) == 3
+
+    assert output[0][0] == expected[0]
+    assert output[0][1] == expected[1]
+    assert output[0][2] == expected[2]
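For readers following along, the two `get_intervals` tests correspond to this interactive session (illustrative only, reusing the `signal` fixture values):

```python3
import pandas as pd

from orion.primitives.timeseries_postprocessing import get_intervals, thresholding

signal = pd.DataFrame({
    "timestamp": list(range(1, 11)),
    "value": [0, 0, 1, 1, 5, 5, 10, 10, 2, 1],
})

# Flags the rows with value 10, i.e. timestamps 7 and 8.
flagged = thresholding(signal, 5, ">")

get_intervals(flagged, severity=False)  # [(7, 8)]
get_intervals(flagged)                  # [(7, 8, 10.0)]
```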